deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff shows the changes between publicly released versions of this package as they appear in their public registry, and is provided for informational purposes only.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
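The largest single change below is the rewrite of deltacat/catalog/main/impl.py, which replaces the previous `write_to_table` stub with a full implementation covering write modes, schema evolution, and transactions. As rough orientation, here is a minimal usage sketch based only on the new signature visible in the diff that follows; the table name, namespace, and the assumption that `write_to_table` is re-exported at the top-level `deltacat` package are illustrative, not taken from the package itself.

```python
# Hypothetical sketch of calling the new write_to_table signature shown in the
# diff below. Table/namespace names are made up; write_to_table is assumed to be
# re-exported at the package top level (deltacat/__init__.py grew by +117 lines).
import pyarrow as pa
import deltacat as dc
from deltacat.types.media import ContentType
from deltacat.types.tables import TableWriteMode

# Example data; any supported local or distributed dataset type should work.
orders = pa.table({"order_id": [1, 2, 3], "amount": [9.99, 4.50, 12.00]})

dc.write_to_table(
    data=orders,
    table="orders",                    # hypothetical table name
    namespace="analytics",             # hypothetical namespace
    mode=TableWriteMode.AUTO,          # default; creates the table if missing
    content_type=ContentType.PARQUET,  # default content type per the new signature
)
```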
deltacat/catalog/main/impl.py
CHANGED
@@ -1,74 +1,263 @@
|
|
1
|
-
from typing import Any, Dict, List, Optional, Union, Tuple
|
1
|
+
from typing import Any, Dict, List, Optional, Union, Tuple, Set
|
2
2
|
import logging
|
3
|
+
from collections import defaultdict
|
3
4
|
|
4
|
-
|
5
|
+
import numpy as np
|
6
|
+
import pyarrow as pa
|
7
|
+
import pandas as pd
|
8
|
+
import daft
|
9
|
+
import deltacat as dc
|
10
|
+
|
11
|
+
from deltacat.storage.model.manifest import ManifestAuthor
|
12
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
5
13
|
from deltacat.exceptions import (
|
6
14
|
NamespaceAlreadyExistsError,
|
7
|
-
StreamNotFoundError,
|
8
15
|
TableAlreadyExistsError,
|
9
16
|
TableVersionNotFoundError,
|
17
|
+
TableNotFoundError,
|
18
|
+
TableVersionAlreadyExistsError,
|
19
|
+
TableValidationError,
|
20
|
+
SchemaValidationError,
|
10
21
|
)
|
11
22
|
from deltacat.catalog.model.table_definition import TableDefinition
|
12
23
|
from deltacat.storage.model.sort_key import SortScheme
|
13
24
|
from deltacat.storage.model.list_result import ListResult
|
14
25
|
from deltacat.storage.model.namespace import Namespace, NamespaceProperties
|
15
|
-
from deltacat.storage.model.schema import
|
26
|
+
from deltacat.storage.model.schema import (
|
27
|
+
Schema,
|
28
|
+
SchemaUpdate,
|
29
|
+
)
|
16
30
|
from deltacat.storage.model.table import TableProperties, Table
|
17
31
|
from deltacat.storage.model.types import (
|
18
|
-
|
32
|
+
Dataset,
|
19
33
|
LifecycleState,
|
20
|
-
LocalDataset,
|
21
|
-
LocalTable,
|
22
34
|
StreamFormat,
|
35
|
+
SchemaConsistencyType,
|
23
36
|
)
|
24
37
|
from deltacat.storage.model.partition import (
|
25
38
|
Partition,
|
26
39
|
PartitionLocator,
|
27
40
|
PartitionScheme,
|
28
41
|
)
|
29
|
-
from deltacat.storage.model.table_version import
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
from deltacat.types
|
34
|
-
from deltacat.
|
42
|
+
from deltacat.storage.model.table_version import (
|
43
|
+
TableVersion,
|
44
|
+
TableVersionProperties,
|
45
|
+
)
|
46
|
+
from deltacat.storage.model.types import DeltaType
|
47
|
+
from deltacat.storage import Delta
|
48
|
+
from deltacat.storage.model.types import CommitState
|
49
|
+
from deltacat.storage.model.transaction import (
|
50
|
+
Transaction,
|
51
|
+
setup_transaction,
|
52
|
+
)
|
53
|
+
from deltacat.types.media import (
|
54
|
+
ContentType,
|
55
|
+
DatasetType,
|
56
|
+
StorageType,
|
57
|
+
SCHEMA_CONTENT_TYPES,
|
58
|
+
)
|
59
|
+
from deltacat.types.tables import (
|
60
|
+
SchemaEvolutionMode,
|
61
|
+
TableProperty,
|
62
|
+
TablePropertyDefaultValues,
|
63
|
+
TableReadOptimizationLevel,
|
64
|
+
TableWriteMode,
|
65
|
+
get_dataset_type,
|
66
|
+
get_table_schema,
|
67
|
+
get_table_column_names,
|
68
|
+
from_pyarrow,
|
69
|
+
concat_tables,
|
70
|
+
empty_table,
|
71
|
+
infer_table_schema,
|
72
|
+
to_pandas,
|
73
|
+
)
|
74
|
+
from deltacat.utils import pyarrow as pa_utils
|
75
|
+
from deltacat.utils.reader_compatibility_mapping import get_compatible_readers
|
76
|
+
from deltacat.utils.pyarrow import get_base_arrow_type_name
|
35
77
|
from deltacat import logs
|
36
78
|
from deltacat.constants import DEFAULT_NAMESPACE
|
37
|
-
from deltacat.storage import metastore as storage_impl
|
38
79
|
|
39
80
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
40
81
|
|
41
82
|
"""
|
42
|
-
|
83
|
+
Default Catalog interface implementation using DeltaCAT native storage.
|
43
84
|
|
44
|
-
|
45
|
-
|
85
|
+
The functions here should not be invoked directly, but should instead be
|
86
|
+
invoked through `delegate.py` (e.g., to support passing catalog's by name, and
|
87
|
+
to ensure that each initialized `Catalog` implementation has its `inner`
|
88
|
+
property set to the `CatalogProperties` returned from `initialize()`).
|
46
89
|
|
47
|
-
`CatalogProperties`
|
90
|
+
The `CatalogProperties` instance returned by `initialize()` contains all
|
91
|
+
durable state required to deterministically reconstruct the associated DeltaCAT
|
92
|
+
native `Catalog` implementation (e.g., the root URI for the catalog metastore).
|
48
93
|
"""
|
49
94
|
|
50
95
|
|
51
96
|
# catalog functions
|
52
|
-
def initialize(
|
97
|
+
def initialize(
|
98
|
+
config: Optional[CatalogProperties] = None,
|
99
|
+
*args,
|
100
|
+
**kwargs,
|
101
|
+
) -> CatalogProperties:
|
53
102
|
"""
|
54
|
-
|
103
|
+
Performs any required one-time initialization and validation of this
|
104
|
+
catalog implementation based on the input configuration. If no config
|
105
|
+
instance is given, a new `CatalogProperties` instance is constructed
|
106
|
+
using the given keyword arguments.
|
55
107
|
|
56
|
-
|
108
|
+
Returns the input config if given, and the newly created config otherwise.
|
57
109
|
"""
|
58
110
|
if config is not None:
|
111
|
+
if not isinstance(config, CatalogProperties):
|
112
|
+
raise ValueError(
|
113
|
+
f"Expected `CatalogProperties` but found `{type(config)}`."
|
114
|
+
)
|
59
115
|
return config
|
60
116
|
else:
|
61
117
|
return CatalogProperties(*args, **kwargs)
|
62
118
|
|
63
119
|
|
64
120
|
# table functions
|
121
|
+
def _validate_write_mode_and_table_existence(
|
122
|
+
table: str,
|
123
|
+
namespace: str,
|
124
|
+
mode: TableWriteMode,
|
125
|
+
**kwargs,
|
126
|
+
) -> bool:
|
127
|
+
"""Validate write mode against table existence and return whether table exists."""
|
128
|
+
table_exists_flag = table_exists(
|
129
|
+
table,
|
130
|
+
namespace=namespace,
|
131
|
+
**kwargs,
|
132
|
+
)
|
133
|
+
logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
|
134
|
+
|
135
|
+
if mode == TableWriteMode.CREATE and table_exists_flag:
|
136
|
+
raise ValueError(
|
137
|
+
f"Table {namespace}.{table} already exists and mode is CREATE."
|
138
|
+
)
|
139
|
+
elif (
|
140
|
+
mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
|
141
|
+
and not table_exists_flag
|
142
|
+
):
|
143
|
+
raise TableNotFoundError(
|
144
|
+
f"Table {namespace}.{table} does not exist and mode is {mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()}. Use CREATE or AUTO mode to create a new table."
|
145
|
+
)
|
146
|
+
|
147
|
+
return table_exists_flag
|
148
|
+
|
149
|
+
|
150
|
+
def _get_table_and_validate_write_mode(
|
151
|
+
table: str,
|
152
|
+
namespace: str,
|
153
|
+
table_version: Optional[str],
|
154
|
+
mode: TableWriteMode,
|
155
|
+
**kwargs,
|
156
|
+
) -> Tuple[bool, TableDefinition]:
|
157
|
+
"""Validate write mode against table and table version existence.
|
158
|
+
|
159
|
+
Returns:
|
160
|
+
Tuple of (table_exists_flag, table_definition)
|
161
|
+
"""
|
162
|
+
# First validate table, table version, and stream existence
|
163
|
+
existing_table_def = get_table(
|
164
|
+
table,
|
165
|
+
namespace=namespace,
|
166
|
+
table_version=table_version,
|
167
|
+
**kwargs,
|
168
|
+
)
|
169
|
+
table_exists_flag = (
|
170
|
+
existing_table_def is not None
|
171
|
+
and existing_table_def.table_version
|
172
|
+
and existing_table_def.stream
|
173
|
+
)
|
174
|
+
logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
|
175
|
+
|
176
|
+
# Then validate table existence constraints
|
177
|
+
if mode == TableWriteMode.CREATE and table_exists_flag and table_version is None:
|
178
|
+
raise TableAlreadyExistsError(
|
179
|
+
f"Table {namespace}.{table} already exists and mode is CREATE."
|
180
|
+
)
|
181
|
+
elif (
|
182
|
+
mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
|
183
|
+
and existing_table_def is None
|
184
|
+
):
|
185
|
+
raise TableNotFoundError(
|
186
|
+
f"Table {namespace}.{table} does not exist and write mode is {mode}. Use CREATE or AUTO mode to create a new table."
|
187
|
+
)
|
188
|
+
|
189
|
+
# Then validate table version existence constraints
|
190
|
+
if table_version is not None and table_exists_flag:
|
191
|
+
if mode == TableWriteMode.CREATE:
|
192
|
+
raise TableVersionAlreadyExistsError(
|
193
|
+
f"Table version {namespace}.{table}.{table_version} already exists and mode is CREATE."
|
194
|
+
)
|
195
|
+
logger.info(f"Table version ({namespace}.{table}.{table_version}) exists.")
|
196
|
+
elif (
|
197
|
+
mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
|
198
|
+
and table_version is not None
|
199
|
+
and not table_exists_flag
|
200
|
+
):
|
201
|
+
raise TableVersionNotFoundError(
|
202
|
+
f"Table version {namespace}.{table}.{table_version} does not exist and write mode is {mode}. "
|
203
|
+
f"Use CREATE or AUTO mode to create a new table version, or omit table_version "
|
204
|
+
f"to use the latest version."
|
205
|
+
)
|
206
|
+
return table_exists_flag, existing_table_def
|
207
|
+
|
208
|
+
|
209
|
+
def _validate_content_type_against_supported_content_types(
|
210
|
+
namespace: str,
|
211
|
+
table: str,
|
212
|
+
content_type: ContentType,
|
213
|
+
supported_content_types: Optional[List[ContentType]],
|
214
|
+
) -> None:
|
215
|
+
if supported_content_types and content_type not in supported_content_types:
|
216
|
+
raise ValueError(
|
217
|
+
f"Content type proposed for write to table {namespace}.{table} ({content_type}) "
|
218
|
+
f"conflicts with the proposed list of new supported content types: {supported_content_types}"
|
219
|
+
)
|
220
|
+
|
221
|
+
|
222
|
+
def _create_table_for_write(
|
223
|
+
data: Dataset,
|
224
|
+
table: str,
|
225
|
+
namespace: str,
|
226
|
+
table_version: Optional[str],
|
227
|
+
content_type: ContentType,
|
228
|
+
existing_table_definition: Optional[TableDefinition],
|
229
|
+
*args,
|
230
|
+
**kwargs,
|
231
|
+
) -> TableDefinition:
|
232
|
+
"""Creates a new table, table version, and/or stream in preparation for a write operation."""
|
233
|
+
if "schema" not in kwargs:
|
234
|
+
kwargs["schema"] = infer_table_schema(data)
|
235
|
+
|
236
|
+
_validate_content_type_against_supported_content_types(
|
237
|
+
namespace,
|
238
|
+
table,
|
239
|
+
content_type,
|
240
|
+
kwargs.get("content_types"),
|
241
|
+
)
|
242
|
+
return create_table(
|
243
|
+
table,
|
244
|
+
namespace=namespace,
|
245
|
+
table_version=table_version,
|
246
|
+
existing_table_definition=existing_table_definition,
|
247
|
+
*args,
|
248
|
+
**kwargs,
|
249
|
+
)
|
250
|
+
|
251
|
+
|
65
252
|
def write_to_table(
|
66
|
-
data:
|
253
|
+
data: Dataset,
|
67
254
|
table: str,
|
68
255
|
*args,
|
69
256
|
namespace: Optional[str] = None,
|
257
|
+
table_version: Optional[str] = None,
|
70
258
|
mode: TableWriteMode = TableWriteMode.AUTO,
|
71
259
|
content_type: ContentType = ContentType.PARQUET,
|
260
|
+
transaction: Optional[Transaction] = None,
|
72
261
|
**kwargs,
|
73
262
|
) -> None:
|
74
263
|
"""Write local or distributed data to a table. Raises an error if the
|
@@ -77,79 +266,1137 @@ def write_to_table(
|
|
77
266
|
When creating a table, all `create_table` parameters may be optionally
|
78
267
|
specified as additional keyword arguments. When appending to, or replacing,
|
79
268
|
an existing table, all `alter_table` parameters may be optionally specified
|
80
|
-
as additional keyword arguments.
|
81
|
-
raise NotImplementedError("write_to_table not implemented")
|
269
|
+
as additional keyword arguments.
|
82
270
|
|
271
|
+
Args:
|
272
|
+
data: Local or distributed data to write to the table.
|
273
|
+
table: Name of the table to write to.
|
274
|
+
namespace: Optional namespace for the table. Uses default if not specified.
|
275
|
+
table_version: Optional version of the table to write to. If specified,
|
276
|
+
will create this version if it doesn't exist (in CREATE mode) or
|
277
|
+
get this version if it exists (in other modes). If not specified,
|
278
|
+
uses the latest version.
|
279
|
+
mode: Write mode (AUTO, CREATE, APPEND, REPLACE, MERGE, DELETE).
|
280
|
+
content_type: Content type used to write the data files. Defaults to PARQUET.
|
281
|
+
transaction: Optional transaction to append write operations to instead of
|
282
|
+
creating and committing a new transaction.
|
283
|
+
**kwargs: Additional keyword arguments.
|
284
|
+
"""
|
285
|
+
namespace = namespace or default_namespace()
|
83
286
|
|
84
|
-
|
287
|
+
# Set up transaction handling
|
288
|
+
write_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
289
|
+
kwargs["transaction"] = write_transaction
|
290
|
+
|
291
|
+
try:
|
292
|
+
# Validate write mode and table/table version/stream existence
|
293
|
+
(table_exists_flag, table_definition,) = _get_table_and_validate_write_mode(
|
294
|
+
table,
|
295
|
+
namespace,
|
296
|
+
table_version,
|
297
|
+
mode,
|
298
|
+
**kwargs,
|
299
|
+
)
|
300
|
+
|
301
|
+
# Get or create table, table version, and/or stream
|
302
|
+
if not table_exists_flag:
|
303
|
+
table_definition = _create_table_for_write(
|
304
|
+
data,
|
305
|
+
table,
|
306
|
+
namespace,
|
307
|
+
table_version,
|
308
|
+
content_type,
|
309
|
+
table_definition,
|
310
|
+
*args,
|
311
|
+
**kwargs,
|
312
|
+
)
|
313
|
+
else:
|
314
|
+
# call alter_table if there are any alter_table kwargs provided
|
315
|
+
if (
|
316
|
+
"lifecycle_state" in kwargs
|
317
|
+
or "schema_updates" in kwargs
|
318
|
+
or "partition_updates" in kwargs
|
319
|
+
or "sort_scheme" in kwargs
|
320
|
+
or "table_description" in kwargs
|
321
|
+
or "table_version_description" in kwargs
|
322
|
+
or "table_properties" in kwargs
|
323
|
+
or "table_version_properties" in kwargs
|
324
|
+
):
|
325
|
+
alter_table(
|
326
|
+
table,
|
327
|
+
namespace=namespace,
|
328
|
+
table_version=table_version,
|
329
|
+
*args,
|
330
|
+
**kwargs,
|
331
|
+
)
|
332
|
+
|
333
|
+
# Get the active table version and stream
|
334
|
+
table_version_obj = _get_latest_active_or_given_table_version(
|
335
|
+
namespace=table_definition.table.namespace,
|
336
|
+
table_name=table_definition.table.table_name,
|
337
|
+
table_version=table_version or table_definition.table_version.table_version,
|
338
|
+
**kwargs,
|
339
|
+
)
|
340
|
+
|
341
|
+
# Validate schema compatibility for schemaless content types with schema tables
|
342
|
+
if (
|
343
|
+
content_type.value not in SCHEMA_CONTENT_TYPES
|
344
|
+
and table_version_obj.schema is not None
|
345
|
+
):
|
346
|
+
schemaless_types = {
|
347
|
+
ct for ct in ContentType if ct.value not in SCHEMA_CONTENT_TYPES
|
348
|
+
}
|
349
|
+
raise TableValidationError(
|
350
|
+
f"Content type '{content_type.value}' cannot be written to a table with a schema. "
|
351
|
+
f"Table '{namespace}.{table}' has a schema, but content type '{content_type.value}' "
|
352
|
+
f"is schemaless. Schemaless content types ({', '.join(sorted([ct.value for ct in schemaless_types]))}) "
|
353
|
+
f"can only be written to schemaless tables."
|
354
|
+
)
|
355
|
+
|
356
|
+
# Handle different write modes and get stream and delta type
|
357
|
+
stream, delta_type = _handle_write_mode(
|
358
|
+
mode,
|
359
|
+
table_definition,
|
360
|
+
table_version_obj,
|
361
|
+
namespace,
|
362
|
+
table,
|
363
|
+
**kwargs,
|
364
|
+
)
|
365
|
+
|
366
|
+
if not stream:
|
367
|
+
raise ValueError(f"No default stream found for table {namespace}.{table}")
|
368
|
+
|
369
|
+
# Automatically set entry_params for DELETE/MERGE modes if not provided
|
370
|
+
_set_entry_params_if_needed(
|
371
|
+
mode,
|
372
|
+
table_version_obj,
|
373
|
+
kwargs,
|
374
|
+
)
|
375
|
+
|
376
|
+
# Validate table configuration
|
377
|
+
_validate_table_configuration(
|
378
|
+
stream,
|
379
|
+
table_version_obj,
|
380
|
+
namespace,
|
381
|
+
table,
|
382
|
+
)
|
383
|
+
|
384
|
+
# Handle partition creation/retrieval
|
385
|
+
partition, commit_staged_partition = _handle_partition_creation(
|
386
|
+
mode,
|
387
|
+
table_exists_flag,
|
388
|
+
delta_type,
|
389
|
+
stream,
|
390
|
+
**kwargs,
|
391
|
+
)
|
392
|
+
|
393
|
+
# Get table properties for schema evolution
|
394
|
+
schema_evolution_mode = table_version_obj.read_table_property(
|
395
|
+
TableProperty.SCHEMA_EVOLUTION_MODE
|
396
|
+
)
|
397
|
+
default_schema_consistency_type = table_version_obj.read_table_property(
|
398
|
+
TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE
|
399
|
+
)
|
400
|
+
|
401
|
+
# Convert unsupported dataset types and NumPy arrays that need schema validation
|
402
|
+
if isinstance(data, np.ndarray) and table_version_obj.schema is not None:
|
403
|
+
# NumPy arrays need conversion to Pandas for proper column naming in schema validation
|
404
|
+
converted_data = _convert_numpy_for_schema_validation(
|
405
|
+
data, table_version_obj.schema
|
406
|
+
)
|
407
|
+
else:
|
408
|
+
# Convert other unsupported dataset types (e.g., Daft) or keep NumPy as-is for schemaless tables
|
409
|
+
converted_data = _convert_data_if_needed(data)
|
410
|
+
|
411
|
+
# Capture original field set before schema coercion for partial UPSERT support
|
412
|
+
original_fields = set(get_table_column_names(converted_data))
|
413
|
+
|
414
|
+
# Validate and coerce data against schema
|
415
|
+
# This ensures proper schema evolution and type handling
|
416
|
+
(
|
417
|
+
validated_data,
|
418
|
+
schema_modified,
|
419
|
+
updated_schema,
|
420
|
+
) = _validate_and_coerce_data_against_schema(
|
421
|
+
converted_data, # Use converted data for NumPy, original for others
|
422
|
+
table_version_obj.schema,
|
423
|
+
schema_evolution_mode=schema_evolution_mode,
|
424
|
+
default_schema_consistency_type=default_schema_consistency_type,
|
425
|
+
)
|
426
|
+
|
427
|
+
# Convert validated data to supported format for storage if needed
|
428
|
+
converted_data = _convert_data_if_needed(validated_data)
|
429
|
+
|
430
|
+
# Validate reader compatibility against supported reader types
|
431
|
+
supported_reader_types = table_version_obj.read_table_property(
|
432
|
+
TableProperty.SUPPORTED_READER_TYPES
|
433
|
+
)
|
434
|
+
_validate_reader_compatibility(
|
435
|
+
converted_data,
|
436
|
+
content_type,
|
437
|
+
supported_reader_types,
|
438
|
+
)
|
439
|
+
|
440
|
+
# Update table version if schema was modified during evolution
|
441
|
+
if schema_modified:
|
442
|
+
# Extract catalog properties and filter kwargs
|
443
|
+
catalog_kwargs = {
|
444
|
+
"catalog": kwargs.get("catalog"),
|
445
|
+
"inner": kwargs.get("inner"),
|
446
|
+
"transaction": write_transaction, # Pass transaction to update_table_version
|
447
|
+
}
|
448
|
+
|
449
|
+
_get_storage(**catalog_kwargs).update_table_version(
|
450
|
+
namespace=namespace,
|
451
|
+
table_name=table,
|
452
|
+
table_version=table_version_obj.table_version,
|
453
|
+
schema=updated_schema,
|
454
|
+
**catalog_kwargs,
|
455
|
+
)
|
456
|
+
|
457
|
+
# Stage and commit delta, handle compaction
|
458
|
+
# Remove schema from kwargs to avoid duplicate parameter conflict
|
459
|
+
filtered_kwargs = {k: v for k, v in kwargs.items() if k != "schema"}
|
460
|
+
# Use updated schema if schema evolution occurred, otherwise use original schema
|
461
|
+
_stage_commit_and_compact(
|
462
|
+
converted_data,
|
463
|
+
partition,
|
464
|
+
delta_type,
|
465
|
+
content_type,
|
466
|
+
commit_staged_partition,
|
467
|
+
table_version_obj,
|
468
|
+
namespace,
|
469
|
+
table,
|
470
|
+
schema=updated_schema if schema_modified else table_version_obj.schema,
|
471
|
+
original_fields=original_fields,
|
472
|
+
**filtered_kwargs,
|
473
|
+
)
|
474
|
+
except Exception as e:
|
475
|
+
# If any error occurs, the transaction remains uncommitted
|
476
|
+
commit_transaction = False
|
477
|
+
logger.error(f"Error during write_to_table: {e}")
|
478
|
+
raise
|
479
|
+
finally:
|
480
|
+
if commit_transaction:
|
481
|
+
# Seal the interactive transaction to commit all operations atomically
|
482
|
+
write_transaction.seal()
|
483
|
+
|
484
|
+
|
485
|
+
def _handle_write_mode(
|
486
|
+
mode: TableWriteMode,
|
487
|
+
table_definition: TableDefinition,
|
488
|
+
table_version_obj: TableVersion,
|
489
|
+
namespace: str,
|
85
490
|
table: str,
|
86
|
-
*args,
|
87
|
-
namespace: Optional[str] = None,
|
88
|
-
table_version: Optional[str] = None,
|
89
|
-
table_type: Optional[TableType] = TableType.PYARROW,
|
90
|
-
distributed_dataset_type: Optional[
|
91
|
-
DistributedDatasetType
|
92
|
-
] = DistributedDatasetType.RAY_DATASET,
|
93
|
-
partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
|
94
|
-
stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
|
95
|
-
merge_on_read: Optional[bool] = False,
|
96
|
-
reader_kwargs: Optional[Dict[Any, Any]] = None,
|
97
491
|
**kwargs,
|
98
|
-
) ->
|
99
|
-
"""
|
492
|
+
) -> Tuple[Any, DeltaType]: # Using Any for stream type to avoid complex imports
|
493
|
+
"""Handle different write modes and return appropriate stream and delta type."""
|
494
|
+
table_schema = table_definition.table_version.schema
|
495
|
+
|
496
|
+
if mode == TableWriteMode.REPLACE:
|
497
|
+
return _handle_replace_mode(
|
498
|
+
table_schema,
|
499
|
+
namespace,
|
500
|
+
table,
|
501
|
+
table_version_obj,
|
502
|
+
**kwargs,
|
503
|
+
)
|
504
|
+
elif mode == TableWriteMode.APPEND:
|
505
|
+
return _handle_append_mode(
|
506
|
+
table_schema,
|
507
|
+
namespace,
|
508
|
+
table,
|
509
|
+
table_version_obj,
|
510
|
+
**kwargs,
|
511
|
+
)
|
512
|
+
elif mode in (TableWriteMode.MERGE, TableWriteMode.DELETE):
|
513
|
+
return _handle_merge_delete_mode(
|
514
|
+
mode,
|
515
|
+
table_schema,
|
516
|
+
namespace,
|
517
|
+
table,
|
518
|
+
table_version_obj,
|
519
|
+
**kwargs,
|
520
|
+
)
|
521
|
+
else:
|
522
|
+
# AUTO and CREATE modes
|
523
|
+
return _handle_auto_create_mode(
|
524
|
+
table_schema,
|
525
|
+
namespace,
|
526
|
+
table,
|
527
|
+
table_version_obj,
|
528
|
+
**kwargs,
|
529
|
+
)
|
100
530
|
|
101
|
-
if reader_kwargs is None:
|
102
|
-
reader_kwargs = {}
|
103
531
|
|
104
|
-
|
532
|
+
def _handle_replace_mode(
|
533
|
+
table_schema,
|
534
|
+
namespace: str,
|
535
|
+
table: str,
|
536
|
+
table_version_obj: TableVersion,
|
537
|
+
**kwargs,
|
538
|
+
) -> Tuple[Any, DeltaType]:
|
539
|
+
"""Handle REPLACE mode by staging and committing a new stream."""
|
540
|
+
stream = _get_storage(**kwargs).stage_stream(
|
105
541
|
namespace=namespace,
|
106
|
-
|
107
|
-
|
108
|
-
|
542
|
+
table_name=table,
|
543
|
+
table_version=table_version_obj.table_version,
|
544
|
+
**kwargs,
|
545
|
+
)
|
546
|
+
|
547
|
+
stream = _get_storage(**kwargs).commit_stream(stream=stream, **kwargs)
|
548
|
+
delta_type = (
|
549
|
+
DeltaType.UPSERT
|
550
|
+
if table_schema and table_schema.merge_keys
|
551
|
+
else DeltaType.APPEND
|
552
|
+
)
|
553
|
+
return stream, delta_type
|
554
|
+
|
555
|
+
|
556
|
+
def _handle_append_mode(
|
557
|
+
table_schema,
|
558
|
+
namespace: str,
|
559
|
+
table: str,
|
560
|
+
table_version_obj: TableVersion,
|
561
|
+
**kwargs,
|
562
|
+
) -> Tuple[Any, DeltaType]:
|
563
|
+
"""Handle APPEND mode by validating no merge keys and getting existing stream."""
|
564
|
+
if table_schema and table_schema.merge_keys:
|
565
|
+
raise SchemaValidationError(
|
566
|
+
f"APPEND mode cannot be used with tables that have merge keys. "
|
567
|
+
f"Table {namespace}.{table} has merge keys: {table_schema.merge_keys}. "
|
568
|
+
f"Use MERGE mode instead."
|
569
|
+
)
|
570
|
+
|
571
|
+
stream = _get_table_stream(
|
572
|
+
namespace,
|
573
|
+
table,
|
574
|
+
table_version_obj.table_version,
|
575
|
+
**kwargs,
|
576
|
+
)
|
577
|
+
return stream, DeltaType.APPEND
|
578
|
+
|
579
|
+
|
580
|
+
def _handle_merge_delete_mode(
|
581
|
+
mode: TableWriteMode,
|
582
|
+
table_schema,
|
583
|
+
namespace: str,
|
584
|
+
table: str,
|
585
|
+
table_version_obj: TableVersion,
|
586
|
+
**kwargs,
|
587
|
+
) -> Tuple[Any, DeltaType]:
|
588
|
+
"""Handle MERGE/DELETE modes by validating merge keys and getting existing stream."""
|
589
|
+
if not table_schema or not table_schema.merge_keys:
|
590
|
+
raise TableValidationError(
|
591
|
+
f"{mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()} mode requires tables to have at least one merge key. "
|
592
|
+
f"Table {namespace}.{table}.{table_version_obj.table_version} has no merge keys. "
|
593
|
+
f"Use APPEND, AUTO, or REPLACE mode instead."
|
594
|
+
)
|
595
|
+
|
596
|
+
stream = _get_table_stream(
|
597
|
+
namespace,
|
598
|
+
table,
|
599
|
+
table_version_obj.table_version,
|
600
|
+
**kwargs,
|
601
|
+
)
|
602
|
+
delta_type = DeltaType.UPSERT if mode == TableWriteMode.MERGE else DeltaType.DELETE
|
603
|
+
return stream, delta_type
|
604
|
+
|
605
|
+
|
606
|
+
def _handle_auto_create_mode(
|
607
|
+
table_schema,
|
608
|
+
namespace: str,
|
609
|
+
table: str,
|
610
|
+
table_version_obj: TableVersion,
|
611
|
+
**kwargs,
|
612
|
+
) -> Tuple[Any, DeltaType]:
|
613
|
+
"""Handle AUTO and CREATE modes by getting existing stream."""
|
614
|
+
stream = _get_table_stream(
|
615
|
+
namespace,
|
616
|
+
table,
|
617
|
+
table_version_obj.table_version,
|
618
|
+
**kwargs,
|
619
|
+
)
|
620
|
+
delta_type = (
|
621
|
+
DeltaType.UPSERT
|
622
|
+
if table_schema and table_schema.merge_keys
|
623
|
+
else DeltaType.APPEND
|
624
|
+
)
|
625
|
+
return stream, delta_type
|
626
|
+
|
627
|
+
|
628
|
+
def _validate_table_configuration(
|
629
|
+
stream,
|
630
|
+
table_version_obj: TableVersion,
|
631
|
+
namespace: str,
|
632
|
+
table: str,
|
633
|
+
) -> None:
|
634
|
+
"""Validate table configuration for unsupported features."""
|
635
|
+
# Check if table is partitioned
|
636
|
+
if (
|
637
|
+
stream.partition_scheme
|
638
|
+
and stream.partition_scheme.keys is not None
|
639
|
+
and len(stream.partition_scheme.keys) > 0
|
640
|
+
):
|
641
|
+
raise NotImplementedError(
|
642
|
+
f"write_to_table does not yet support partitioned tables. "
|
643
|
+
f"Table {namespace}.{table} has partition scheme with "
|
644
|
+
f"{len(stream.partition_scheme.keys)} partition key(s): "
|
645
|
+
f"{[key.name or key.key[0] for key in stream.partition_scheme.keys]}. "
|
646
|
+
f"Please use the lower-level metastore API for partitioned tables."
|
647
|
+
)
|
648
|
+
|
649
|
+
# Check if table has sort keys
|
650
|
+
if (
|
651
|
+
table_version_obj.sort_scheme
|
652
|
+
and table_version_obj.sort_scheme.keys is not None
|
653
|
+
and len(table_version_obj.sort_scheme.keys) > 0
|
654
|
+
):
|
655
|
+
raise NotImplementedError(
|
656
|
+
f"write_to_table does not yet support tables with sort keys. "
|
657
|
+
f"Table {namespace}.{table} has sort scheme with "
|
658
|
+
f"{len(table_version_obj.sort_scheme.keys)} sort key(s): "
|
659
|
+
f"{[key.key[0] for key in table_version_obj.sort_scheme.keys]}. "
|
660
|
+
f"Please use the lower-level metastore API for sorted tables."
|
661
|
+
)
|
662
|
+
|
663
|
+
|
664
|
+
def _handle_partition_creation(
|
665
|
+
mode: TableWriteMode,
|
666
|
+
table_exists_flag: bool,
|
667
|
+
delta_type: DeltaType,
|
668
|
+
stream,
|
669
|
+
**kwargs,
|
670
|
+
) -> Tuple[Any, bool]: # partition, commit_staged_partition
|
671
|
+
"""Handle partition creation/retrieval based on write mode."""
|
672
|
+
if mode == TableWriteMode.REPLACE or not table_exists_flag:
|
673
|
+
# REPLACE mode or new table: Stage a new partition
|
674
|
+
partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
|
675
|
+
# If we're doing UPSERT/DELETE operations, let compaction handle the commit
|
676
|
+
commit_staged_partition = delta_type not in (DeltaType.UPSERT, DeltaType.DELETE)
|
677
|
+
return partition, commit_staged_partition
|
678
|
+
elif delta_type in (DeltaType.UPSERT, DeltaType.DELETE):
|
679
|
+
# UPSERT/DELETE operations: Try to use existing committed partition first
|
680
|
+
partition = _get_storage(**kwargs).get_partition(
|
681
|
+
stream_locator=stream.locator,
|
682
|
+
partition_values=None,
|
683
|
+
**kwargs,
|
684
|
+
)
|
685
|
+
commit_staged_partition = False
|
686
|
+
|
687
|
+
if not partition:
|
688
|
+
# No existing committed partition found, stage a new one
|
689
|
+
partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
|
690
|
+
commit_staged_partition = False # Let compaction handle the commit
|
691
|
+
|
692
|
+
return partition, commit_staged_partition
|
693
|
+
else:
|
694
|
+
# APPEND mode on existing table: Get existing partition
|
695
|
+
partition = _get_storage(**kwargs).get_partition(
|
696
|
+
stream_locator=stream.locator,
|
697
|
+
partition_values=None,
|
698
|
+
**kwargs,
|
699
|
+
)
|
700
|
+
commit_staged_partition = False
|
701
|
+
|
702
|
+
if not partition:
|
703
|
+
# No existing partition found, create a new one
|
704
|
+
partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
|
705
|
+
commit_staged_partition = True
|
706
|
+
|
707
|
+
return partition, commit_staged_partition
|
708
|
+
|
709
|
+
|
710
|
+
def _convert_numpy_for_schema_validation(
|
711
|
+
data: np.ndarray, schema: Optional[Schema]
|
712
|
+
) -> Dataset:
|
713
|
+
"""Convert NumPy array to Pandas DataFrame with proper column names for schema validation.
|
714
|
+
|
715
|
+
Args:
|
716
|
+
data: NumPy array to convert
|
717
|
+
schema: DeltaCAT Schema object for column naming
|
718
|
+
|
719
|
+
Returns:
|
720
|
+
Pandas DataFrame with proper column names matching schema
|
721
|
+
|
722
|
+
Raises:
|
723
|
+
ValueError: If array has more columns than schema or schema is invalid
|
724
|
+
"""
|
725
|
+
if not isinstance(schema, Schema) or not schema.arrow:
|
726
|
+
raise ValueError(
|
727
|
+
f"Expected DeltaCAT schema for Numpy schema validation, but found: {schema}"
|
728
|
+
)
|
729
|
+
|
730
|
+
# Use schema subset matching NumPy array dimensions
|
731
|
+
arrow_schema = schema.arrow
|
732
|
+
num_cols = data.shape[1] if data.ndim > 1 else 1
|
733
|
+
|
734
|
+
if len(arrow_schema) >= num_cols:
|
735
|
+
# Use the first N columns from the schema to match data dimensions
|
736
|
+
subset_fields = [arrow_schema.field(i) for i in range(num_cols)]
|
737
|
+
subset_schema = pa.schema(subset_fields)
|
738
|
+
return to_pandas(data, schema=subset_schema)
|
739
|
+
else:
|
740
|
+
raise ValueError(
|
741
|
+
f"NumPy array has {num_cols} columns but table schema only has {len(arrow_schema)} columns. "
|
742
|
+
f"Cannot write NumPy data with more columns than the table schema supports."
|
743
|
+
)
|
744
|
+
|
745
|
+
|
746
|
+
def _build_entry_index_to_schema_mapping(
|
747
|
+
qualified_deltas: List[Delta], table_version_obj, **kwargs
|
748
|
+
) -> List[Schema]:
|
749
|
+
"""Build a mapping from manifest entry index to schema for reading operations.
|
750
|
+
|
751
|
+
Args:
|
752
|
+
qualified_deltas: List of deltas to process
|
753
|
+
table_version_obj: Table version containing schemas
|
754
|
+
**kwargs: Additional arguments passed to storage operations
|
755
|
+
|
756
|
+
Returns:
|
757
|
+
List mapping each manifest entry index to its corresponding schema
|
758
|
+
|
759
|
+
Raises:
|
760
|
+
ValueError: If a manifest's schema ID is not found in table version schemas
|
761
|
+
"""
|
762
|
+
entry_index_to_schema = []
|
763
|
+
for delta in qualified_deltas:
|
764
|
+
if delta.manifest:
|
765
|
+
manifest = delta.manifest
|
766
|
+
else:
|
767
|
+
# Fetch manifest from storage
|
768
|
+
manifest = _get_storage(**kwargs).get_delta_manifest(
|
769
|
+
delta.locator,
|
770
|
+
**kwargs,
|
771
|
+
)
|
772
|
+
# Map manifest entry index to schema ID
|
773
|
+
schema_id = manifest.meta.schema_id
|
774
|
+
|
775
|
+
# Find the schema that matches this manifest's schema_id
|
776
|
+
matching_schema = None
|
777
|
+
if table_version_obj.schemas:
|
778
|
+
for schema in table_version_obj.schemas:
|
779
|
+
if schema.id == schema_id:
|
780
|
+
matching_schema = schema
|
781
|
+
break
|
782
|
+
|
783
|
+
if matching_schema is None:
|
784
|
+
available_schema_ids = (
|
785
|
+
[s.id for s in table_version_obj.schemas]
|
786
|
+
if table_version_obj.schemas
|
787
|
+
else []
|
788
|
+
)
|
789
|
+
raise ValueError(
|
790
|
+
f"Manifest schema ID {schema_id} not found in table version schemas. "
|
791
|
+
f"Available schema IDs: {available_schema_ids}. "
|
792
|
+
)
|
793
|
+
|
794
|
+
# Add the matching schema for each entry in this manifest
|
795
|
+
for _ in range(len(manifest.entries)):
|
796
|
+
entry_index_to_schema.append(matching_schema)
|
797
|
+
|
798
|
+
return entry_index_to_schema
|
799
|
+
|
800
|
+
|
801
|
+
def _convert_data_if_needed(data: Dataset) -> Dataset:
|
802
|
+
"""Convert unsupported data types to supported ones."""
|
803
|
+
if isinstance(data, daft.DataFrame):
|
804
|
+
# Daft DataFrame - convert based on execution mode
|
805
|
+
ctx = daft.context.get_context()
|
806
|
+
runner = ctx.get_or_create_runner()
|
807
|
+
runner_type = runner.name
|
808
|
+
|
809
|
+
if runner_type == "ray":
|
810
|
+
# Running with Ray backend - convert to Ray Dataset
|
811
|
+
return data.to_ray_dataset()
|
812
|
+
else:
|
813
|
+
# Running with local backend - convert to PyArrow Table
|
814
|
+
return data.to_arrow()
|
815
|
+
|
816
|
+
return data
|
817
|
+
|
818
|
+
|
819
|
+
def _validate_and_coerce_data_against_schema(
|
820
|
+
data: Dataset,
|
821
|
+
schema: Optional[Schema],
|
822
|
+
schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
|
823
|
+
default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
|
824
|
+
) -> Tuple[Dataset, bool, Optional[Schema]]:
|
825
|
+
"""Validate and coerce data against the table schema if schema consistency types are set.
|
826
|
+
|
827
|
+
Args:
|
828
|
+
data: The dataset to validate/coerce
|
829
|
+
+        schema: The DeltaCAT schema to validate against (optional)
+        schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
+        default_schema_consistency_type: Default consistency type for new fields in AUTO mode
+
+    Returns:
+        Tuple[Dataset, bool, Optional[Schema]]: Validated/coerced data, flag indicating if schema was modified, and updated schema
+
+    Raises:
+        ValueError: If validation fails or coercion is not possible
+    """
+    if not schema:
+        return data, False, None
+
+    validated_data, updated_schema = schema.validate_and_coerce_dataset(
+        data,
+        schema_evolution_mode=schema_evolution_mode,
+        default_schema_consistency_type=default_schema_consistency_type,
+    )
+
+    # Check if schema was modified by comparing with original
+    schema_modified = not updated_schema.equivalent_to(schema, True)
+    # Return updated schema only if it was modified
+    updated_schema = updated_schema if schema_modified else None
+
+    return validated_data, schema_modified, updated_schema
+
+
+def _validate_reader_compatibility(
+    data: Dataset,
+    content_type: ContentType,
+    supported_reader_types: Optional[List[DatasetType]],
+) -> None:
+    """Validate that the data types being written are compatible with all supported reader types.
+
+    Args:
+        data: The dataset to validate
+        content_type: Content type being written
+        supported_reader_types: List of DatasetTypes that must be able to read this data
+
+    Raises:
+        TableValidationError: If any data types would break supported reader compatibility
+    """
+    if not supported_reader_types:
+        return
+
+    # Get the schema from the data
+    schema = get_table_schema(data)
+
+    # Get the dataset type of the current data
+    writer_dataset_type = get_dataset_type(data)
+
+    # PYARROW_PARQUET is equivalent to PYARROW for compatibility
+    writer_type_str = (
+        writer_dataset_type.value
+        if writer_dataset_type != DatasetType.PYARROW_PARQUET
+        else "pyarrow"
+    )
+
+    content_type_str = content_type.value
+
+    # Check each field type for compatibility
+    incompatible_fields = []
+
+    for field in schema:
+        field_name = field.name
+        arrow_type_str = str(field.type)
+
+        # Get the base type name from PyArrow field type
+        base_type_name = get_base_arrow_type_name(field.type)
+
+        # Get compatible readers for this (arrow_type, writer_dataset_type, content_type) combination
+        compatible_readers = get_compatible_readers(
+            base_type_name,
+            writer_type_str,
+            content_type_str,
+        )
+
+        # Check if all supported reader types are compatible
+        for required_reader in supported_reader_types:
+            reader_is_compatible = required_reader in compatible_readers
+
+            # Special case: PYARROW_PARQUET is equivalent to PYARROW for compatibility if we're writing parquet
+            if (
+                not reader_is_compatible
+                and content_type == ContentType.PARQUET
+                and required_reader == DatasetType.PYARROW_PARQUET
+            ):
+                reader_is_compatible = DatasetType.PYARROW in compatible_readers
+
+            if not reader_is_compatible:
+                incompatible_fields.append(
+                    {
+                        "field_name": field_name,
+                        "arrow_type": arrow_type_str,
+                        "incompatible_reader": required_reader,
+                        "writer_type": writer_dataset_type,
+                        "content_type": content_type,
+                    }
+                )
+
+    # Raise error if any incompatibilities found
+    if incompatible_fields:
+        error_details = []
+        for incompatible in incompatible_fields:
+            error_details.append(
+                f"Field '{incompatible['field_name']}' with type '{incompatible['arrow_type']}' "
+                f"written by {incompatible['writer_type']} to {incompatible['content_type']} "
+                f"cannot be read by required reader type {incompatible['incompatible_reader']}. "
+                f"If you expect this write to succeed and this reader is not required, then it "
+                f"can be removed from the table's supported reader types property."
+            )
+
+        raise TableValidationError(
+            f"Reader compatibility validation failed. The following fields would break "
+            f"supported reader types:\n" + "\n".join(error_details)
+        )
+
+
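The error message above points writers at the table's supported reader types property. A minimal, hypothetical sketch of how a caller could relax that property through the `alter_table` API added later in this file (the table and namespace names are placeholders; clearing the property to `None` mirrors how this module disables the check for schemaless tables):

    # Hypothetical: disable reader-compatibility validation for one table version by
    # clearing TableProperty.SUPPORTED_READER_TYPES (mirrors the schemaless-table default).
    alter_table(
        "events",                                  # placeholder table name
        namespace="analytics",                     # placeholder namespace
        table_version_properties={TableProperty.SUPPORTED_READER_TYPES: None},
    )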
+def _stage_commit_and_compact(
+    converted_data: Dataset,
+    partition,
+    delta_type: DeltaType,
+    content_type: ContentType,
+    commit_staged_partition: bool,
+    table_version_obj: TableVersion,
+    namespace: str,
+    table: str,
+    schema: Schema,
+    original_fields: Set[str],
+    **kwargs,
+) -> None:
+    """Stage and commit delta, then handle compaction if needed."""
+    # Remove schema from kwargs to avoid duplicate parameter conflict
+    # We explicitly pass the correct schema parameter
+    kwargs.pop("schema", None)
+
+    # Stage a delta with the data
+    delta = _get_storage(**kwargs).stage_delta(
+        data=converted_data,
+        partition=partition,
+        delta_type=delta_type,
+        content_type=content_type,
+        author=ManifestAuthor.of(
+            name="deltacat.write_to_table", version=dc.__version__
+        ),
+        schema=schema,
+        **kwargs,
+    )
+
+    delta = _get_storage(**kwargs).commit_delta(delta=delta, **kwargs)
+
+    if commit_staged_partition:
+        _get_storage(**kwargs).commit_partition(partition=partition, **kwargs)
+
+    # Check compaction trigger decision
+    should_compact = _trigger_compaction(
+        table_version_obj,
+        delta,
+        TableReadOptimizationLevel.MAX,
         **kwargs,
     )
+    if should_compact:
+        # Run V2 compaction session to merge or delete data
+        if table_version_obj.schema:
+            all_column_names = table_version_obj.schema.arrow.names
+        else:
+            raise RuntimeError("Table version schema is required to run compaction.")
+        _run_compaction_session(
+            table_version_obj=table_version_obj,
+            partition=partition,
+            latest_delta_stream_position=delta.stream_position,
+            namespace=namespace,
+            table=table,
+            original_fields=original_fields,
+            all_column_names=all_column_names,
+            **kwargs,
+        )
+
+
+def _trigger_compaction(
+    table_version_obj: TableVersion,
+    latest_delta: Optional[Delta],
+    target_read_optimization_level: TableReadOptimizationLevel,
+    **kwargs,
+) -> bool:
+    # Import inside function to avoid circular imports
+    from deltacat.compute.compactor.utils import round_completion_reader as rci
+
+    # Extract delta type from latest_delta if available, otherwise default to no compaction
+    if latest_delta is not None:
+        delta_type = latest_delta.type
+        partition_values = latest_delta.partition_locator.partition_values
+        logger.info(
+            f"Using delta type {delta_type} from latest delta {latest_delta.locator}"
+        )
+    else:
+        logger.info(f"No latest delta discovered, defaulting to no compaction.")
+        return False
+
+    if (
+        table_version_obj.read_table_property(TableProperty.READ_OPTIMIZATION_LEVEL)
+        == target_read_optimization_level
+    ):
+        if delta_type == DeltaType.DELETE or delta_type == DeltaType.UPSERT:
+            return True
+        elif delta_type == DeltaType.APPEND:
+            # Get default stream to determine partition locator
+            stream = _get_table_stream(
+                table_version_obj.locator.namespace,
+                table_version_obj.locator.table_name,
+                table_version_obj.locator.table_version,
+                **kwargs,
+            )
+
+            if not stream:
+                return False
+
+            # Use provided partition_values or None for unpartitioned tables
+            partition_locator = PartitionLocator.of(
+                stream_locator=stream.locator,
+                partition_values=partition_values,
+                partition_id=None,
+            )
+
+            # Get round completion info to determine high watermark
+            round_completion_info = rci.read_round_completion_info(
+                source_partition_locator=partition_locator,
+                destination_partition_locator=partition_locator,
+                deltacat_storage=_get_storage(**kwargs),
+                deltacat_storage_kwargs=kwargs,
+            )
+
+            high_watermark = (
+                round_completion_info.high_watermark
+                if round_completion_info
+                and isinstance(round_completion_info.high_watermark, int)
+                else 0
+            )
+
+            # Get all deltas appended since last compaction
+            deltas = _get_storage(**kwargs).list_deltas(
+                namespace=table_version_obj.locator.namespace,
+                table_name=table_version_obj.locator.table_name,
+                table_version=table_version_obj.locator.table_version,
+                partition_values=partition_values,
+                start_stream_position=high_watermark + 1,
+                **kwargs,
+            )
+
+            if not deltas:
+                return False
+
+            # Count deltas appended since last compaction
+            appended_deltas_since_last_compaction = len(deltas)
+            delta_trigger = table_version_obj.read_table_property(
+                TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER
+            )
+            if delta_trigger and appended_deltas_since_last_compaction >= delta_trigger:
+                return True
+
+            # Count files appended since last compaction
+            appended_files_since_last_compaction = 0
+            for delta in deltas:
+                if delta.manifest and delta.manifest.entries:
+                    appended_files_since_last_compaction += len(delta.manifest.entries)
+
+            file_trigger = table_version_obj.read_table_property(
+                TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER
+            )
+            if file_trigger and appended_files_since_last_compaction >= file_trigger:
+                return True
+
+            # Count records appended since last compaction
+            appended_records_since_last_compaction = 0
+            for delta in deltas:
+                if delta.meta and delta.meta.record_count:
+                    appended_records_since_last_compaction += delta.meta.record_count
+
+            record_trigger = table_version_obj.read_table_property(
+                TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER
+            )
+            if (
+                record_trigger
+                and appended_records_since_last_compaction >= record_trigger
+            ):
+                return True
+    return False
+
+
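In other words, for APPEND deltas the decision above reduces to three OR-ed threshold checks against table properties. A condensed restatement (not the implementation; `deltas` and the three trigger values are assumed to be resolved exactly as in the function above):

    appended_deltas = len(deltas)
    appended_files = sum(
        len(d.manifest.entries) for d in deltas if d.manifest and d.manifest.entries
    )
    appended_records = sum(
        d.meta.record_count for d in deltas if d.meta and d.meta.record_count
    )
    should_compact = bool(
        (delta_trigger and appended_deltas >= delta_trigger)
        or (file_trigger and appended_files >= file_trigger)
        or (record_trigger and appended_records >= record_trigger)
    )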
+def _get_compaction_primary_keys(table_version_obj: TableVersion) -> set:
+    """Extract primary keys from table schema for compaction."""
+    table_schema = table_version_obj.schema
+    return (
+        set(table_schema.merge_keys)
+        if table_schema and table_schema.merge_keys
+        else set()
+    )
+
+
+def _get_compaction_hash_bucket_count(
+    partition: Partition, table_version_obj: TableVersion
+) -> int:
+    """Determine hash bucket count from previous compaction, table property, or default."""
+    # First check if we have a hash bucket count from previous compaction
+    if (
+        partition.compaction_round_completion_info
+        and partition.compaction_round_completion_info.hash_bucket_count
+    ):
+        hash_bucket_count = partition.compaction_round_completion_info.hash_bucket_count
+        logger.info(
+            f"Using hash bucket count {hash_bucket_count} from previous compaction"
+        )
+        return hash_bucket_count
+
+    # Otherwise use the table property for default compaction hash bucket count
+    hash_bucket_count = table_version_obj.read_table_property(
+        TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT
+    )
+    logger.info(f"Using hash bucket count {hash_bucket_count} from table property")
+    return hash_bucket_count
+
+
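The precedence above (round completion info from the previous compaction first, then the table property) can be summarized as a single expression; this is an illustrative equivalent, not a replacement:

    rci = partition.compaction_round_completion_info
    hash_bucket_count = (
        rci.hash_bucket_count
        if rci and rci.hash_bucket_count
        else table_version_obj.read_table_property(
            TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT
        )
    )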
+def _get_merge_order_sort_keys(table_version_obj: TableVersion):
+    """Extract sort keys from merge_order fields in schema for compaction.
+
+    Args:
+        table_version_obj: The table version containing schema
+
+    Returns:
+        List of SortKey objects from merge_order fields, or None if no merge_order fields are defined
+    """
+    if table_version_obj.schema:
+        return table_version_obj.schema.merge_order_sort_keys()
+    return None
+
+
+def _create_compaction_params(
+    table_version_obj: TableVersion,
+    partition: Partition,
+    latest_stream_position: int,
+    primary_keys: set,
+    hash_bucket_count: int,
+    original_fields: Set[str],
+    all_column_names: Optional[List[str]],
+    **kwargs,
+):
+    """Create compaction parameters for the compaction session."""
+    from deltacat.compute.compactor.model.compact_partition_params import (
+        CompactPartitionParams,
+    )
+
+    # Remove create_table/alter_table kwargs not needed for compaction
+    kwargs.pop("lifecycle_state", None)
+    kwargs.pop("schema", None)
+    kwargs.pop("partition_scheme", None)
+    kwargs.pop("sort_keys", None)
+    kwargs.pop("table_description", None)
+    kwargs.pop("table_version_description", None)
+    kwargs.pop("table_properties", None)
+    kwargs.pop("table_version_properties", None)
+    kwargs.pop("namespace_properties", None)
+    kwargs.pop("content_types", None)
+    kwargs.pop("fail_if_exists", None)
+    kwargs.pop("schema_updates", None)
+    kwargs.pop("partition_updates", None)
+    kwargs.pop("sort_scheme", None)
+
+    table_writer_kwargs = kwargs.pop("table_writer_kwargs", {})
+    table_writer_kwargs["schema"] = table_version_obj.schema
+    table_writer_kwargs["sort_scheme_id"] = table_version_obj.sort_scheme.id
+    deltacat_storage_kwargs = kwargs.pop("deltacat_storage_kwargs", {})
+    deltacat_storage_kwargs["transaction"] = kwargs.get("transaction", None)
+    list_deltas_kwargs = kwargs.pop("list_deltas_kwargs", {})
+    list_deltas_kwargs["transaction"] = kwargs.get("transaction", None)
+
+    return CompactPartitionParams.of(
+        {
+            "catalog": kwargs.get("inner", kwargs.get("catalog")),
+            "source_partition_locator": partition.locator,
+            "destination_partition_locator": partition.locator,  # In-place compaction
+            "primary_keys": primary_keys,
+            "last_stream_position_to_compact": latest_stream_position,
+            "deltacat_storage": _get_storage(**kwargs),
+            "deltacat_storage_kwargs": deltacat_storage_kwargs,
+            "list_deltas_kwargs": list_deltas_kwargs,
+            "table_writer_kwargs": table_writer_kwargs,
+            "hash_bucket_count": hash_bucket_count,
+            "records_per_compacted_file": table_version_obj.read_table_property(
+                TableProperty.RECORDS_PER_COMPACTED_FILE,
+            ),
+            "compacted_file_content_type": ContentType.PARQUET,
+            "drop_duplicates": True,
+            "sort_keys": _get_merge_order_sort_keys(table_version_obj),
+            "original_fields": original_fields,
+            "all_column_names": all_column_names,
+        }
+    )
+
+
+def _run_compaction_session(
+    table_version_obj: TableVersion,
+    partition: Partition,
+    latest_delta_stream_position: int,
+    namespace: str,
+    table: str,
+    original_fields: Set[str],
+    all_column_names: List[str],
+    **kwargs,
+) -> None:
+    """
+    Run a V2 compaction session for the given table and partition.
+
+    Args:
+        table_version_obj: The table version object
+        partition: The partition to compact
+        latest_delta_stream_position: Stream position of the latest delta
+        namespace: The table namespace
+        table: The table name
+        original_fields: The original field set for partial UPSERT support
+        **kwargs: Additional arguments including catalog and storage parameters
+    """
+    # Import inside function to avoid circular imports
+    from deltacat.compute.compactor_v2.compaction_session import compact_partition
+
+    try:
+        # Extract compaction configuration
+        primary_keys = _get_compaction_primary_keys(table_version_obj)
+        hash_bucket_count = _get_compaction_hash_bucket_count(
+            partition, table_version_obj
+        )
+
+        # Create compaction parameters
+        compact_partition_params = _create_compaction_params(
+            table_version_obj,
+            partition,
+            latest_delta_stream_position,
+            primary_keys,
+            hash_bucket_count,
+            original_fields=original_fields,
+            all_column_names=all_column_names,
+            **kwargs,
+        )
+
+        # Run V2 compaction session
+        compact_partition(params=compact_partition_params)
+    except Exception as e:
+        logger.error(
+            f"Error during compaction session for {namespace}.{table}, "
+            f"partition {partition.locator}: {e}"
+        )
+        raise
+
+
+def _get_merge_key_field_names_from_schema(schema) -> List[str]:
+    """Extract merge key field names from a DeltaCAT Schema object.
+
+    Args:
+        schema: DeltaCAT Schema object
+
+    Returns:
+        List of field names that are marked as merge keys
+    """
+    if not schema or not schema.merge_keys:
+        return []
+
+    merge_key_field_names = []
+    field_ids_to_fields = schema.field_ids_to_fields
+
+    for merge_key_id in schema.merge_keys:
+        if merge_key_id in field_ids_to_fields:
+            field = field_ids_to_fields[merge_key_id]
+            merge_key_field_names.append(field.arrow.name)
+
+    return merge_key_field_names
+
+
+def _set_entry_params_if_needed(
+    mode: TableWriteMode, table_version_obj, kwargs: dict
+) -> None:
+    """Automatically set entry_params to merge keys if not already set by user.
+
+    Args:
+        mode: The table write mode
+        table_version_obj: The table version object containing schema
+        kwargs: Keyword arguments dictionary that may contain entry_params
+    """
+    # Only set entry_params for DELETE and MERGE modes
+    if mode not in [TableWriteMode.DELETE, TableWriteMode.MERGE]:
+        return
+
+    # Don't override if user already provided entry_params
+    if "entry_params" in kwargs and kwargs["entry_params"] is not None:
+        return
+
+    # Get schema from table version
+    if not table_version_obj or not table_version_obj.schema:
+        return
+
+    # Extract merge key field names
+    merge_key_field_names = _get_merge_key_field_names_from_schema(
+        table_version_obj.schema
+    )
+
+    if merge_key_field_names:
+        from deltacat.storage import EntryParams
+
+        kwargs["entry_params"] = EntryParams.of(merge_key_field_names)
 
-
+
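For example, for a table whose schema declares `id` as its only merge key, a MERGE or DELETE write that does not pass `entry_params` ends up with kwargs populated as sketched below (hypothetical values; `EntryParams.of` is the same constructor used above):

    kwargs = {}
    _set_entry_params_if_needed(TableWriteMode.MERGE, table_version_obj, kwargs)
    # kwargs["entry_params"] == EntryParams.of(["id"]) when "id" is the schema's merge key;
    # APPEND writes and explicit user-provided entry_params are left untouched.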
+def _get_table_stream(namespace: str, table: str, table_version: str, **kwargs):
+    """Helper function to get a stream for a table version."""
+    return _get_storage(**kwargs).get_stream(
         namespace=namespace,
         table_name=table,
         table_version=table_version,
         **kwargs,
     )
-    table_version = table_version_obj.table_version
 
+
+def _validate_read_table_input(
+    namespace: str,
+    table: str,
+    table_schema: Optional[Schema],
+    table_type: Optional[DatasetType],
+    distributed_dataset_type: Optional[DatasetType],
+) -> None:
+    """Validate input parameters for read_table operation."""
     if (
-
-
+        distributed_dataset_type
+        and distributed_dataset_type not in DatasetType.distributed()
     ):
         raise ValueError(
-            "
-            f"
+            f"{distributed_dataset_type} is not a valid distributed dataset type. "
+            f"Valid distributed dataset types are: {DatasetType.distributed()}."
+        )
+    if table_type and table_type not in DatasetType.local():
+        raise ValueError(
+            f"{table_type} is not a valid local table type. "
+            f"Valid table types are: {DatasetType.local()}."
         )
 
+    # For schemaless tables, distributed datasets are not yet supported
+    if table_schema is None and distributed_dataset_type:
+        raise NotImplementedError(
+            f"Distributed dataset reading is not yet supported for schemaless tables. "
+            f"Table '{namespace}.{table}' has no schema, but distributed_dataset_type={distributed_dataset_type} was specified. "
+            f"Please use local storage by setting distributed_dataset_type=None."
+        )
+
+
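An illustrative (hypothetical) set of inputs and the outcomes implied by the checks above; the DatasetType memberships assumed in the comments are inferred from how this module uses them, not confirmed by this diff:

    # Schemaless table read requested as a distributed dataset -> NotImplementedError
    _validate_read_table_input("ns", "t", None, None, DatasetType.DAFT)
    # Schema present, PYARROW as a local table type, no distributed type -> returns None
    _validate_read_table_input("ns", "t", table_schema, DatasetType.PYARROW, None)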
+def _get_qualified_deltas_for_read(
+    table: str,
+    namespace: str,
+    table_version: str,
+    partition_filter: Optional[List[Union[Partition, PartitionLocator]]],
+    **kwargs,
+) -> List[Delta]:
+    """Get qualified deltas for reading based on partition filter."""
     logger.info(
         f"Reading metadata for table={namespace}/{table}/{table_version} "
-        f"with partition_filters={partition_filter}
-        f" range={stream_position_range_inclusive}"
+        f"with partition_filters={partition_filter}."
     )
 
+    # Get partition filter if not provided
     if partition_filter is None:
-
-
-            "as partition_filter was None."
-        )
-        partition_filter = (
-            _get_storage(**kwargs)
-            .list_partitions(
-                table_name=table,
-                namespace=namespace,
-                table_version=table_version,
-                **kwargs,
-            )
-            .all_items()
+        partition_filter = _get_all_committed_partitions(
+            table, namespace, table_version, **kwargs
         )
 
+    # Get deltas from partitions
     qualified_deltas = _get_deltas_from_partition_filter(
-        stream_position_range_inclusive=stream_position_range_inclusive,
         partition_filter=partition_filter,
         **kwargs,
     )
@@ -159,30 +1406,390 @@ def read_table(
         f"from {len(partition_filter)} partitions."
     )
 
-
-
-
-
-
-
-
+    return qualified_deltas
+
+
+def _get_max_parallelism(
+    max_parallelism: Optional[int],
+    distributed_dataset_type: Optional[DatasetType],
+) -> int:
+    """Get the max parallelism for a read operation."""
+    if distributed_dataset_type:
+        max_parallelism = max_parallelism or 100
+    else:
+        # TODO(pdames): Set max parallelism using available resources and dataset size
+        max_parallelism = 1
+    if max_parallelism < 1:
+        raise ValueError(
+            f"max_parallelism must be greater than 0, but got {max_parallelism}"
+        )
+    logger.info(f"Using max_parallelism={max_parallelism} for read operation")
+
+    return max_parallelism
+
+
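Concretely, the defaulting above behaves as follows (illustrative calls; treating DatasetType.DAFT as a distributed dataset type is an assumption consistent with its use elsewhere in this file):

    _get_max_parallelism(None, DatasetType.DAFT)  # -> 100 (distributed default)
    _get_max_parallelism(8, DatasetType.DAFT)     # -> 8 (explicit value wins for distributed reads)
    _get_max_parallelism(8, None)                 # -> 1 (local reads are currently forced to 1)
    _get_max_parallelism(-1, DatasetType.DAFT)    # -> ValueError (must be greater than 0)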
+def _handle_schemaless_table_read(
+    qualified_deltas: List[Delta],
+    read_as: DatasetType,
+    **kwargs,
+) -> Dataset:
+    """Handle reading schemaless tables by flattening manifest entries."""
+    # Create a PyArrow table for each delta
+    # TODO(pdames): More efficient implementation for tables with millions/billions of entries
+    tables = []
+    for delta in qualified_deltas:
+        # Get the manifest for this delta
+        if delta.manifest:
+            manifest = delta.manifest
+        else:
+            # Fetch manifest from storage
+            manifest = _get_storage(**kwargs).get_delta_manifest(
+                delta.locator,
+                transaction=kwargs.get("transaction"),
+                **kwargs,
+            )
+        # Create flattened table from this delta's manifest
+        table = pa_utils.delta_manifest_to_table(
+            manifest,
+            delta,
+        )
+        tables.append(table)
+
+    # Concatenate all PyArrow tables
+    final_table = pa_utils.concat_tables(tables)
+
+    # Convert from PyArrow to the requested dataset type
+    return from_pyarrow(final_table, read_as)
+
+
+def _download_and_process_table_data(
+    namespace: str,
+    table: str,
+    qualified_deltas: List[Delta],
+    read_as: DatasetType,
+    max_parallelism: Optional[int],
+    columns: Optional[List[str]],
+    file_path_column: Optional[str],
+    table_version_obj: Optional[TableVersion],
+    **kwargs,
+) -> Dataset:
+    """Download delta data and process result based on storage type."""
+
+    # Handle NUMPY read requests by translating to PANDAS internally
+    original_read_as = read_as
+    effective_read_as = read_as
+    if read_as == DatasetType.NUMPY:
+        effective_read_as = DatasetType.PANDAS
+        logger.debug("Translating NUMPY read request to PANDAS for internal processing")
+
+    # Merge deltas and download data
+    if not qualified_deltas:
+        # Return empty table with original read_as type
+        return empty_table(original_read_as)
+
+    # Special handling for non-empty schemaless tables
+    if table_version_obj.schema is None:
+        result = _handle_schemaless_table_read(
+            qualified_deltas,
+            effective_read_as,
+            **kwargs,
+        )
+        # Convert to numpy if original request was for numpy
+        if original_read_as == DatasetType.NUMPY:
+            return _convert_pandas_to_numpy(result)
+        return result
+
+    # Get schemas for each manifest entry
+    entry_index_to_schema = _build_entry_index_to_schema_mapping(
+        qualified_deltas, table_version_obj, **kwargs
+    )
+    # Standard non-empty schema table read path - merge deltas and download data
+    merged_delta = Delta.merge_deltas(qualified_deltas)
+
+    # Convert read parameters to download parameters
+    table_type = (
+        effective_read_as
+        if effective_read_as in DatasetType.local()
+        else (kwargs.pop("table_type", None) or DatasetType.PYARROW)
+    )
+    distributed_dataset_type = (
+        effective_read_as if effective_read_as in DatasetType.distributed() else None
+    )
+
+    # Validate input parameters
+    _validate_read_table_input(
+        namespace,
+        table,
+        table_version_obj.schema,
+        table_type,
+        distributed_dataset_type,
+    )
+
+    # Determine max parallelism
+    max_parallelism = _get_max_parallelism(
+        max_parallelism,
+        distributed_dataset_type,
+    )
+    # Filter out parameters that are already passed as keyword arguments
+    # to avoid "multiple values for argument" errors
+    filtered_kwargs = {
+        k: v
+        for k, v in kwargs.items()
+        if k
+        not in [
+            "delta_like",
+            "table_type",
+            "storage_type",
+            "max_parallelism",
+            "columns",
+            "distributed_dataset_type",
+            "file_path_column",
+        ]
+    }
+    result = _get_storage(**kwargs).download_delta(
+        merged_delta,
+        table_type=effective_read_as,
+        storage_type=StorageType.DISTRIBUTED
+        if distributed_dataset_type
+        else StorageType.LOCAL,
+        max_parallelism=max_parallelism,
+        columns=columns,
+        distributed_dataset_type=distributed_dataset_type,
+        file_path_column=file_path_column,
+        **filtered_kwargs,
+    )
+
+    # Handle local storage table concatenation and PYARROW_PARQUET lazy materialization
+    if not distributed_dataset_type and table_type and isinstance(result, list):
+        if table_type == DatasetType.PYARROW_PARQUET:
+            # For PYARROW_PARQUET, preserve lazy materialization:
+            return result[0] if len(result) == 1 else result
+        else:
+            # For other types, perform normal concatenation
+            result = _handle_local_table_concatenation(
+                result,
+                table_type,
+                table_version_obj.schema,
+                entry_index_to_schema,
+                file_path_column,
+                columns,
+            )
+    # Convert to numpy if original request was for numpy
+    if original_read_as == DatasetType.NUMPY:
+        return _convert_pandas_to_numpy(result)
+
+    return result
+
+
+def _convert_pandas_to_numpy(dataset: Dataset):
+    """Convert pandas DataFrame to numpy ndarray."""
+    if not isinstance(dataset, pd.DataFrame):
+        raise ValueError(f"Expected pandas DataFrame but found {type(dataset)}")
+    return dataset.to_numpy()
+
+
+def _coerce_dataset_to_schema(
+    dataset: Dataset, target_schema: pa.Schema, manifest_entry_schema: Schema
+) -> Dataset:
+    """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
+    # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
+    deltacat_schema = Schema.of(schema=target_schema)
+    return deltacat_schema.coerce(dataset, manifest_entry_schema)
+
+
+def _coerce_results_to_schema(
+    results: Dataset, target_schema: pa.Schema, entry_index_to_schema: List[Schema]
+) -> List[Dataset]:
+    """Coerce all table results to match the target schema."""
+    coerced_results = []
+    for i, table_result in enumerate(results):
+        coerced_result = _coerce_dataset_to_schema(
+            table_result, target_schema, entry_index_to_schema[i]
+        )
+        coerced_results.append(coerced_result)
+        logger.debug(f"Coerced table {i} to unified schema")
+    return coerced_results
+
+
+def _create_target_schema(
+    arrow_schema: pa.Schema,
+    columns: Optional[List[str]] = None,
+    file_path_column: Optional[str] = None,
+) -> pa.Schema:
+    """Create target schema for concatenation with optional column selection and file_path_column."""
+    if columns is not None:
+        # Column selection - use only specified columns
+        field_map = {field.name: field for field in arrow_schema}
+        selected_fields = []
+
+        for col_name in columns:
+            if col_name in field_map:
+                selected_fields.append(field_map[col_name])
+        arrow_schema = pa.schema(selected_fields)
+    if file_path_column and file_path_column not in arrow_schema.names:
+        arrow_schema = arrow_schema.append(pa.field(file_path_column, pa.string()))
+    return arrow_schema
+
+
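A small, self-contained PyArrow illustration of the schema shaping performed above (column selection first, then appending the file-path column when requested); the field names here are hypothetical:

    import pyarrow as pa

    base = pa.schema([pa.field("id", pa.int64()), pa.field("value", pa.float64())])
    target = _create_target_schema(base, columns=["id"], file_path_column="source_file")
    # target.names == ["id", "source_file"]; "source_file" is appended as a string field,
    # and names in `columns` that are missing from the base schema are silently skipped.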
+def _create_entry_schemas_for_concatenation(
+    entry_index_to_schema: List[Schema],
+    columns: Optional[List[str]] = None,
+    file_path_column: Optional[str] = None,
+) -> List[Schema]:
+    """Create entry schemas for concatenation, optionally filtered by column selection."""
+    if columns is None:
+        # No column selection - return original schemas as-is
+        return entry_index_to_schema
+
+    # Column selection - filter each entry schema
+    modified_schemas = []
+    for entry_schema in entry_index_to_schema:
+        if entry_schema and entry_schema.arrow:
+            filtered_schema = _create_target_schema(
+                entry_schema.arrow, columns, file_path_column
+            )
+            modified_schemas.append(Schema.of(schema=filtered_schema))
+        else:
+            modified_schemas.append(entry_schema)
+
+    return modified_schemas
+
+
+def _handle_local_table_concatenation(
+    results: Dataset,
+    table_type: DatasetType,
+    table_schema: Optional[Schema],
+    entry_index_to_schema: List[Schema],
+    file_path_column: Optional[str] = None,
+    columns: Optional[List[str]] = None,
+) -> Dataset:
+    """Handle concatenation of local table results with schema coercion."""
+    logger.debug(f"Target table schema for concatenation: {table_schema}")
+
+    # Create target schema for coercion, respecting column selection
+    target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
+    logger.debug(f"Created target schema: {target_schema.names}")
+
+    # Filter entry schemas to match column selection and file_path_column
+    modified_entry_schemas = _create_entry_schemas_for_concatenation(
+        entry_index_to_schema, columns, file_path_column
+    )
+
+    # Coerce results to unified schema
+    coerced_results = _coerce_results_to_schema(
+        results, target_schema, modified_entry_schemas
     )
 
-
-
+    # Second step: concatenate the coerced results
+    logger.debug(
+        f"Concatenating {len(coerced_results)} local tables of type {table_type} with unified schemas"
     )
+    concatenated_result = concat_tables(coerced_results, table_type)
+    logger.debug(f"Concatenation complete, result type: {type(concatenated_result)}")
+    return concatenated_result
+
+
+def read_table(
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    read_as: DatasetType = DatasetType.DAFT,
+    partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
+    max_parallelism: Optional[int] = None,
+    columns: Optional[List[str]] = None,
+    file_path_column: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Dataset:
+    """Read a table into a dataset.
+
+    Args:
+        table: Name of the table to read.
+        namespace: Optional namespace of the table. Uses default if not specified.
+        table_version: Optional specific version of the table to read.
+        read_as: Dataset type to use for reading table files. Defaults to DatasetType.DAFT.
+        partition_filter: Optional list of partitions to read from.
+        max_parallelism: Optional maximum parallelism for data download. Defaults to the number of
+            available CPU cores for local dataset type reads (i.e., members of DatasetType.local())
+            and 100 for distributed dataset type reads (i.e., members of DatasetType.distributed()).
+        columns: Optional list of columns to include in the result.
+        file_path_column: Optional column name to add file paths to the result.
+        transaction: Optional transaction to chain this read operation to. If provided, uncommitted
+            changes from the transaction will be visible to this read operation.
+        **kwargs: Additional keyword arguments.
+
+    Returns:
+        Dataset containing the table data.
+    """
+    # Set up transaction handling
+    read_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = read_transaction
+
+    try:
+        # Resolve namespace and get table metadata
+        namespace = namespace or default_namespace()
+
+        table_version_obj = _get_latest_active_or_given_table_version(
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version,
+            **kwargs,
+        )
+
+        # Get partitions and deltas to read
+        qualified_deltas = _get_qualified_deltas_for_read(
+            table,
+            namespace,
+            table_version_obj.table_version,
+            partition_filter,
+            **kwargs,
+        )
+
+        # Download and process the data
+        # TODO(pdames): Remove once we implement a custom SerDe for pa.ParquetFile
+        if read_as == DatasetType.PYARROW_PARQUET:
+            max_parallelism = 1
+            logger.warning(
+                f"Forcing max_parallelism to 1 for PyArrow Parquet reads to avoid serialization errors."
+            )
+        result = _download_and_process_table_data(
+            namespace,
+            table,
+            qualified_deltas,
+            read_as,
+            max_parallelism,
+            columns,
+            file_path_column,
+            table_version_obj,
+            **kwargs,
+        )
+        return result
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during read_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            read_transaction.seal()
 
 
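A hypothetical call site for the `read_table` API above (table, namespace, and column names are placeholders; using DatasetType.PANDAS as a local read type is consistent with the NUMPY-to-PANDAS translation earlier in this file):

    df = read_table(
        "events",                      # placeholder table name
        namespace="analytics",         # placeholder namespace
        read_as=DatasetType.PANDAS,    # local read; defaults to DatasetType.DAFT otherwise
        columns=["id", "value"],       # optional column projection
        file_path_column="source_file",
    )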
 def alter_table(
     table: str,
     *args,
     namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
     lifecycle_state: Optional[LifecycleState] = None,
-    schema_updates: Optional[
+    schema_updates: Optional[SchemaUpdate] = None,
     partition_updates: Optional[Dict[str, Any]] = None,
-
-
-
+    sort_scheme: Optional[SortScheme] = None,
+    table_description: Optional[str] = None,
+    table_version_description: Optional[str] = None,
+    table_properties: Optional[TableProperties] = None,
+    table_version_properties: Optional[TableVersionProperties] = None,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """Alter deltacat table/table_version definition.
@@ -193,61 +1800,169 @@ def alter_table(
     Args:
         table: Name of the table to alter.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to alter. Defaults to the latest active version.
         lifecycle_state: New lifecycle state for the table.
-        schema_updates:
-        partition_updates:
-
-
-
+        schema_updates: Schema updates to apply.
+        partition_updates: Partition scheme updates to apply.
+        sort_scheme: New sort scheme.
+        table_description: New description for the table.
+        table_version_description: New description for the table version. Defaults to `table_description` if not specified.
+        table_properties: New table properties.
+        table_version_properties: New table version properties. Defaults to the current parent table properties if not specified.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         None
 
     Raises:
         TableNotFoundError: If the table does not already exist.
+        TableVersionNotFoundError: If the specified table version or active table version does not exist.
     """
+    resolved_table_properties = None
+    if table_properties is not None:
+        resolved_table_properties = _add_default_table_properties(table_properties)
+        _validate_table_properties(resolved_table_properties)
+
     namespace = namespace or default_namespace()
 
-
-
-
-        table_name=table,
-        description=description,
-        properties=properties,
-        lifecycle_state=lifecycle_state,
-        **kwargs,
-    )
+    # Set up transaction handling
+    alter_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = alter_transaction
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        if partition_updates:
+            raise NotImplementedError("Partition updates are not yet supported.")
+        if sort_scheme:
+            raise NotImplementedError("Sort scheme updates are not yet supported.")
+
+        new_table: Table = _get_storage(**kwargs).update_table(
+            *args,
+            namespace=namespace,
+            table_name=table,
+            description=table_description,
+            properties=resolved_table_properties,
+            **kwargs,
+        )
+
+        if table_version is None:
+            table_version: Optional[TableVersion] = _get_storage(
+                **kwargs
+            ).get_latest_active_table_version(namespace, table, **kwargs)
+            if table_version is None:
+                raise TableVersionNotFoundError(
+                    f"No active table version found for table {namespace}.{table}. "
+                    "Please specify a table_version parameter."
+                )
+        else:
+            table_version = _get_storage(**kwargs).get_table_version(
+                namespace, table, table_version, **kwargs
+            )
+            if table_version is None:
+                raise TableVersionNotFoundError(
+                    f"Table version '{table_version}' not found for table {namespace}.{table}"
+                )
+
+        # Get table properties for schema evolution
+        schema_evolution_mode = table_version.read_table_property(
+            TableProperty.SCHEMA_EVOLUTION_MODE
+        )
+        if schema_updates and schema_evolution_mode == SchemaEvolutionMode.DISABLED:
+            raise TableValidationError(
+                "Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates."
+            )
+
+        # Only update table version properties if they are explicitly provided
+        resolved_tv_properties = None
+        if table_version_properties is not None:
+            # inherit properties from the parent table if not specified
+            default_tv_properties = new_table.properties
+            if table_version.schema is None:
+                # schemaless tables don't validate reader compatibility by default
+                default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+            resolved_tv_properties = _add_default_table_properties(
+                table_version_properties,
+                default_tv_properties,
+            )
+            _validate_table_properties(resolved_tv_properties)
+
+        # Apply schema updates if provided
+        updated_schema = None
+        if schema_updates is not None:
+            # Get the current schema from the table version
+            current_schema = table_version.schema
+            if current_schema != schema_updates.base_schema:
+                raise ValueError(
+                    f"Schema updates are not compatible with the current schema for table `{namespace}.{table}`. Current schema: {current_schema}, Schema update base schema: {schema_updates.base_schema}"
+                )
+
+            # Apply all the updates to get the final schema
+            updated_schema = schema_updates.apply()
+
+        _get_storage(**kwargs).update_table_version(
+            *args,
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version.id,
+            lifecycle_state=lifecycle_state,
+            description=table_version_description or table_description,
+            schema=updated_schema,
+            properties=resolved_tv_properties,  # This will be None if table_version_properties was not provided
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during alter_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            alter_transaction.seal()
+
+
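A hypothetical `alter_table` call exercising the signature above (names and descriptions are placeholders):

    alter_table(
        "events",
        namespace="analytics",
        table_version="2",            # defaults to the latest active version if omitted
        table_description="events table with the v2 column set",
    )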
+def _add_default_table_properties(
+    table_properties: Optional[TableProperties],
+    default_table_properties: TableProperties = TablePropertyDefaultValues,
+) -> TableProperties:
+    if table_properties is None:
+        table_properties = {}
+    for k, v in default_table_properties.items():
+        if k not in table_properties:
+            table_properties[k] = v
+    return table_properties
+
+
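The merge above is a plain "fill in missing keys" operation; for example (hypothetical property key and value):

    merged = _add_default_table_properties({"custom.owner": "data-eng"})
    # merged contains "custom.owner" plus every key from TablePropertyDefaultValues
    # that the caller did not set; caller-provided values are never overwritten.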
+def _validate_table_properties(
+    table_properties: TableProperties,
+) -> None:
+    read_optimization_level = table_properties.get(
+        TableProperty.READ_OPTIMIZATION_LEVEL,
+        TablePropertyDefaultValues[TableProperty.READ_OPTIMIZATION_LEVEL],
     )
+    if read_optimization_level != TableReadOptimizationLevel.MAX:
+        raise NotImplementedError(
+            f"Table read optimization level `{read_optimization_level} is not yet supported. Please use {TableReadOptimizationLevel.MAX}"
+        )
 
 
 def create_table(
-
+    table: str,
     *args,
     namespace: Optional[str] = None,
-
+    table_version: Optional[str] = None,
     lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
     schema: Optional[Schema] = None,
     partition_scheme: Optional[PartitionScheme] = None,
     sort_keys: Optional[SortScheme] = None,
-
+    table_description: Optional[str] = None,
+    table_version_description: Optional[str] = None,
     table_properties: Optional[TableProperties] = None,
+    table_version_properties: Optional[TableVersionProperties] = None,
     namespace_properties: Optional[NamespaceProperties] = None,
     content_types: Optional[List[ContentType]] = None,
     fail_if_exists: bool = True,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> TableDefinition:
     """Create an empty table in the catalog.
@@ -255,20 +1970,22 @@ def create_table(
     If a namespace isn't provided, the table will be created within the default deltacat namespace.
     Additionally if the provided namespace does not exist, it will be created for you.
 
-
     Args:
-
+        table: Name of the table to create.
         namespace: Optional namespace for the table. Uses default namespace if not specified.
         version: Optional version identifier for the table.
         lifecycle_state: Lifecycle state of the new table. Defaults to ACTIVE.
         schema: Schema definition for the table.
         partition_scheme: Optional partitioning scheme for the table.
         sort_keys: Optional sort keys for the table.
-
+        table_description: Optional description of the table.
+        table_version_description: Optional description for the table version.
         table_properties: Optional properties for the table.
+        table_version_properties: Optional properties for the table version. Defaults to the current parent table properties if not specified.
         namespace_properties: Optional properties for the namespace if it needs to be created.
         content_types: Optional list of allowed content types for the table.
         fail_if_exists: If True, raises an error if table already exists. If False, returns existing table.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         TableDefinition object for the created or existing table.
@@ -277,56 +1994,133 @@ def create_table(
         TableAlreadyExistsError: If the table already exists and fail_if_exists is True.
         NamespaceNotFoundError: If the provided namespace does not exist.
     """
+    resolved_table_properties = _add_default_table_properties(table_properties)
+    # Note: resolved_tv_properties will be set after checking existing table
+
     namespace = namespace or default_namespace()
 
-
-
-
-        raise TableAlreadyExistsError(f"Table {namespace}.{name} already exists")
-        return table
+    # Set up transaction handling
+    create_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = create_transaction
 
-
-
-
+    try:
+        existing_table = (
+            get_table(
+                table,
+                namespace=namespace,
+                table_version=table_version,
+                *args,
+                **kwargs,
+            )
+            if "existing_table_definition" not in kwargs
+            else kwargs["existing_table_definition"]
         )
+        if existing_table is not None:
+            if existing_table.table_version and existing_table.stream:
+                if fail_if_exists:
+                    table_identifier = (
+                        f"{namespace}.{table}"
+                        if not table_version
+                        else f"{namespace}.{table}.{table_version}"
+                    )
+                    raise TableAlreadyExistsError(
+                        f"Table {table_identifier} already exists"
+                    )
+                return existing_table
+            # the table exists but the table version doesn't - inherit the existing table properties
+            # Also ensure table properties are inherited when not explicitly provided
+            if table_properties is None:
+                resolved_table_properties = existing_table.table.properties
+
+            # Set up table version properties based on existing table or explicit properties
+            default_tv_properties = resolved_table_properties
+            if schema is None:
+                default_tv_properties = dict(
+                    default_tv_properties
+                )  # Make a copy to avoid modifying original
+                default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+            resolved_tv_properties = _add_default_table_properties(
+                table_version_properties, default_tv_properties
+            )
+        else:
+            # create the namespace if it doesn't exist
+            if not namespace_exists(namespace, **kwargs):
+                create_namespace(
+                    namespace=namespace,
+                    properties=namespace_properties,
+                    *args,
+                    **kwargs,
+                )
+
+            # Set up table version properties for new table
+            default_tv_properties = resolved_table_properties
+            if schema is None:
+                default_tv_properties = dict(
+                    default_tv_properties
+                )  # Make a copy to avoid modifying original
+                default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+            resolved_tv_properties = _add_default_table_properties(
+                table_version_properties, default_tv_properties
+            )
 
-
-        *args,
-        namespace=namespace,
-        table_name=name,
-        table_version=version,
-        schema=schema,
-        partition_scheme=partition_scheme,
-        sort_keys=sort_keys,
-        table_version_description=description,
-        table_description=description,
-        table_properties=table_properties,
-        lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
-        supported_content_types=content_types,
-        **kwargs,
-    )
+        _validate_table_properties(resolved_tv_properties)
 
-
-
-
-
-
+        (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version,
+            schema=schema,
+            partition_scheme=partition_scheme,
+            sort_keys=sort_keys,
+            table_version_description=table_version_description
+            if table_version_description is not None
+            else table_description,
+            table_description=table_description,
+            table_properties=resolved_table_properties,
+            table_version_properties=resolved_tv_properties,
+            lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
+            supported_content_types=content_types,
+            *args,
+            **kwargs,
+        )
+
+        result = TableDefinition.of(
+            table=table,
+            table_version=table_version,
+            stream=stream,
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during create_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            create_transaction.seal()
 
 
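A hypothetical `create_table` call matching the signature and docstring above (the schema construction is a sketch; `Schema.of` accepting a PyArrow schema is consistent with its use elsewhere in this file):

    import pyarrow as pa

    definition = create_table(
        "events",
        namespace="analytics",
        schema=Schema.of(schema=pa.schema([("id", pa.int64()), ("value", pa.float64())])),
        fail_if_exists=False,   # return the existing table instead of raising
    )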
 def drop_table(
-
+    table: str,
     *args,
     namespace: Optional[str] = None,
     table_version: Optional[str] = None,
     purge: bool = False,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """Drop a table from the catalog and optionally purges underlying data.
 
     Args:
-
+        table: Name of the table to drop.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional table version of the table to drop. If not specified, the parent table of all
+            table versions will be dropped.
         purge: If True, permanently delete the table data. If False, only remove from catalog.
+        transaction: Optional transaction to use. If None, creates a new transaction.
 
     Returns:
         None
@@ -341,17 +2135,56 @@ def drop_table(
         raise NotImplementedError("Purge flag is not currently supported.")
 
     namespace = namespace or default_namespace()
-
-
-    )
+
+    # Set up transaction handling
+    drop_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = drop_transaction
+
+    try:
+        if not table_version:
+            _get_storage(**kwargs).delete_table(
+                namespace=namespace,
+                table_name=table,
+                purge=purge,
+                *args,
+                **kwargs,
+            )
+        else:
+            _get_storage(**kwargs).update_table_version(
+                namespace=namespace,
+                table_name=table,
+                table_version=table_version,
+                lifecycle_state=LifecycleState.DELETED,
+                *args,
+                **kwargs,
+            )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during drop_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            drop_transaction.seal()
 
 
-def refresh_table(
+def refresh_table(
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> None:
     """Refresh metadata cached on the Ray cluster for the given table.

     Args:
         table: Name of the table to refresh.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to refresh.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -360,32 +2193,79 @@ def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs)


 def list_tables(
-    *args,
+    *args,
+    namespace: Optional[str] = None,
+    table: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> ListResult[TableDefinition]:
     """List a page of table definitions.

     Args:
         namespace: Optional namespace to list tables from. Uses default namespace if not specified.
+        table: Optional table to list its table versions. If not specified, lists the latest active version of each table in the namespace.
+        transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.

     Returns:
         ListResult containing TableDefinition objects for tables in the namespace.
     """
     namespace = namespace or default_namespace()
-    tables = _get_storage(**kwargs).list_tables(*args, namespace=namespace, **kwargs)
-    table_definitions = [
-        get_table(*args, table.table_name, namespace, **kwargs)
-        for table in tables.all_items()
-    ]

-
+    # Set up transaction handling
+    list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = list_transaction
+
+    try:
+        if not table:
+            tables = _get_storage(**kwargs).list_tables(
+                namespace=namespace,
+                *args,
+                **kwargs,
+            )
+            table_definitions = [
+                get_table(table.table_name, namespace=namespace, *args, **kwargs)
+                for table in tables.all_items()
+            ]
+        else:
+            table_versions = _get_storage(**kwargs).list_table_versions(
+                namespace=namespace,
+                table_name=table,
+                *args,
+                **kwargs,
+            )
+            table_definitions = [
+                get_table(
+                    table,
+                    namespace=namespace,
+                    table_version=table_version.id,
+                    *args,
+                    **kwargs,
+                )
+                for table_version in table_versions.all_items()
+            ]
+
+        result = ListResult(items=table_definitions)
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during list_tables: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            list_transaction.seal()


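Editor's note: the new `table` argument switches `list_tables` between two modes: list every table in the namespace, or list the table versions of a single table. A toy, in-memory version of that branching is shown below; the dict-based "catalog" is invented for illustration and does not model the DeltaCAT storage interface.

```python
# Toy illustration of the two list_tables modes; the CATALOG dict is hypothetical.
from typing import Dict, List, Optional

CATALOG: Dict[str, Dict[str, List[str]]] = {
    "default": {"events": ["1", "2"], "users": ["1"]},  # namespace -> table -> versions
}


def list_tables_like(namespace: str = "default", table: Optional[str] = None) -> List[str]:
    tables = CATALOG[namespace]
    if not table:
        # No table given: one entry per table, at its latest version.
        return [f"{name}@{versions[-1]}" for name, versions in tables.items()]
    # Table given: one entry per version of that table.
    return [f"{table}@{version}" for version in tables[table]]


print(list_tables_like())                # ['events@2', 'users@1']
print(list_tables_like(table="events"))  # ['events@1', 'events@2']
```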
 def get_table(
-
+    table: str,
     *args,
     namespace: Optional[str] = None,
     table_version: Optional[str] = None,
     stream_format: StreamFormat = StreamFormat.DELTACAT,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Optional[TableDefinition]:
     """Get table definition metadata.
@@ -393,64 +2273,84 @@ def get_table(
     Args:
         name: Name of the table to retrieve.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
-        table_version: Optional specific version of the table to retrieve.
-
-
-            format if not specified.
+        table_version: Optional specific version of the table to retrieve. Defaults to the latest active version.
+        stream_format: Optional stream format to retrieve. Defaults to DELTACAT.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
-        Deltacat TableDefinition if the table exists, None otherwise.
-
-
-        TableVersionNotFoundError: If the table version does not exist.
-        StreamNotFoundError: If the stream does not exist.
+        Deltacat TableDefinition if the table exists, None otherwise. The table definition's table version will be
+        None if the requested version is not found. The table definition's stream will be None if the requested stream
+        format is not found.
     """
     namespace = namespace or default_namespace()
-    table: Optional[Table] = _get_storage(**kwargs).get_table(
-        *args, table_name=name, namespace=namespace, **kwargs
-    )

-
-
-
-    table_version: Optional[TableVersion] = _get_storage(**kwargs).get_table_version(
-        *args, namespace, name, table_version or table.latest_table_version, **kwargs
-    )
+    # Set up transaction handling
+    get_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = get_transaction

-
-
-
+    try:
+        table_obj: Optional[Table] = _get_storage(**kwargs).get_table(
+            table_name=table,
+            namespace=namespace,
+            *args,
+            **kwargs,
         )

-
-
-        namespace=namespace,
-        table_name=name,
-        table_version=table_version.id,
-        stream_format=stream_format,
-        **kwargs,
-    )
+        if table_obj is None:
+            return None

-
-
-
+        table_version_obj: Optional[TableVersion] = _get_storage(
+            **kwargs
+        ).get_table_version(
+            namespace,
+            table,
+            table_version or table_obj.latest_active_table_version,
+            *args,
+            **kwargs,
         )

-
-
-
-
-
+        stream = None
+        if table_version_obj:
+            stream = _get_storage(**kwargs).get_stream(
+                namespace=namespace,
+                table_name=table,
+                table_version=table_version_obj.id,
+                stream_format=stream_format,
+                *args,
+                **kwargs,
+            )
+
+        return TableDefinition.of(
+            table=table_obj,
+            table_version=table_version_obj,
+            stream=stream,
+        )
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during get_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            get_transaction.seal()


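Editor's note: per the updated docstring, `get_table` now degrades gracefully instead of raising: a missing table returns `None`, while a missing version or stream yields a definition whose corresponding field is `None`. The sketch below models only that lookup shape; `ToyDefinition`, `VERSIONS`, and `STREAMS` are invented for illustration.

```python
# Hypothetical sketch of the graceful-degradation lookup described above.
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple


@dataclass
class ToyDefinition:
    table: str
    table_version: Optional[str]
    stream: Optional[str]


VERSIONS: Dict[str, List[str]] = {"events": ["v1", "v2"]}            # table -> known versions
STREAMS: Dict[Tuple[str, str, str], str] = {("events", "v2", "deltacat"): "s1"}


def get_table_like(
    table: str,
    table_version: Optional[str] = None,
    stream_format: str = "deltacat",
) -> Optional[ToyDefinition]:
    if table not in VERSIONS:
        return None  # unknown table: no definition at all
    version = table_version or VERSIONS[table][-1]  # default to the latest version
    if version not in VERSIONS[table]:
        version = None  # unknown version: keep the definition, null the version
    stream = STREAMS.get((table, version, stream_format)) if version else None
    return ToyDefinition(table=table, table_version=version, stream=stream)


print(get_table_like("missing"))         # None
print(get_table_like("events", "v9"))    # ToyDefinition(table_version=None, stream=None, ...)
print(get_table_like("events"))          # ToyDefinition(table_version='v2', stream='s1', ...)
```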
 def truncate_table(
-    table: str,
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> None:
     """Truncate table data.

     Args:
         table: Name of the table to truncate.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to truncate. Defaults to the latest active version.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -459,7 +2359,12 @@ def truncate_table(


 def rename_table(
-    table: str,
+    table: str,
+    new_name: str,
+    *args,
+    namespace: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> None:
     """Rename an existing table.

@@ -467,6 +2372,7 @@ def rename_table(
         table: Current name of the table.
         new_name: New name for the table.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -475,71 +2381,219 @@ def rename_table(
         TableNotFoundError: If the table does not exist.
     """
     namespace = namespace or default_namespace()
-    _get_storage(**kwargs).update_table(
-        *args, table_name=table, new_table_name=new_name, namespace=namespace, **kwargs
-    )

+    # Set up transaction handling
+    rename_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = rename_transaction
+
+    try:
+        _get_storage(**kwargs).update_table(
+            table_name=table,
+            new_table_name=new_name,
+            namespace=namespace,
+            *args,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during rename_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            rename_transaction.seal()

-
+
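Editor's note: every catalog function in this diff takes the same optional `transaction` argument and only seals a transaction it created itself, which suggests callers can group several mutations and commit them together with a single seal. The sketch below illustrates that composition with a toy transaction class; `Txn`, `rename_like`, and `drop_like` are hypothetical and not the DeltaCAT `Transaction` API.

```python
# Hypothetical sketch: composing two catalog-style operations under one transaction.
from typing import List, Optional


class Txn:
    def __init__(self) -> None:
        self.ops: List[str] = []
        self.sealed = False

    def seal(self) -> None:
        self.sealed = True  # commit all recorded operations atomically


def rename_like(old: str, new: str, transaction: Optional[Txn] = None) -> None:
    txn, own = (transaction, False) if transaction else (Txn(), True)
    txn.ops.append(f"rename {old} -> {new}")
    if own:
        txn.seal()  # only seal a transaction this call created


def drop_like(name: str, transaction: Optional[Txn] = None) -> None:
    txn, own = (transaction, False) if transaction else (Txn(), True)
    txn.ops.append(f"drop {name}")
    if own:
        txn.seal()


txn = Txn()
rename_like("staging_events", "events", transaction=txn)
drop_like("events_backup", transaction=txn)
txn.seal()  # both operations become visible together
print(txn.ops, txn.sealed)
```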
+def table_exists(
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    stream_format: StreamFormat = StreamFormat.DELTACAT,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> bool:
     """Check if a table exists in the catalog.

     Args:
         table: Name of the table to check.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to check. Defaults to the latest active version.
+        stream_format: Optional stream format to check. Defaults to DELTACAT.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         True if the table exists, False otherwise.
     """
     namespace = namespace or default_namespace()
-    return _get_storage(**kwargs).table_exists(
-        *args, table_name=table, namespace=namespace, **kwargs
-    )

+    # Set up transaction handling
+    exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = exists_transaction

-
+    try:
+        table_obj = _get_storage(**kwargs).get_table(
+            namespace=namespace,
+            table_name=table,
+            *args,
+            **kwargs,
+        )
+        if table_obj is None:
+            return False
+        table_version = table_version or table_obj.latest_active_table_version
+        if not table_version:
+            return False
+        table_version_exists = _get_storage(**kwargs).table_version_exists(
+            namespace,
+            table,
+            table_version,
+            *args,
+            **kwargs,
+        )
+        if not table_version_exists:
+            return False
+        stream_exists = _get_storage(**kwargs).stream_exists(
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version,
+            stream_format=stream_format,
+            *args,
+            **kwargs,
+        )
+        return stream_exists
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during table_exists: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            exists_transaction.seal()
+
+
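Editor's note: the rewritten `table_exists` returns True only when the table, a resolvable table version, and a stream in the requested format all exist. A toy version of that cascade is below; the dict-backed lookups stand in for the storage layer and are purely illustrative.

```python
# Toy cascade mirroring the existence checks above; the dicts stand in for storage.
from typing import Dict, Optional, Tuple

TABLES: Dict[str, Optional[str]] = {"events": "v2", "archived": None}  # table -> latest active version
VERSIONS: Dict[Tuple[str, str], bool] = {("events", "v2"): True}
STREAMS: Dict[Tuple[str, str, str], bool] = {("events", "v2", "deltacat"): True}


def table_exists_like(
    table: str,
    table_version: Optional[str] = None,
    stream_format: str = "deltacat",
) -> bool:
    if table not in TABLES:
        return False                              # no table at all
    version = table_version or TABLES[table]
    if not version:
        return False                              # no active version to fall back on
    if not VERSIONS.get((table, version), False):
        return False                              # requested version missing
    return STREAMS.get((table, version, stream_format), False)  # finally, the stream


print(table_exists_like("events"))        # True
print(table_exists_like("archived"))      # False (no active version)
print(table_exists_like("events", "v9"))  # False (unknown version)
```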
+def list_namespaces(
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> ListResult[Namespace]:
     """List a page of table namespaces.

     Args:
-
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         ListResult containing Namespace objects.
     """
-
+    # Set up transaction handling
+    list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = list_transaction
+
+    try:
+        result = _get_storage(**kwargs).list_namespaces(*args, **kwargs)
+
+        return result

+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during list_namespaces: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            list_transaction.seal()

-
+
+def get_namespace(
+    namespace: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Optional[Namespace]:
     """Get metadata for a specific table namespace.

     Args:
         namespace: Name of the namespace to retrieve.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         Namespace object if the namespace exists, None otherwise.
     """
-
+    # Set up transaction handling
+    get_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = get_ns_transaction
+
+    try:
+        result = _get_storage(**kwargs).get_namespace(
+            *args, namespace=namespace, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during get_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            get_ns_transaction.seal()


-def namespace_exists(
+def namespace_exists(
+    namespace: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> bool:
     """Check if a namespace exists.

     Args:
         namespace: Name of the namespace to check.
+        transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.

     Returns:
         True if the namespace exists, False otherwise.
     """
-
+    # Set up transaction handling
+    exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = exists_transaction
+
+    try:
+        result = _get_storage(**kwargs).namespace_exists(
+            *args, namespace=namespace, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during namespace_exists: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            exists_transaction.seal()


 def create_namespace(
-    namespace: str,
+    namespace: str,
+    *args,
+    properties: Optional[NamespaceProperties] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> Namespace:
     """Create a new namespace.

     Args:
         namespace: Name of the namespace to create.
         properties: Optional properties for the namespace.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         Created Namespace object.
@@ -547,12 +2601,29 @@ def create_namespace(
     Raises:
         NamespaceAlreadyExistsError: If the namespace already exists.
     """
-
-
+    # Set up transaction handling
+    namespace_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = namespace_transaction

-
-
-
+    try:
+        if namespace_exists(namespace, **kwargs):
+            raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")
+
+        result = _get_storage(**kwargs).create_namespace(
+            *args, namespace=namespace, properties=properties, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during create_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            namespace_transaction.seal()


 def alter_namespace(
@@ -560,6 +2631,7 @@ def alter_namespace(
     *args,
     properties: Optional[NamespaceProperties] = None,
     new_namespace: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """Alter a namespace definition.
@@ -568,26 +2640,49 @@ def alter_namespace(
         namespace: Name of the namespace to alter.
         properties: Optional new properties for the namespace.
         new_namespace: Optional new name for the namespace.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
     """
-
-
-
-
-
-        **kwargs
-
+    # Set up transaction handling
+    alter_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = alter_ns_transaction
+
+    try:
+        _get_storage(**kwargs).update_namespace(
+            namespace=namespace,
+            properties=properties,
+            new_namespace=new_namespace,
+            *args,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during alter_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            alter_ns_transaction.seal()


-def drop_namespace(
+def drop_namespace(
+    namespace: str,
+    *args,
+    purge: bool = False,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> None:
     """Drop a namespace and all of its tables from the catalog.

     Args:
         namespace: Name of the namespace to drop.
-        purge: If True, permanently delete all
-            If False, only
+        purge: If True, permanently delete all table data in the namespace.
+            If False, only removes the namespace from the catalog.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -597,50 +2692,39 @@ def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None
     if purge:
         raise NotImplementedError("Purge flag is not currently supported.")

-
-
-
+    # Set up transaction handling
+    drop_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = drop_ns_transaction
+
+    try:
+        _get_storage(**kwargs).delete_namespace(
+            *args,
+            namespace=namespace,
+            purge=purge,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during drop_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            drop_ns_transaction.seal()


 def default_namespace(*args, **kwargs) -> str:
     """Return the default namespace for the catalog.

     Returns:
-
+        Name of the default namespace.
     """
-    return DEFAULT_NAMESPACE
-
-
-def _validate_read_table_args(
-    namespace: Optional[str] = None,
-    table_type: Optional[TableType] = None,
-    distributed_dataset_type: Optional[DistributedDatasetType] = None,
-    merge_on_read: Optional[bool] = None,
-    **kwargs,
-):
-    storage = _get_storage(**kwargs)
-    if storage is None:
-        raise ValueError(
-            "Catalog not initialized. Did you miss calling "
-            "initialize(ds=<deltacat_storage>)?"
-        )
-
-    if merge_on_read:
-        raise ValueError("Merge on read not supported currently.")
-
-    if table_type is not TableType.PYARROW:
-        raise ValueError("Only PYARROW table type is supported as of now")
-
-    if distributed_dataset_type is not DistributedDatasetType.DAFT:
-        raise ValueError("Only DAFT dataset type is supported as of now")
+    return DEFAULT_NAMESPACE

-    if namespace is None:
-        raise ValueError(
-            "namespace must be passed to uniquely identify a table in the catalog."
-        )

-
-def _get_latest_or_given_table_version(
+def _get_latest_active_or_given_table_version(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
@@ -649,9 +2733,16 @@ def _get_latest_or_given_table_version(
 ) -> TableVersion:
     table_version_obj = None
     if table_version is None:
-        table_version_obj = _get_storage(**kwargs).
-            namespace=namespace,
+        table_version_obj = _get_storage(**kwargs).get_latest_active_table_version(
+            namespace=namespace,
+            table_name=table_name,
+            *args,
+            **kwargs,
         )
+        if table_version_obj is None:
+            raise TableVersionNotFoundError(
+                f"No active table version found for table {namespace}.{table_name}"
+            )
         table_version = table_version_obj.table_version
     else:
         table_version_obj = _get_storage(**kwargs).get_table_version(
@@ -665,18 +2756,82 @@ def _get_latest_or_given_table_version(
     return table_version_obj


+def _get_all_committed_partitions(
+    table: str,
+    namespace: str,
+    table_version: str,
+    **kwargs,
+) -> List[Union[Partition, PartitionLocator]]:
+    """Get all committed partitions for a table and validate uniqueness."""
+    logger.info(
+        f"Reading all partitions metadata in the table={table} "
+        "as partition_filter was None."
+    )
+
+    all_partitions = (
+        _get_storage(**kwargs)
+        .list_partitions(
+            table_name=table,
+            namespace=namespace,
+            table_version=table_version,
+            **kwargs,
+        )
+        .all_items()
+    )
+
+    committed_partitions = [
+        partition
+        for partition in all_partitions
+        if partition.state == CommitState.COMMITTED
+    ]
+
+    logger.info(
+        f"Found {len(committed_partitions)} committed partitions for "
+        f"table={namespace}/{table}/{table_version}"
+    )
+
+    _validate_partition_uniqueness(
+        committed_partitions, namespace, table, table_version
+    )
+    return committed_partitions
+
+
+def _validate_partition_uniqueness(
+    partitions: List[Partition], namespace: str, table: str, table_version: str
+) -> None:
+    """Validate that there are no duplicate committed partitions for the same partition values."""
+    commit_count_per_partition_value = defaultdict(int)
+    for partition in partitions:
+        # Normalize partition values: both None and [] represent unpartitioned data
+        normalized_values = (
+            None
+            if (
+                partition.partition_values is None
+                or (
+                    isinstance(partition.partition_values, list)
+                    and len(partition.partition_values) == 0
+                )
+            )
+            else partition.partition_values
+        )
+        commit_count_per_partition_value[normalized_values] += 1
+
+    # Check for multiple committed partitions for the same partition values
+    for partition_values, commit_count in commit_count_per_partition_value.items():
+        if commit_count > 1:
+            raise RuntimeError(
+                f"Multiple committed partitions found for table={namespace}/{table}/{table_version}. "
+                f"Partition values: {partition_values}. Commit count: {commit_count}. "
+                f"This should not happen."
+            )
+
+
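Editor's note: the helper above treats `None` and an empty list of partition values as the same (unpartitioned) key, then fails if any key has more than one committed partition. The sketch below re-implements that idea in a self-contained way; the `Partition` dataclass is a toy, and tuples are used here simply to keep the counter key hashable.

```python
# Illustrative re-implementation of the uniqueness check; Partition is a toy type.
from collections import defaultdict
from dataclasses import dataclass
from typing import List, Optional, Tuple


@dataclass
class Partition:
    partition_values: Optional[List[str]]
    state: str  # "committed" or "staged"


def validate_unique_committed(partitions: List[Partition]) -> None:
    counts = defaultdict(int)
    for p in partitions:
        if p.state != "committed":
            continue
        # None and [] both mean "unpartitioned"; tuples keep the key hashable.
        key: Optional[Tuple[str, ...]] = (
            None if not p.partition_values else tuple(p.partition_values)
        )
        counts[key] += 1
    for key, n in counts.items():
        if n > 1:
            raise RuntimeError(f"{n} committed partitions share values {key}")


validate_unique_committed(
    [Partition(None, "committed"), Partition([], "staged"), Partition(["2024"], "committed")]
)  # passes: only one committed partition per normalized key
```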
 def _get_deltas_from_partition_filter(
     partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
-    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
     *args,
     **kwargs,
 ):
-
     result_deltas = []
-    start_stream_position, end_stream_position = stream_position_range_inclusive or (
-        None,
-        None,
-    )
     for partition_like in partition_filter:
         deltas = (
             _get_storage(**kwargs)
@@ -684,32 +2839,39 @@ def _get_deltas_from_partition_filter(
                 partition_like=partition_like,
                 ascending_order=True,
                 include_manifest=True,
-                start_stream_position=start_stream_position,
-                last_stream_position=end_stream_position,
                 *args,
                 **kwargs,
             )
             .all_items()
         )

-
-
-
-
-
-
-
-
-
-
-
-
+        # Validate that all qualified deltas are append type - merge-on-read not yet implemented
+        # TODO(pdames): Run compaction minus materialize for MoR of each partition.
+        if deltas:
+            non_append_deltas = []
+            for delta in deltas:
+                if delta.type != DeltaType.APPEND:
+                    non_append_deltas.append(delta)
+                else:
+                    result_deltas.append(delta)
+            if non_append_deltas:
+                delta_types = {delta.type for delta in non_append_deltas}
+                delta_info = [
+                    (str(delta.locator), delta.type) for delta in non_append_deltas[:5]
+                ]  # Show first 5
+                raise NotImplementedError(
+                    f"Merge-on-read is not yet implemented. Found {len(non_append_deltas)} non-append deltas "
+                    f"with types {delta_types}. All deltas must be APPEND type for read operations. "
+                    f"Examples: {delta_info}. Please run compaction first to merge non-append deltas."
+                )
+
+        logger.info(f"Validated {len(deltas)} qualified deltas are all APPEND type")
     return result_deltas


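Editor's note: reads in this module now accept only APPEND deltas and raise `NotImplementedError` for anything that would require merge-on-read. A minimal sketch of that gate follows; the `DeltaType` enum and `Delta` record below are stand-ins for illustration, not the DeltaCAT types.

```python
# Toy version of the append-only gate added above; DeltaType/Delta are stand-ins.
from dataclasses import dataclass
from enum import Enum
from typing import List


class DeltaType(Enum):
    APPEND = "append"
    UPSERT = "upsert"
    DELETE = "delete"


@dataclass
class Delta:
    locator: str
    type: DeltaType


def qualified_deltas(deltas: List[Delta]) -> List[Delta]:
    non_append = [d for d in deltas if d.type != DeltaType.APPEND]
    if non_append:
        kinds = {d.type.value for d in non_append}
        raise NotImplementedError(
            f"Merge-on-read not implemented; found {len(non_append)} non-append deltas ({kinds}). "
            "Run compaction first."
        )
    return deltas


print(len(qualified_deltas([Delta("d1", DeltaType.APPEND), Delta("d2", DeltaType.APPEND)])))  # 2
```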
 def _get_storage(**kwargs):
     """
-    Returns the implementation of `deltacat.storage.interface` to use with this catalog
+    Returns the implementation of `deltacat.storage.interface` to use with this catalog

     This is configured in the `CatalogProperties` stored during initialization and passed through `delegate.py`
     """
@@ -717,4 +2879,4 @@ def _get_storage(**kwargs):
     if properties is not None and properties.storage is not None:
         return properties.storage
     else:
-        return
+        return dc.storage.metastore