deltacat 2.0.0b10__py3-none-any.whl → 2.0.0b12__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
- deltacat/__init__.py +96 -17
- deltacat/api.py +122 -67
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +0 -18
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2435 -279
- deltacat/catalog/model/catalog.py +154 -77
- deltacat/catalog/model/properties.py +63 -22
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/converter_session.py +145 -32
- deltacat/compute/converter/model/convert_input.py +26 -19
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +35 -16
- deltacat/compute/converter/model/converter_session_params.py +24 -21
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +18 -9
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +157 -50
- deltacat/compute/converter/steps/dedupe.py +24 -11
- deltacat/compute/converter/utils/convert_task_options.py +27 -12
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +8 -8
- deltacat/compute/converter/utils/io.py +101 -12
- deltacat/compute/converter/utils/s3u.py +33 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +25 -12
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +45 -2
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +1 -3
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/indexer/indexer.py +2 -2
- deltacat/examples/indexer/job_runner.py +1 -2
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +29 -11
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/datasource/deltacat_datasource.py +0 -1
- deltacat/io/reader/deltacat_read_api.py +1 -1
- deltacat/storage/__init__.py +20 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +182 -6
- deltacat/storage/model/metafile.py +151 -78
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +48 -26
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +675 -490
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -3
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +5 -3
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +988 -4
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +224 -14
- deltacat/types/tables.py +2329 -59
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +823 -36
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +100 -0
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +658 -27
- deltacat/utils/pyarrow.py +1258 -213
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +57 -16
- deltacat-2.0.0b12.dist-info/METADATA +1163 -0
- deltacat-2.0.0b12.dist-info/RECORD +439 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/daft/daft_scan.py +0 -115
- deltacat/daft/model.py +0 -258
- deltacat/daft/translator.py +0 -126
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1236
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.0b10.dist-info/METADATA +0 -68
- deltacat-2.0.0b10.dist-info/RECORD +0 -381
- /deltacat/{compute/merge_on_read/model → docs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs/autogen}/__init__.py +0 -0
- /deltacat/{daft → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/examples/{iceberg → compactor}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/compatibility}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/converter_agent}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/converter_agent/beam}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/storage/rivulet/fs/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.0b10.dist-info → deltacat-2.0.0b12.dist-info}/top_level.txt +0 -0
deltacat/catalog/main/impl.py
CHANGED
@@ -1,39 +1,79 @@
|
|
1
|
-
from typing import Any, Dict, List, Optional, Union, Tuple
|
1
|
+
from typing import Any, Dict, List, Optional, Union, Tuple, Set
|
2
2
|
import logging
|
3
|
+
from collections import defaultdict
|
3
4
|
|
5
|
+
import numpy as np
|
6
|
+
import pyarrow as pa
|
7
|
+
import pandas as pd
|
8
|
+
import daft
|
4
9
|
import deltacat as dc
|
5
10
|
|
6
|
-
from deltacat.
|
11
|
+
from deltacat.storage.model.manifest import ManifestAuthor
|
12
|
+
from deltacat.catalog.model.properties import CatalogProperties
|
7
13
|
from deltacat.exceptions import (
|
8
14
|
NamespaceAlreadyExistsError,
|
9
|
-
StreamNotFoundError,
|
10
15
|
TableAlreadyExistsError,
|
11
16
|
TableVersionNotFoundError,
|
17
|
+
TableNotFoundError,
|
18
|
+
TableVersionAlreadyExistsError,
|
19
|
+
TableValidationError,
|
20
|
+
SchemaValidationError,
|
12
21
|
)
|
13
22
|
from deltacat.catalog.model.table_definition import TableDefinition
|
14
23
|
from deltacat.storage.model.sort_key import SortScheme
|
15
24
|
from deltacat.storage.model.list_result import ListResult
|
16
25
|
from deltacat.storage.model.namespace import Namespace, NamespaceProperties
|
17
|
-
from deltacat.storage.model.schema import
|
26
|
+
from deltacat.storage.model.schema import (
|
27
|
+
Schema,
|
28
|
+
SchemaUpdate,
|
29
|
+
)
|
18
30
|
from deltacat.storage.model.table import TableProperties, Table
|
19
31
|
from deltacat.storage.model.types import (
|
20
|
-
|
32
|
+
Dataset,
|
21
33
|
LifecycleState,
|
22
|
-
LocalDataset,
|
23
|
-
LocalTable,
|
24
34
|
StreamFormat,
|
35
|
+
SchemaConsistencyType,
|
25
36
|
)
|
26
37
|
from deltacat.storage.model.partition import (
|
27
38
|
Partition,
|
28
39
|
PartitionLocator,
|
29
40
|
PartitionScheme,
|
30
41
|
)
|
31
|
-
from deltacat.storage.model.table_version import
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
from deltacat.types
|
36
|
-
from deltacat.
|
42
|
+
from deltacat.storage.model.table_version import (
|
43
|
+
TableVersion,
|
44
|
+
TableVersionProperties,
|
45
|
+
)
|
46
|
+
from deltacat.storage.model.types import DeltaType
|
47
|
+
from deltacat.storage import Delta
|
48
|
+
from deltacat.storage.model.types import CommitState
|
49
|
+
from deltacat.storage.model.transaction import (
|
50
|
+
Transaction,
|
51
|
+
setup_transaction,
|
52
|
+
)
|
53
|
+
from deltacat.types.media import (
|
54
|
+
ContentType,
|
55
|
+
DatasetType,
|
56
|
+
StorageType,
|
57
|
+
SCHEMA_CONTENT_TYPES,
|
58
|
+
)
|
59
|
+
from deltacat.types.tables import (
|
60
|
+
SchemaEvolutionMode,
|
61
|
+
TableProperty,
|
62
|
+
TablePropertyDefaultValues,
|
63
|
+
TableReadOptimizationLevel,
|
64
|
+
TableWriteMode,
|
65
|
+
get_dataset_type,
|
66
|
+
get_table_schema,
|
67
|
+
get_table_column_names,
|
68
|
+
from_pyarrow,
|
69
|
+
concat_tables,
|
70
|
+
empty_table,
|
71
|
+
infer_table_schema,
|
72
|
+
to_pandas,
|
73
|
+
)
|
74
|
+
from deltacat.utils import pyarrow as pa_utils
|
75
|
+
from deltacat.utils.reader_compatibility_mapping import get_compatible_readers
|
76
|
+
from deltacat.utils.pyarrow import get_base_arrow_type_name
|
37
77
|
from deltacat import logs
|
38
78
|
from deltacat.constants import DEFAULT_NAMESPACE
|
39
79
|
|
@@ -42,20 +82,30 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
42
82
|
"""
|
43
83
|
Default Catalog interface implementation using DeltaCAT native storage.
|
44
84
|
|
45
|
-
|
46
|
-
|
85
|
+
The functions here should not be invoked directly, but should instead be
|
86
|
+
invoked through `delegate.py` (e.g., to support passing catalog's by name, and
|
87
|
+
to ensure that each initialized `Catalog` implementation has its `inner`
|
88
|
+
property set to the `CatalogProperties` returned from `initialize()`).
|
47
89
|
|
48
|
-
`CatalogProperties`
|
49
|
-
|
90
|
+
The `CatalogProperties` instance returned by `initialize()` contains all
|
91
|
+
durable state required to deterministically reconstruct the associated DeltaCAT
|
92
|
+
native `Catalog` implementation (e.g., the root URI for the catalog metastore).
|
50
93
|
"""
|
51
94
|
|
52
95
|
|
53
96
|
# catalog functions
|
54
|
-
def initialize(
|
97
|
+
def initialize(
|
98
|
+
config: Optional[CatalogProperties] = None,
|
99
|
+
*args,
|
100
|
+
**kwargs,
|
101
|
+
) -> CatalogProperties:
|
55
102
|
"""
|
56
|
-
|
103
|
+
Performs any required one-time initialization and validation of this
|
104
|
+
catalog implementation based on the input configuration. If no config
|
105
|
+
instance is given, a new `CatalogProperties` instance is constructed
|
106
|
+
using the given keyword arguments.
|
57
107
|
|
58
|
-
|
108
|
+
Returns the input config if given, and the newly created config otherwise.
|
59
109
|
"""
|
60
110
|
if config is not None:
|
61
111
|
if not isinstance(config, CatalogProperties):
|
@@ -68,13 +118,146 @@ def initialize(config: CatalogProperties = None, *args, **kwargs) -> CatalogProp
|
|
68
118
|
|
69
119
|
|
70
120
|
# table functions
|
121
|
+
def _validate_write_mode_and_table_existence(
|
122
|
+
table: str,
|
123
|
+
namespace: str,
|
124
|
+
mode: TableWriteMode,
|
125
|
+
**kwargs,
|
126
|
+
) -> bool:
|
127
|
+
"""Validate write mode against table existence and return whether table exists."""
|
128
|
+
table_exists_flag = table_exists(
|
129
|
+
table,
|
130
|
+
namespace=namespace,
|
131
|
+
**kwargs,
|
132
|
+
)
|
133
|
+
logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
|
134
|
+
|
135
|
+
if mode == TableWriteMode.CREATE and table_exists_flag:
|
136
|
+
raise ValueError(
|
137
|
+
f"Table {namespace}.{table} already exists and mode is CREATE."
|
138
|
+
)
|
139
|
+
elif (
|
140
|
+
mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
|
141
|
+
and not table_exists_flag
|
142
|
+
):
|
143
|
+
raise TableNotFoundError(
|
144
|
+
f"Table {namespace}.{table} does not exist and mode is {mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()}. Use CREATE or AUTO mode to create a new table."
|
145
|
+
)
|
146
|
+
|
147
|
+
return table_exists_flag
|
148
|
+
|
149
|
+
|
150
|
+
def _get_table_and_validate_write_mode(
|
151
|
+
table: str,
|
152
|
+
namespace: str,
|
153
|
+
table_version: Optional[str],
|
154
|
+
mode: TableWriteMode,
|
155
|
+
**kwargs,
|
156
|
+
) -> Tuple[bool, TableDefinition]:
|
157
|
+
"""Validate write mode against table and table version existence.
|
158
|
+
|
159
|
+
Returns:
|
160
|
+
Tuple of (table_exists_flag, table_definition)
|
161
|
+
"""
|
162
|
+
# First validate table, table version, and stream existence
|
163
|
+
existing_table_def = get_table(
|
164
|
+
table,
|
165
|
+
namespace=namespace,
|
166
|
+
table_version=table_version,
|
167
|
+
**kwargs,
|
168
|
+
)
|
169
|
+
table_exists_flag = (
|
170
|
+
existing_table_def is not None
|
171
|
+
and existing_table_def.table_version
|
172
|
+
and existing_table_def.stream
|
173
|
+
)
|
174
|
+
logger.info(f"Table to write to ({namespace}.{table}) exists: {table_exists_flag}")
|
175
|
+
|
176
|
+
# Then validate table existence constraints
|
177
|
+
if mode == TableWriteMode.CREATE and table_exists_flag and table_version is None:
|
178
|
+
raise TableAlreadyExistsError(
|
179
|
+
f"Table {namespace}.{table} already exists and mode is CREATE."
|
180
|
+
)
|
181
|
+
elif (
|
182
|
+
mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
|
183
|
+
and existing_table_def is None
|
184
|
+
):
|
185
|
+
raise TableNotFoundError(
|
186
|
+
f"Table {namespace}.{table} does not exist and write mode is {mode}. Use CREATE or AUTO mode to create a new table."
|
187
|
+
)
|
188
|
+
|
189
|
+
# Then validate table version existence constraints
|
190
|
+
if table_version is not None and table_exists_flag:
|
191
|
+
if mode == TableWriteMode.CREATE:
|
192
|
+
raise TableVersionAlreadyExistsError(
|
193
|
+
f"Table version {namespace}.{table}.{table_version} already exists and mode is CREATE."
|
194
|
+
)
|
195
|
+
logger.info(f"Table version ({namespace}.{table}.{table_version}) exists.")
|
196
|
+
elif (
|
197
|
+
mode not in (TableWriteMode.CREATE, TableWriteMode.AUTO)
|
198
|
+
and table_version is not None
|
199
|
+
and not table_exists_flag
|
200
|
+
):
|
201
|
+
raise TableVersionNotFoundError(
|
202
|
+
f"Table version {namespace}.{table}.{table_version} does not exist and write mode is {mode}. "
|
203
|
+
f"Use CREATE or AUTO mode to create a new table version, or omit table_version "
|
204
|
+
f"to use the latest version."
|
205
|
+
)
|
206
|
+
return table_exists_flag, existing_table_def
|
207
|
+
|
208
|
+
|
209
|
+
def _validate_content_type_against_supported_content_types(
|
210
|
+
namespace: str,
|
211
|
+
table: str,
|
212
|
+
content_type: ContentType,
|
213
|
+
supported_content_types: Optional[List[ContentType]],
|
214
|
+
) -> None:
|
215
|
+
if supported_content_types and content_type not in supported_content_types:
|
216
|
+
raise ValueError(
|
217
|
+
f"Content type proposed for write to table {namespace}.{table} ({content_type}) "
|
218
|
+
f"conflicts with the proposed list of new supported content types: {supported_content_types}"
|
219
|
+
)
|
220
|
+
|
221
|
+
|
222
|
+
def _create_table_for_write(
|
223
|
+
data: Dataset,
|
224
|
+
table: str,
|
225
|
+
namespace: str,
|
226
|
+
table_version: Optional[str],
|
227
|
+
content_type: ContentType,
|
228
|
+
existing_table_definition: Optional[TableDefinition],
|
229
|
+
*args,
|
230
|
+
**kwargs,
|
231
|
+
) -> TableDefinition:
|
232
|
+
"""Creates a new table, table version, and/or stream in preparation for a write operation."""
|
233
|
+
if "schema" not in kwargs:
|
234
|
+
kwargs["schema"] = infer_table_schema(data)
|
235
|
+
|
236
|
+
_validate_content_type_against_supported_content_types(
|
237
|
+
namespace,
|
238
|
+
table,
|
239
|
+
content_type,
|
240
|
+
kwargs.get("content_types"),
|
241
|
+
)
|
242
|
+
return create_table(
|
243
|
+
table,
|
244
|
+
namespace=namespace,
|
245
|
+
table_version=table_version,
|
246
|
+
existing_table_definition=existing_table_definition,
|
247
|
+
*args,
|
248
|
+
**kwargs,
|
249
|
+
)
|
250
|
+
|
251
|
+
|
71
252
|
def write_to_table(
|
72
|
-
data:
|
253
|
+
data: Dataset,
|
73
254
|
table: str,
|
74
255
|
*args,
|
75
256
|
namespace: Optional[str] = None,
|
257
|
+
table_version: Optional[str] = None,
|
76
258
|
mode: TableWriteMode = TableWriteMode.AUTO,
|
77
259
|
content_type: ContentType = ContentType.PARQUET,
|
260
|
+
transaction: Optional[Transaction] = None,
|
78
261
|
**kwargs,
|
79
262
|
) -> None:
|
80
263
|
"""Write local or distributed data to a table. Raises an error if the
|
@@ -83,79 +266,1137 @@ def write_to_table(
|
|
83
266
|
When creating a table, all `create_table` parameters may be optionally
|
84
267
|
specified as additional keyword arguments. When appending to, or replacing,
|
85
268
|
an existing table, all `alter_table` parameters may be optionally specified
|
86
|
-
as additional keyword arguments.
|
87
|
-
raise NotImplementedError("write_to_table not implemented")
|
269
|
+
as additional keyword arguments.
|
88
270
|
|
271
|
+
Args:
|
272
|
+
data: Local or distributed data to write to the table.
|
273
|
+
table: Name of the table to write to.
|
274
|
+
namespace: Optional namespace for the table. Uses default if not specified.
|
275
|
+
table_version: Optional version of the table to write to. If specified,
|
276
|
+
will create this version if it doesn't exist (in CREATE mode) or
|
277
|
+
get this version if it exists (in other modes). If not specified,
|
278
|
+
uses the latest version.
|
279
|
+
mode: Write mode (AUTO, CREATE, APPEND, REPLACE, MERGE, DELETE).
|
280
|
+
content_type: Content type used to write the data files. Defaults to PARQUET.
|
281
|
+
transaction: Optional transaction to append write operations to instead of
|
282
|
+
creating and committing a new transaction.
|
283
|
+
**kwargs: Additional keyword arguments.
|
284
|
+
"""
|
285
|
+
namespace = namespace or default_namespace()
|
89
286
|
|
90
|
-
|
287
|
+
# Set up transaction handling
|
288
|
+
write_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
|
289
|
+
kwargs["transaction"] = write_transaction
|
290
|
+
|
291
|
+
try:
|
292
|
+
# Validate write mode and table/table version/stream existence
|
293
|
+
(table_exists_flag, table_definition,) = _get_table_and_validate_write_mode(
|
294
|
+
table,
|
295
|
+
namespace,
|
296
|
+
table_version,
|
297
|
+
mode,
|
298
|
+
**kwargs,
|
299
|
+
)
|
300
|
+
|
301
|
+
# Get or create table, table version, and/or stream
|
302
|
+
if not table_exists_flag:
|
303
|
+
table_definition = _create_table_for_write(
|
304
|
+
data,
|
305
|
+
table,
|
306
|
+
namespace,
|
307
|
+
table_version,
|
308
|
+
content_type,
|
309
|
+
table_definition,
|
310
|
+
*args,
|
311
|
+
**kwargs,
|
312
|
+
)
|
313
|
+
else:
|
314
|
+
# call alter_table if there are any alter_table kwargs provided
|
315
|
+
if (
|
316
|
+
"lifecycle_state" in kwargs
|
317
|
+
or "schema_updates" in kwargs
|
318
|
+
or "partition_updates" in kwargs
|
319
|
+
or "sort_scheme" in kwargs
|
320
|
+
or "table_description" in kwargs
|
321
|
+
or "table_version_description" in kwargs
|
322
|
+
or "table_properties" in kwargs
|
323
|
+
or "table_version_properties" in kwargs
|
324
|
+
):
|
325
|
+
alter_table(
|
326
|
+
table,
|
327
|
+
namespace=namespace,
|
328
|
+
table_version=table_version,
|
329
|
+
*args,
|
330
|
+
**kwargs,
|
331
|
+
)
|
332
|
+
|
333
|
+
# Get the active table version and stream
|
334
|
+
table_version_obj = _get_latest_active_or_given_table_version(
|
335
|
+
namespace=table_definition.table.namespace,
|
336
|
+
table_name=table_definition.table.table_name,
|
337
|
+
table_version=table_version or table_definition.table_version.table_version,
|
338
|
+
**kwargs,
|
339
|
+
)
|
340
|
+
|
341
|
+
# Validate schema compatibility for schemaless content types with schema tables
|
342
|
+
if (
|
343
|
+
content_type.value not in SCHEMA_CONTENT_TYPES
|
344
|
+
and table_version_obj.schema is not None
|
345
|
+
):
|
346
|
+
schemaless_types = {
|
347
|
+
ct for ct in ContentType if ct.value not in SCHEMA_CONTENT_TYPES
|
348
|
+
}
|
349
|
+
raise TableValidationError(
|
350
|
+
f"Content type '{content_type.value}' cannot be written to a table with a schema. "
|
351
|
+
f"Table '{namespace}.{table}' has a schema, but content type '{content_type.value}' "
|
352
|
+
f"is schemaless. Schemaless content types ({', '.join(sorted([ct.value for ct in schemaless_types]))}) "
|
353
|
+
f"can only be written to schemaless tables."
|
354
|
+
)
|
355
|
+
|
356
|
+
# Handle different write modes and get stream and delta type
|
357
|
+
stream, delta_type = _handle_write_mode(
|
358
|
+
mode,
|
359
|
+
table_definition,
|
360
|
+
table_version_obj,
|
361
|
+
namespace,
|
362
|
+
table,
|
363
|
+
**kwargs,
|
364
|
+
)
|
365
|
+
|
366
|
+
if not stream:
|
367
|
+
raise ValueError(f"No default stream found for table {namespace}.{table}")
|
368
|
+
|
369
|
+
# Automatically set entry_params for DELETE/MERGE modes if not provided
|
370
|
+
_set_entry_params_if_needed(
|
371
|
+
mode,
|
372
|
+
table_version_obj,
|
373
|
+
kwargs,
|
374
|
+
)
|
375
|
+
|
376
|
+
# Validate table configuration
|
377
|
+
_validate_table_configuration(
|
378
|
+
stream,
|
379
|
+
table_version_obj,
|
380
|
+
namespace,
|
381
|
+
table,
|
382
|
+
)
|
383
|
+
|
384
|
+
# Handle partition creation/retrieval
|
385
|
+
partition, commit_staged_partition = _handle_partition_creation(
|
386
|
+
mode,
|
387
|
+
table_exists_flag,
|
388
|
+
delta_type,
|
389
|
+
stream,
|
390
|
+
**kwargs,
|
391
|
+
)
|
392
|
+
|
393
|
+
# Get table properties for schema evolution
|
394
|
+
schema_evolution_mode = table_version_obj.read_table_property(
|
395
|
+
TableProperty.SCHEMA_EVOLUTION_MODE
|
396
|
+
)
|
397
|
+
default_schema_consistency_type = table_version_obj.read_table_property(
|
398
|
+
TableProperty.DEFAULT_SCHEMA_CONSISTENCY_TYPE
|
399
|
+
)
|
400
|
+
|
401
|
+
# Convert unsupported dataset types and NumPy arrays that need schema validation
|
402
|
+
if isinstance(data, np.ndarray) and table_version_obj.schema is not None:
|
403
|
+
# NumPy arrays need conversion to Pandas for proper column naming in schema validation
|
404
|
+
converted_data = _convert_numpy_for_schema_validation(
|
405
|
+
data, table_version_obj.schema
|
406
|
+
)
|
407
|
+
else:
|
408
|
+
# Convert other unsupported dataset types (e.g., Daft) or keep NumPy as-is for schemaless tables
|
409
|
+
converted_data = _convert_data_if_needed(data)
|
410
|
+
|
411
|
+
# Capture original field set before schema coercion for partial UPSERT support
|
412
|
+
original_fields = set(get_table_column_names(converted_data))
|
413
|
+
|
414
|
+
# Validate and coerce data against schema
|
415
|
+
# This ensures proper schema evolution and type handling
|
416
|
+
(
|
417
|
+
validated_data,
|
418
|
+
schema_modified,
|
419
|
+
updated_schema,
|
420
|
+
) = _validate_and_coerce_data_against_schema(
|
421
|
+
converted_data, # Use converted data for NumPy, original for others
|
422
|
+
table_version_obj.schema,
|
423
|
+
schema_evolution_mode=schema_evolution_mode,
|
424
|
+
default_schema_consistency_type=default_schema_consistency_type,
|
425
|
+
)
|
426
|
+
|
427
|
+
# Convert validated data to supported format for storage if needed
|
428
|
+
converted_data = _convert_data_if_needed(validated_data)
|
429
|
+
|
430
|
+
# Validate reader compatibility against supported reader types
|
431
|
+
supported_reader_types = table_version_obj.read_table_property(
|
432
|
+
TableProperty.SUPPORTED_READER_TYPES
|
433
|
+
)
|
434
|
+
_validate_reader_compatibility(
|
435
|
+
converted_data,
|
436
|
+
content_type,
|
437
|
+
supported_reader_types,
|
438
|
+
)
|
439
|
+
|
440
|
+
# Update table version if schema was modified during evolution
|
441
|
+
if schema_modified:
|
442
|
+
# Extract catalog properties and filter kwargs
|
443
|
+
catalog_kwargs = {
|
444
|
+
"catalog": kwargs.get("catalog"),
|
445
|
+
"inner": kwargs.get("inner"),
|
446
|
+
"transaction": write_transaction, # Pass transaction to update_table_version
|
447
|
+
}
|
448
|
+
|
449
|
+
_get_storage(**catalog_kwargs).update_table_version(
|
450
|
+
namespace=namespace,
|
451
|
+
table_name=table,
|
452
|
+
table_version=table_version_obj.table_version,
|
453
|
+
schema=updated_schema,
|
454
|
+
**catalog_kwargs,
|
455
|
+
)
|
456
|
+
|
457
|
+
# Stage and commit delta, handle compaction
|
458
|
+
# Remove schema from kwargs to avoid duplicate parameter conflict
|
459
|
+
filtered_kwargs = {k: v for k, v in kwargs.items() if k != "schema"}
|
460
|
+
# Use updated schema if schema evolution occurred, otherwise use original schema
|
461
|
+
_stage_commit_and_compact(
|
462
|
+
converted_data,
|
463
|
+
partition,
|
464
|
+
delta_type,
|
465
|
+
content_type,
|
466
|
+
commit_staged_partition,
|
467
|
+
table_version_obj,
|
468
|
+
namespace,
|
469
|
+
table,
|
470
|
+
schema=updated_schema if schema_modified else table_version_obj.schema,
|
471
|
+
original_fields=original_fields,
|
472
|
+
**filtered_kwargs,
|
473
|
+
)
|
474
|
+
except Exception as e:
|
475
|
+
# If any error occurs, the transaction remains uncommitted
|
476
|
+
commit_transaction = False
|
477
|
+
logger.error(f"Error during write_to_table: {e}")
|
478
|
+
raise
|
479
|
+
finally:
|
480
|
+
if commit_transaction:
|
481
|
+
# Seal the interactive transaction to commit all operations atomically
|
482
|
+
write_transaction.seal()
|
483
|
+
|
484
|
+
|
485
|
+
def _handle_write_mode(
|
486
|
+
mode: TableWriteMode,
|
487
|
+
table_definition: TableDefinition,
|
488
|
+
table_version_obj: TableVersion,
|
489
|
+
namespace: str,
|
91
490
|
table: str,
|
92
|
-
*args,
|
93
|
-
namespace: Optional[str] = None,
|
94
|
-
table_version: Optional[str] = None,
|
95
|
-
table_type: Optional[TableType] = TableType.PYARROW,
|
96
|
-
distributed_dataset_type: Optional[
|
97
|
-
DistributedDatasetType
|
98
|
-
] = DistributedDatasetType.RAY_DATASET,
|
99
|
-
partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
|
100
|
-
stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
|
101
|
-
merge_on_read: Optional[bool] = False,
|
102
|
-
reader_kwargs: Optional[Dict[Any, Any]] = None,
|
103
491
|
**kwargs,
|
104
|
-
) ->
|
105
|
-
"""
|
492
|
+
) -> Tuple[Any, DeltaType]: # Using Any for stream type to avoid complex imports
|
493
|
+
"""Handle different write modes and return appropriate stream and delta type."""
|
494
|
+
table_schema = table_definition.table_version.schema
|
495
|
+
|
496
|
+
if mode == TableWriteMode.REPLACE:
|
497
|
+
return _handle_replace_mode(
|
498
|
+
table_schema,
|
499
|
+
namespace,
|
500
|
+
table,
|
501
|
+
table_version_obj,
|
502
|
+
**kwargs,
|
503
|
+
)
|
504
|
+
elif mode == TableWriteMode.APPEND:
|
505
|
+
return _handle_append_mode(
|
506
|
+
table_schema,
|
507
|
+
namespace,
|
508
|
+
table,
|
509
|
+
table_version_obj,
|
510
|
+
**kwargs,
|
511
|
+
)
|
512
|
+
elif mode in (TableWriteMode.MERGE, TableWriteMode.DELETE):
|
513
|
+
return _handle_merge_delete_mode(
|
514
|
+
mode,
|
515
|
+
table_schema,
|
516
|
+
namespace,
|
517
|
+
table,
|
518
|
+
table_version_obj,
|
519
|
+
**kwargs,
|
520
|
+
)
|
521
|
+
else:
|
522
|
+
# AUTO and CREATE modes
|
523
|
+
return _handle_auto_create_mode(
|
524
|
+
table_schema,
|
525
|
+
namespace,
|
526
|
+
table,
|
527
|
+
table_version_obj,
|
528
|
+
**kwargs,
|
529
|
+
)
|
106
530
|
|
107
|
-
if reader_kwargs is None:
|
108
|
-
reader_kwargs = {}
|
109
531
|
|
110
|
-
|
532
|
+
def _handle_replace_mode(
|
533
|
+
table_schema,
|
534
|
+
namespace: str,
|
535
|
+
table: str,
|
536
|
+
table_version_obj: TableVersion,
|
537
|
+
**kwargs,
|
538
|
+
) -> Tuple[Any, DeltaType]:
|
539
|
+
"""Handle REPLACE mode by staging and committing a new stream."""
|
540
|
+
stream = _get_storage(**kwargs).stage_stream(
|
111
541
|
namespace=namespace,
|
112
|
-
|
113
|
-
|
114
|
-
|
542
|
+
table_name=table,
|
543
|
+
table_version=table_version_obj.table_version,
|
544
|
+
**kwargs,
|
545
|
+
)
|
546
|
+
|
547
|
+
stream = _get_storage(**kwargs).commit_stream(stream=stream, **kwargs)
|
548
|
+
delta_type = (
|
549
|
+
DeltaType.UPSERT
|
550
|
+
if table_schema and table_schema.merge_keys
|
551
|
+
else DeltaType.APPEND
|
552
|
+
)
|
553
|
+
return stream, delta_type
|
554
|
+
|
555
|
+
|
556
|
+
def _handle_append_mode(
|
557
|
+
table_schema,
|
558
|
+
namespace: str,
|
559
|
+
table: str,
|
560
|
+
table_version_obj: TableVersion,
|
561
|
+
**kwargs,
|
562
|
+
) -> Tuple[Any, DeltaType]:
|
563
|
+
"""Handle APPEND mode by validating no merge keys and getting existing stream."""
|
564
|
+
if table_schema and table_schema.merge_keys:
|
565
|
+
raise SchemaValidationError(
|
566
|
+
f"APPEND mode cannot be used with tables that have merge keys. "
|
567
|
+
f"Table {namespace}.{table} has merge keys: {table_schema.merge_keys}. "
|
568
|
+
f"Use MERGE mode instead."
|
569
|
+
)
|
570
|
+
|
571
|
+
stream = _get_table_stream(
|
572
|
+
namespace,
|
573
|
+
table,
|
574
|
+
table_version_obj.table_version,
|
575
|
+
**kwargs,
|
576
|
+
)
|
577
|
+
return stream, DeltaType.APPEND
|
578
|
+
|
579
|
+
|
580
|
+
def _handle_merge_delete_mode(
|
581
|
+
mode: TableWriteMode,
|
582
|
+
table_schema,
|
583
|
+
namespace: str,
|
584
|
+
table: str,
|
585
|
+
table_version_obj: TableVersion,
|
586
|
+
**kwargs,
|
587
|
+
) -> Tuple[Any, DeltaType]:
|
588
|
+
"""Handle MERGE/DELETE modes by validating merge keys and getting existing stream."""
|
589
|
+
if not table_schema or not table_schema.merge_keys:
|
590
|
+
raise TableValidationError(
|
591
|
+
f"{mode.value.upper() if hasattr(mode, 'value') else str(mode).upper()} mode requires tables to have at least one merge key. "
|
592
|
+
f"Table {namespace}.{table}.{table_version_obj.table_version} has no merge keys. "
|
593
|
+
f"Use APPEND, AUTO, or REPLACE mode instead."
|
594
|
+
)
|
595
|
+
|
596
|
+
stream = _get_table_stream(
|
597
|
+
namespace,
|
598
|
+
table,
|
599
|
+
table_version_obj.table_version,
|
115
600
|
**kwargs,
|
116
601
|
)
|
602
|
+
delta_type = DeltaType.UPSERT if mode == TableWriteMode.MERGE else DeltaType.DELETE
|
603
|
+
return stream, delta_type
|
604
|
+
|
605
|
+
|
606
|
+
def _handle_auto_create_mode(
|
607
|
+
table_schema,
|
608
|
+
namespace: str,
|
609
|
+
table: str,
|
610
|
+
table_version_obj: TableVersion,
|
611
|
+
**kwargs,
|
612
|
+
) -> Tuple[Any, DeltaType]:
|
613
|
+
"""Handle AUTO and CREATE modes by getting existing stream."""
|
614
|
+
stream = _get_table_stream(
|
615
|
+
namespace,
|
616
|
+
table,
|
617
|
+
table_version_obj.table_version,
|
618
|
+
**kwargs,
|
619
|
+
)
|
620
|
+
delta_type = (
|
621
|
+
DeltaType.UPSERT
|
622
|
+
if table_schema and table_schema.merge_keys
|
623
|
+
else DeltaType.APPEND
|
624
|
+
)
|
625
|
+
return stream, delta_type
|
626
|
+
|
627
|
+
|
628
|
+
def _validate_table_configuration(
|
629
|
+
stream,
|
630
|
+
table_version_obj: TableVersion,
|
631
|
+
namespace: str,
|
632
|
+
table: str,
|
633
|
+
) -> None:
|
634
|
+
"""Validate table configuration for unsupported features."""
|
635
|
+
# Check if table is partitioned
|
636
|
+
if (
|
637
|
+
stream.partition_scheme
|
638
|
+
and stream.partition_scheme.keys is not None
|
639
|
+
and len(stream.partition_scheme.keys) > 0
|
640
|
+
):
|
641
|
+
raise NotImplementedError(
|
642
|
+
f"write_to_table does not yet support partitioned tables. "
|
643
|
+
f"Table {namespace}.{table} has partition scheme with "
|
644
|
+
f"{len(stream.partition_scheme.keys)} partition key(s): "
|
645
|
+
f"{[key.name or key.key[0] for key in stream.partition_scheme.keys]}. "
|
646
|
+
f"Please use the lower-level metastore API for partitioned tables."
|
647
|
+
)
|
648
|
+
|
649
|
+
# Check if table has sort keys
|
650
|
+
if (
|
651
|
+
table_version_obj.sort_scheme
|
652
|
+
and table_version_obj.sort_scheme.keys is not None
|
653
|
+
and len(table_version_obj.sort_scheme.keys) > 0
|
654
|
+
):
|
655
|
+
raise NotImplementedError(
|
656
|
+
f"write_to_table does not yet support tables with sort keys. "
|
657
|
+
f"Table {namespace}.{table} has sort scheme with "
|
658
|
+
f"{len(table_version_obj.sort_scheme.keys)} sort key(s): "
|
659
|
+
f"{[key.key[0] for key in table_version_obj.sort_scheme.keys]}. "
|
660
|
+
f"Please use the lower-level metastore API for sorted tables."
|
661
|
+
)
|
662
|
+
|
663
|
+
|
664
|
+
def _handle_partition_creation(
|
665
|
+
mode: TableWriteMode,
|
666
|
+
table_exists_flag: bool,
|
667
|
+
delta_type: DeltaType,
|
668
|
+
stream,
|
669
|
+
**kwargs,
|
670
|
+
) -> Tuple[Any, bool]: # partition, commit_staged_partition
|
671
|
+
"""Handle partition creation/retrieval based on write mode."""
|
672
|
+
if mode == TableWriteMode.REPLACE or not table_exists_flag:
|
673
|
+
# REPLACE mode or new table: Stage a new partition
|
674
|
+
partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
|
675
|
+
# If we're doing UPSERT/DELETE operations, let compaction handle the commit
|
676
|
+
commit_staged_partition = delta_type not in (DeltaType.UPSERT, DeltaType.DELETE)
|
677
|
+
return partition, commit_staged_partition
|
678
|
+
elif delta_type in (DeltaType.UPSERT, DeltaType.DELETE):
|
679
|
+
# UPSERT/DELETE operations: Try to use existing committed partition first
|
680
|
+
partition = _get_storage(**kwargs).get_partition(
|
681
|
+
stream_locator=stream.locator,
|
682
|
+
partition_values=None,
|
683
|
+
**kwargs,
|
684
|
+
)
|
685
|
+
commit_staged_partition = False
|
686
|
+
|
687
|
+
if not partition:
|
688
|
+
# No existing committed partition found, stage a new one
|
689
|
+
partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
|
690
|
+
commit_staged_partition = False # Let compaction handle the commit
|
691
|
+
|
692
|
+
return partition, commit_staged_partition
|
693
|
+
else:
|
694
|
+
# APPEND mode on existing table: Get existing partition
|
695
|
+
partition = _get_storage(**kwargs).get_partition(
|
696
|
+
stream_locator=stream.locator,
|
697
|
+
partition_values=None,
|
698
|
+
**kwargs,
|
699
|
+
)
|
700
|
+
commit_staged_partition = False
|
701
|
+
|
702
|
+
if not partition:
|
703
|
+
# No existing partition found, create a new one
|
704
|
+
partition = _get_storage(**kwargs).stage_partition(stream=stream, **kwargs)
|
705
|
+
commit_staged_partition = True
|
706
|
+
|
707
|
+
return partition, commit_staged_partition
|
708
|
+
|
709
|
+
|
710
|
+
def _convert_numpy_for_schema_validation(
|
711
|
+
data: np.ndarray, schema: Optional[Schema]
|
712
|
+
) -> Dataset:
|
713
|
+
"""Convert NumPy array to Pandas DataFrame with proper column names for schema validation.
|
714
|
+
|
715
|
+
Args:
|
716
|
+
data: NumPy array to convert
|
717
|
+
schema: DeltaCAT Schema object for column naming
|
718
|
+
|
719
|
+
Returns:
|
720
|
+
Pandas DataFrame with proper column names matching schema
|
721
|
+
|
722
|
+
Raises:
|
723
|
+
ValueError: If array has more columns than schema or schema is invalid
|
724
|
+
"""
|
725
|
+
if not isinstance(schema, Schema) or not schema.arrow:
|
726
|
+
raise ValueError(
|
727
|
+
f"Expected DeltaCAT schema for Numpy schema validation, but found: {schema}"
|
728
|
+
)
|
729
|
+
|
730
|
+
# Use schema subset matching NumPy array dimensions
|
731
|
+
arrow_schema = schema.arrow
|
732
|
+
num_cols = data.shape[1] if data.ndim > 1 else 1
|
733
|
+
|
734
|
+
if len(arrow_schema) >= num_cols:
|
735
|
+
# Use the first N columns from the schema to match data dimensions
|
736
|
+
subset_fields = [arrow_schema.field(i) for i in range(num_cols)]
|
737
|
+
subset_schema = pa.schema(subset_fields)
|
738
|
+
return to_pandas(data, schema=subset_schema)
|
739
|
+
else:
|
740
|
+
raise ValueError(
|
741
|
+
f"NumPy array has {num_cols} columns but table schema only has {len(arrow_schema)} columns. "
|
742
|
+
f"Cannot write NumPy data with more columns than the table schema supports."
|
743
|
+
)
|
744
|
+
|
745
|
+
|
746
|
+
def _build_entry_index_to_schema_mapping(
|
747
|
+
qualified_deltas: List[Delta], table_version_obj, **kwargs
|
748
|
+
) -> List[Schema]:
|
749
|
+
"""Build a mapping from manifest entry index to schema for reading operations.
|
750
|
+
|
751
|
+
Args:
|
752
|
+
qualified_deltas: List of deltas to process
|
753
|
+
table_version_obj: Table version containing schemas
|
754
|
+
**kwargs: Additional arguments passed to storage operations
|
755
|
+
|
756
|
+
Returns:
|
757
|
+
List mapping each manifest entry index to its corresponding schema
|
758
|
+
|
759
|
+
Raises:
|
760
|
+
ValueError: If a manifest's schema ID is not found in table version schemas
|
761
|
+
"""
|
762
|
+
entry_index_to_schema = []
|
763
|
+
for delta in qualified_deltas:
|
764
|
+
if delta.manifest:
|
765
|
+
manifest = delta.manifest
|
766
|
+
else:
|
767
|
+
# Fetch manifest from storage
|
768
|
+
manifest = _get_storage(**kwargs).get_delta_manifest(
|
769
|
+
delta.locator,
|
770
|
+
**kwargs,
|
771
|
+
)
|
772
|
+
# Map manifest entry index to schema ID
|
773
|
+
schema_id = manifest.meta.schema_id
|
774
|
+
|
775
|
+
# Find the schema that matches this manifest's schema_id
|
776
|
+
matching_schema = None
|
777
|
+
if table_version_obj.schemas:
|
778
|
+
for schema in table_version_obj.schemas:
|
779
|
+
if schema.id == schema_id:
|
780
|
+
matching_schema = schema
|
781
|
+
break
|
782
|
+
|
783
|
+
if matching_schema is None:
|
784
|
+
available_schema_ids = (
|
785
|
+
[s.id for s in table_version_obj.schemas]
|
786
|
+
if table_version_obj.schemas
|
787
|
+
else []
|
788
|
+
)
|
789
|
+
raise ValueError(
|
790
|
+
f"Manifest schema ID {schema_id} not found in table version schemas. "
|
791
|
+
f"Available schema IDs: {available_schema_ids}. "
|
792
|
+
)
|
793
|
+
|
794
|
+
# Add the matching schema for each entry in this manifest
|
795
|
+
for _ in range(len(manifest.entries)):
|
796
|
+
entry_index_to_schema.append(matching_schema)
|
797
|
+
|
798
|
+
return entry_index_to_schema
|
799
|
+
|
800
|
+
|
801
|
+
def _convert_data_if_needed(data: Dataset) -> Dataset:
|
802
|
+
"""Convert unsupported data types to supported ones."""
|
803
|
+
if isinstance(data, daft.DataFrame):
|
804
|
+
# Daft DataFrame - convert based on execution mode
|
805
|
+
ctx = daft.context.get_context()
|
806
|
+
runner = ctx.get_or_create_runner()
|
807
|
+
runner_type = runner.name
|
808
|
+
|
809
|
+
if runner_type == "ray":
|
810
|
+
# Running with Ray backend - convert to Ray Dataset
|
811
|
+
return data.to_ray_dataset()
|
812
|
+
else:
|
813
|
+
# Running with local backend - convert to PyArrow Table
|
814
|
+
return data.to_arrow()
|
815
|
+
|
816
|
+
return data
|
817
|
+
|
818
|
+
|
819
|
+
def _validate_and_coerce_data_against_schema(
|
820
|
+
data: Dataset,
|
821
|
+
schema: Optional[Schema],
|
822
|
+
schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
|
823
|
+
default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
|
824
|
+
) -> Tuple[Dataset, bool, Optional[Schema]]:
|
825
|
+
"""Validate and coerce data against the table schema if schema consistency types are set.
|
826
|
+
|
827
|
+
Args:
|
828
|
+
data: The dataset to validate/coerce
|
829
|
+
schema: The DeltaCAT schema to validate against (optional)
|
830
|
+
schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
|
831
|
+
default_schema_consistency_type: Default consistency type for new fields in AUTO mode
|
832
|
+
|
833
|
+
Returns:
|
834
|
+
Tuple[Dataset, bool, Optional[Schema]]: Validated/coerced data, flag indicating if schema was modified, and updated schema
|
835
|
+
|
836
|
+
Raises:
|
837
|
+
ValueError: If validation fails or coercion is not possible
|
838
|
+
"""
|
839
|
+
if not schema:
|
840
|
+
return data, False, None
|
841
|
+
|
842
|
+
validated_data, updated_schema = schema.validate_and_coerce_dataset(
|
843
|
+
data,
|
844
|
+
schema_evolution_mode=schema_evolution_mode,
|
845
|
+
default_schema_consistency_type=default_schema_consistency_type,
|
846
|
+
)
|
847
|
+
|
848
|
+
# Check if schema was modified by comparing with original
|
849
|
+
schema_modified = not updated_schema.equivalent_to(schema, True)
|
850
|
+
# Return updated schema only if it was modified
|
851
|
+
updated_schema = updated_schema if schema_modified else None
|
852
|
+
|
853
|
+
return validated_data, schema_modified, updated_schema
|
854
|
+
|
117
855
|
|
118
|
-
|
856
|
+
def _validate_reader_compatibility(
|
857
|
+
data: Dataset,
|
858
|
+
content_type: ContentType,
|
859
|
+
supported_reader_types: Optional[List[DatasetType]],
|
860
|
+
) -> None:
|
861
|
+
"""Validate that the data types being written are compatible with all supported reader types.
|
862
|
+
|
863
|
+
Args:
|
864
|
+
data: The dataset to validate
|
865
|
+
content_type: Content type being written
|
866
|
+
supported_reader_types: List of DatasetTypes that must be able to read this data
|
867
|
+
|
868
|
+
Raises:
|
869
|
+
TableValidationError: If any data types would break supported reader compatibility
|
870
|
+
"""
|
871
|
+
if not supported_reader_types:
|
872
|
+
return
|
873
|
+
|
874
|
+
# Get the schema from the data
|
875
|
+
schema = get_table_schema(data)
|
876
|
+
|
877
|
+
# Get the dataset type of the current data
|
878
|
+
writer_dataset_type = get_dataset_type(data)
|
879
|
+
|
880
|
+
# PYARROW_PARQUET is equivalent to PYARROW for compatibility
|
881
|
+
writer_type_str = (
|
882
|
+
writer_dataset_type.value
|
883
|
+
if writer_dataset_type != DatasetType.PYARROW_PARQUET
|
884
|
+
else "pyarrow"
|
885
|
+
)
|
886
|
+
|
887
|
+
content_type_str = content_type.value
|
888
|
+
|
889
|
+
# Check each field type for compatibility
|
890
|
+
incompatible_fields = []
|
891
|
+
|
892
|
+
for field in schema:
|
893
|
+
field_name = field.name
|
894
|
+
arrow_type_str = str(field.type)
|
895
|
+
|
896
|
+
# Get the base type name from PyArrow field type
|
897
|
+
base_type_name = get_base_arrow_type_name(field.type)
|
898
|
+
|
899
|
+
# Get compatible readers for this (arrow_type, writer_dataset_type, content_type) combination
|
900
|
+
compatible_readers = get_compatible_readers(
|
901
|
+
base_type_name,
|
902
|
+
writer_type_str,
|
903
|
+
content_type_str,
|
904
|
+
)
|
905
|
+
|
906
|
+
# Check if all supported reader types are compatible
|
907
|
+
for required_reader in supported_reader_types:
|
908
|
+
reader_is_compatible = required_reader in compatible_readers
|
909
|
+
|
910
|
+
# Special case: PYARROW_PARQUET is equivalent to PYARROW for compatibility if we're writing parquet
|
911
|
+
if (
|
912
|
+
not reader_is_compatible
|
913
|
+
and content_type == ContentType.PARQUET
|
914
|
+
and required_reader == DatasetType.PYARROW_PARQUET
|
915
|
+
):
|
916
|
+
reader_is_compatible = DatasetType.PYARROW in compatible_readers
|
917
|
+
|
918
|
+
if not reader_is_compatible:
|
919
|
+
incompatible_fields.append(
|
920
|
+
{
|
921
|
+
"field_name": field_name,
|
922
|
+
"arrow_type": arrow_type_str,
|
923
|
+
"incompatible_reader": required_reader,
|
924
|
+
"writer_type": writer_dataset_type,
|
925
|
+
"content_type": content_type,
|
926
|
+
}
|
927
|
+
)
|
928
|
+
|
929
|
+
# Raise error if any incompatibilities found
|
930
|
+
if incompatible_fields:
|
931
|
+
error_details = []
|
932
|
+
for incompatible in incompatible_fields:
|
933
|
+
error_details.append(
|
934
|
+
f"Field '{incompatible['field_name']}' with type '{incompatible['arrow_type']}' "
|
935
|
+
f"written by {incompatible['writer_type']} to {incompatible['content_type']} "
|
936
|
+
f"cannot be read by required reader type {incompatible['incompatible_reader']}. "
|
937
|
+
f"If you expect this write to succeed and this reader is not required, then it "
|
938
|
+
f"can be removed from the table's supported reader types property."
|
939
|
+
)
|
940
|
+
|
941
|
+
raise TableValidationError(
|
942
|
+
f"Reader compatibility validation failed. The following fields would break "
|
943
|
+
f"supported reader types:\n" + "\n".join(error_details)
|
944
|
+
)
|
945
|
+
|
946
|
+
|
947
|
+
+def _stage_commit_and_compact(
+    converted_data: Dataset,
+    partition,
+    delta_type: DeltaType,
+    content_type: ContentType,
+    commit_staged_partition: bool,
+    table_version_obj: TableVersion,
+    namespace: str,
+    table: str,
+    schema: Schema,
+    original_fields: Set[str],
+    **kwargs,
+) -> None:
+    """Stage and commit delta, then handle compaction if needed."""
+    # Remove schema from kwargs to avoid duplicate parameter conflict
+    # We explicitly pass the correct schema parameter
+    kwargs.pop("schema", None)
+
+    # Stage a delta with the data
+    delta = _get_storage(**kwargs).stage_delta(
+        data=converted_data,
+        partition=partition,
+        delta_type=delta_type,
+        content_type=content_type,
+        author=ManifestAuthor.of(
+            name="deltacat.write_to_table", version=dc.__version__
+        ),
+        schema=schema,
+        **kwargs,
+    )
+
+    delta = _get_storage(**kwargs).commit_delta(delta=delta, **kwargs)
+
+    if commit_staged_partition:
+        _get_storage(**kwargs).commit_partition(partition=partition, **kwargs)
+
+    # Check compaction trigger decision
+    should_compact = _trigger_compaction(
+        table_version_obj,
+        delta,
+        TableReadOptimizationLevel.MAX,
+        **kwargs,
+    )
+    if should_compact:
+        # Run V2 compaction session to merge or delete data
+        if table_version_obj.schema:
+            all_column_names = table_version_obj.schema.arrow.names
+        else:
+            raise RuntimeError("Table version schema is required to run compaction.")
+        _run_compaction_session(
+            table_version_obj=table_version_obj,
+            partition=partition,
+            latest_delta_stream_position=delta.stream_position,
+            namespace=namespace,
+            table=table,
+            original_fields=original_fields,
+            all_column_names=all_column_names,
+            **kwargs,
+        )
+
+
+def _trigger_compaction(
+    table_version_obj: TableVersion,
+    latest_delta: Optional[Delta],
+    target_read_optimization_level: TableReadOptimizationLevel,
+    **kwargs,
+) -> bool:
+    # Import inside function to avoid circular imports
+    from deltacat.compute.compactor.utils import round_completion_reader as rci
+
+    # Extract delta type from latest_delta if available, otherwise default to no compaction
+    if latest_delta is not None:
+        delta_type = latest_delta.type
+        partition_values = latest_delta.partition_locator.partition_values
+        logger.info(
+            f"Using delta type {delta_type} from latest delta {latest_delta.locator}"
+        )
+    else:
+        logger.info(f"No latest delta discovered, defaulting to no compaction.")
+        return False
+
+    if (
+        table_version_obj.read_table_property(TableProperty.READ_OPTIMIZATION_LEVEL)
+        == target_read_optimization_level
+    ):
+        if delta_type == DeltaType.DELETE or delta_type == DeltaType.UPSERT:
+            return True
+        elif delta_type == DeltaType.APPEND:
+            # Get default stream to determine partition locator
+            stream = _get_table_stream(
+                table_version_obj.locator.namespace,
+                table_version_obj.locator.table_name,
+                table_version_obj.locator.table_version,
+                **kwargs,
+            )
+
+            if not stream:
+                return False
+
+            # Use provided partition_values or None for unpartitioned tables
+            partition_locator = PartitionLocator.of(
+                stream_locator=stream.locator,
+                partition_values=partition_values,
+                partition_id=None,
+            )
+
+            # Get round completion info to determine high watermark
+            round_completion_info = rci.read_round_completion_info(
+                source_partition_locator=partition_locator,
+                destination_partition_locator=partition_locator,
+                deltacat_storage=_get_storage(**kwargs),
+                deltacat_storage_kwargs=kwargs,
+            )
+
+            high_watermark = (
+                round_completion_info.high_watermark
+                if round_completion_info
+                and isinstance(round_completion_info.high_watermark, int)
+                else 0
+            )
+
+            # Get all deltas appended since last compaction
+            deltas = _get_storage(**kwargs).list_deltas(
+                namespace=table_version_obj.locator.namespace,
+                table_name=table_version_obj.locator.table_name,
+                table_version=table_version_obj.locator.table_version,
+                partition_values=partition_values,
+                start_stream_position=high_watermark + 1,
+                **kwargs,
+            )
+
+            if not deltas:
+                return False
+
+            # Count deltas appended since last compaction
+            appended_deltas_since_last_compaction = len(deltas)
+            delta_trigger = table_version_obj.read_table_property(
+                TableProperty.APPENDED_DELTA_COUNT_COMPACTION_TRIGGER
+            )
+            if delta_trigger and appended_deltas_since_last_compaction >= delta_trigger:
+                return True
+
+            # Count files appended since last compaction
+            appended_files_since_last_compaction = 0
+            for delta in deltas:
+                if delta.manifest and delta.manifest.entries:
+                    appended_files_since_last_compaction += len(delta.manifest.entries)
+
+            file_trigger = table_version_obj.read_table_property(
+                TableProperty.APPENDED_FILE_COUNT_COMPACTION_TRIGGER
+            )
+            if file_trigger and appended_files_since_last_compaction >= file_trigger:
+                return True
+
+            # Count records appended since last compaction
+            appended_records_since_last_compaction = 0
+            for delta in deltas:
+                if delta.meta and delta.meta.record_count:
+                    appended_records_since_last_compaction += delta.meta.record_count
+
+            record_trigger = table_version_obj.read_table_property(
+                TableProperty.APPENDED_RECORD_COUNT_COMPACTION_TRIGGER
+            )
+            if (
+                record_trigger
+                and appended_records_since_last_compaction >= record_trigger
+            ):
+                return True
+    return False
+
+
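In the APPEND branch above, compaction fires once any counter accumulated since the last round completion (deltas, manifest files, or records) reaches its table-property threshold. Below is a reduced, self-contained sketch of just that decision; the counts and thresholds are hypothetical stand-ins for the values the real code reads from table properties and from the deltas listed after the high watermark.

```python
# Illustrative reduction of the APPEND compaction-trigger decision above.
# Thresholds and counts are hypothetical stand-ins for table-property values.
from dataclasses import dataclass
from typing import Optional


@dataclass
class AppendStats:
    deltas: int
    files: int
    records: int


def should_compact_append(
    stats: AppendStats,
    delta_trigger: Optional[int],
    file_trigger: Optional[int],
    record_trigger: Optional[int],
) -> bool:
    # Any single threshold being reached is enough to trigger a compaction round.
    if delta_trigger and stats.deltas >= delta_trigger:
        return True
    if file_trigger and stats.files >= file_trigger:
        return True
    if record_trigger and stats.records >= record_trigger:
        return True
    return False


print(
    should_compact_append(
        AppendStats(deltas=3, files=12, records=50_000),
        delta_trigger=10,
        file_trigger=10,
        record_trigger=None,
    )
)
# True (12 appended files >= file trigger of 10)
```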
+def _get_compaction_primary_keys(table_version_obj: TableVersion) -> set:
+    """Extract primary keys from table schema for compaction."""
+    table_schema = table_version_obj.schema
+    return (
+        set(table_schema.merge_keys)
+        if table_schema and table_schema.merge_keys
+        else set()
+    )
+
+
+def _get_compaction_hash_bucket_count(
+    partition: Partition, table_version_obj: TableVersion
+) -> int:
+    """Determine hash bucket count from previous compaction, table property, or default."""
+    # First check if we have a hash bucket count from previous compaction
+    if (
+        partition.compaction_round_completion_info
+        and partition.compaction_round_completion_info.hash_bucket_count
+    ):
+        hash_bucket_count = partition.compaction_round_completion_info.hash_bucket_count
+        logger.info(
+            f"Using hash bucket count {hash_bucket_count} from previous compaction"
+        )
+        return hash_bucket_count
+
+    # Otherwise use the table property for default compaction hash bucket count
+    hash_bucket_count = table_version_obj.read_table_property(
+        TableProperty.DEFAULT_COMPACTION_HASH_BUCKET_COUNT
+    )
+    logger.info(f"Using hash bucket count {hash_bucket_count} from table property")
+    return hash_bucket_count
+
+
+def _get_merge_order_sort_keys(table_version_obj: TableVersion):
+    """Extract sort keys from merge_order fields in schema for compaction.
+
+    Args:
+        table_version_obj: The table version containing schema
+
+    Returns:
+        List of SortKey objects from merge_order fields, or None if no merge_order fields are defined
+    """
+    if table_version_obj.schema:
+        return table_version_obj.schema.merge_order_sort_keys()
+    return None
+
+
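`_get_compaction_hash_bucket_count` prefers the bucket count recorded by the previous compaction round and only then falls back to the table-level default. A minimal sketch of that fallback, with a hypothetical stand-in for the round completion info object:

```python
# Minimal sketch of the hash-bucket-count fallback above: prefer the count recorded
# by the previous compaction round, otherwise use the table-level default.
# PreviousRound and the default value are hypothetical stand-ins.
from dataclasses import dataclass
from typing import Optional


@dataclass
class PreviousRound:
    hash_bucket_count: Optional[int] = None


def resolve_hash_bucket_count(previous: Optional[PreviousRound], table_default: int) -> int:
    if previous and previous.hash_bucket_count:
        return previous.hash_bucket_count
    return table_default


print(resolve_hash_bucket_count(PreviousRound(hash_bucket_count=64), table_default=16))  # 64
print(resolve_hash_bucket_count(None, table_default=16))                                 # 16
```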
+def _create_compaction_params(
+    table_version_obj: TableVersion,
+    partition: Partition,
+    latest_stream_position: int,
+    primary_keys: set,
+    hash_bucket_count: int,
+    original_fields: Set[str],
+    all_column_names: Optional[List[str]],
+    **kwargs,
+):
+    """Create compaction parameters for the compaction session."""
+    from deltacat.compute.compactor.model.compact_partition_params import (
+        CompactPartitionParams,
+    )
+
+    # Remove create_table/alter_table kwargs not needed for compaction
+    kwargs.pop("lifecycle_state", None)
+    kwargs.pop("schema", None)
+    kwargs.pop("partition_scheme", None)
+    kwargs.pop("sort_keys", None)
+    kwargs.pop("table_description", None)
+    kwargs.pop("table_version_description", None)
+    kwargs.pop("table_properties", None)
+    kwargs.pop("table_version_properties", None)
+    kwargs.pop("namespace_properties", None)
+    kwargs.pop("content_types", None)
+    kwargs.pop("fail_if_exists", None)
+    kwargs.pop("schema_updates", None)
+    kwargs.pop("partition_updates", None)
+    kwargs.pop("sort_scheme", None)
+
+    table_writer_kwargs = kwargs.pop("table_writer_kwargs", {})
+    table_writer_kwargs["schema"] = table_version_obj.schema
+    table_writer_kwargs["sort_scheme_id"] = table_version_obj.sort_scheme.id
+    deltacat_storage_kwargs = kwargs.pop("deltacat_storage_kwargs", {})
+    deltacat_storage_kwargs["transaction"] = kwargs.get("transaction", None)
+    list_deltas_kwargs = kwargs.pop("list_deltas_kwargs", {})
+    list_deltas_kwargs["transaction"] = kwargs.get("transaction", None)
+
+    return CompactPartitionParams.of(
+        {
+            "catalog": kwargs.get("inner", kwargs.get("catalog")),
+            "source_partition_locator": partition.locator,
+            "destination_partition_locator": partition.locator,  # In-place compaction
+            "primary_keys": primary_keys,
+            "last_stream_position_to_compact": latest_stream_position,
+            "deltacat_storage": _get_storage(**kwargs),
+            "deltacat_storage_kwargs": deltacat_storage_kwargs,
+            "list_deltas_kwargs": list_deltas_kwargs,
+            "table_writer_kwargs": table_writer_kwargs,
+            "hash_bucket_count": hash_bucket_count,
+            "records_per_compacted_file": table_version_obj.read_table_property(
+                TableProperty.RECORDS_PER_COMPACTED_FILE,
+            ),
+            "compacted_file_content_type": ContentType.PARQUET,
+            "drop_duplicates": True,
+            "sort_keys": _get_merge_order_sort_keys(table_version_obj),
+            "original_fields": original_fields,
+            "all_column_names": all_column_names,
+        }
+    )
+
+
+def _run_compaction_session(
+    table_version_obj: TableVersion,
+    partition: Partition,
+    latest_delta_stream_position: int,
+    namespace: str,
+    table: str,
+    original_fields: Set[str],
+    all_column_names: List[str],
+    **kwargs,
+) -> None:
+    """
+    Run a V2 compaction session for the given table and partition.
+
+    Args:
+        table_version_obj: The table version object
+        partition: The partition to compact
+        latest_delta_stream_position: Stream position of the latest delta
+        namespace: The table namespace
+        table: The table name
+        original_fields: The original field set for partial UPSERT support
+        all_column_names: All column names in the table version's schema
+        **kwargs: Additional arguments including catalog and storage parameters
+    """
+    # Import inside function to avoid circular imports
+    from deltacat.compute.compactor_v2.compaction_session import compact_partition
+
+    try:
+        # Extract compaction configuration
+        primary_keys = _get_compaction_primary_keys(table_version_obj)
+        hash_bucket_count = _get_compaction_hash_bucket_count(
+            partition, table_version_obj
+        )
+
+        # Create compaction parameters
+        compact_partition_params = _create_compaction_params(
+            table_version_obj,
+            partition,
+            latest_delta_stream_position,
+            primary_keys,
+            hash_bucket_count,
+            original_fields=original_fields,
+            all_column_names=all_column_names,
+            **kwargs,
+        )
+
+        # Run V2 compaction session
+        compact_partition(params=compact_partition_params)
+    except Exception as e:
+        logger.error(
+            f"Error during compaction session for {namespace}.{table}, "
+            f"partition {partition.locator}: {e}"
+        )
+        raise
+
+
+def _get_merge_key_field_names_from_schema(schema) -> List[str]:
+    """Extract merge key field names from a DeltaCAT Schema object.
+
+    Args:
+        schema: DeltaCAT Schema object
+
+    Returns:
+        List of field names that are marked as merge keys
+    """
+    if not schema or not schema.merge_keys:
+        return []
+
+    merge_key_field_names = []
+    field_ids_to_fields = schema.field_ids_to_fields
+
+    for merge_key_id in schema.merge_keys:
+        if merge_key_id in field_ids_to_fields:
+            field = field_ids_to_fields[merge_key_id]
+            merge_key_field_names.append(field.arrow.name)
+
+    return merge_key_field_names
+
+
+def _set_entry_params_if_needed(
+    mode: TableWriteMode, table_version_obj, kwargs: dict
+) -> None:
+    """Automatically set entry_params to merge keys if not already set by user.
+
+    Args:
+        mode: The table write mode
+        table_version_obj: The table version object containing schema
+        kwargs: Keyword arguments dictionary that may contain entry_params
+    """
+    # Only set entry_params for DELETE and MERGE modes
+    if mode not in [TableWriteMode.DELETE, TableWriteMode.MERGE]:
+        return
+
+    # Don't override if user already provided entry_params
+    if "entry_params" in kwargs and kwargs["entry_params"] is not None:
+        return
+
+    # Get schema from table version
+    if not table_version_obj or not table_version_obj.schema:
+        return
+
+    # Extract merge key field names
+    merge_key_field_names = _get_merge_key_field_names_from_schema(
+        table_version_obj.schema
+    )
+
+    if merge_key_field_names:
+        from deltacat.storage import EntryParams
+
+        kwargs["entry_params"] = EntryParams.of(merge_key_field_names)
+
+
+def _get_table_stream(namespace: str, table: str, table_version: str, **kwargs):
+    """Helper function to get a stream for a table version."""
+    return _get_storage(**kwargs).get_stream(
         namespace=namespace,
         table_name=table,
         table_version=table_version,
         **kwargs,
     )
-    table_version = table_version_obj.table_version

+
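For DELETE and MERGE writes, `_set_entry_params_if_needed` defaults `entry_params` to the table's merge keys so matching happens on those fields. A hedged illustration of the equivalent explicit call a writer could make (the column names are examples; `EntryParams.of` is used here exactly as in the helper above):

```python
# Sketch: explicitly pinning entry params to merge-key columns instead of relying
# on the automatic default above. "order_id" and "region" are example column names.
from deltacat.storage import EntryParams

entry_params = EntryParams.of(["order_id", "region"])
# e.g. passed through the write kwargs so DELETE/MERGE deltas match on these fields
```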
+def _validate_read_table_input(
+    namespace: str,
+    table: str,
+    table_schema: Optional[Schema],
+    table_type: Optional[DatasetType],
+    distributed_dataset_type: Optional[DatasetType],
+) -> None:
+    """Validate input parameters for read_table operation."""
     if (
-
-
+        distributed_dataset_type
+        and distributed_dataset_type not in DatasetType.distributed()
     ):
         raise ValueError(
-            "
-            f"
+            f"{distributed_dataset_type} is not a valid distributed dataset type. "
+            f"Valid distributed dataset types are: {DatasetType.distributed()}."
+        )
+    if table_type and table_type not in DatasetType.local():
+        raise ValueError(
+            f"{table_type} is not a valid local table type. "
+            f"Valid table types are: {DatasetType.local()}."
         )

+    # For schemaless tables, distributed datasets are not yet supported
+    if table_schema is None and distributed_dataset_type:
+        raise NotImplementedError(
+            f"Distributed dataset reading is not yet supported for schemaless tables. "
+            f"Table '{namespace}.{table}' has no schema, but distributed_dataset_type={distributed_dataset_type} was specified. "
+            f"Please use local storage by setting distributed_dataset_type=None."
+        )
+
+
+def _get_qualified_deltas_for_read(
+    table: str,
+    namespace: str,
+    table_version: str,
+    partition_filter: Optional[List[Union[Partition, PartitionLocator]]],
+    **kwargs,
+) -> List[Delta]:
+    """Get qualified deltas for reading based on partition filter."""
     logger.info(
         f"Reading metadata for table={namespace}/{table}/{table_version} "
-        f"with partition_filters={partition_filter}
-        f" range={stream_position_range_inclusive}"
+        f"with partition_filters={partition_filter}."
     )

+    # Get partition filter if not provided
     if partition_filter is None:
-
-
-            "as partition_filter was None."
-        )
-        partition_filter = (
-            _get_storage(**kwargs)
-            .list_partitions(
-                table_name=table,
-                namespace=namespace,
-                table_version=table_version,
-                **kwargs,
-            )
-            .all_items()
+        partition_filter = _get_all_committed_partitions(
+            table, namespace, table_version, **kwargs
         )

+    # Get deltas from partitions
     qualified_deltas = _get_deltas_from_partition_filter(
-        stream_position_range_inclusive=stream_position_range_inclusive,
         partition_filter=partition_filter,
         **kwargs,
     )
@@ -165,30 +1406,390 @@ def read_table(
         f"from {len(partition_filter)} partitions."
     )

-
-
-
-
-
-
-
+    return qualified_deltas
+
+
+def _get_max_parallelism(
+    max_parallelism: Optional[int],
+    distributed_dataset_type: Optional[DatasetType],
+) -> int:
+    """Get the max parallelism for a read operation."""
+    if distributed_dataset_type:
+        max_parallelism = max_parallelism or 100
+    else:
+        # TODO(pdames): Set max parallelism using available resources and dataset size
+        max_parallelism = 1
+    if max_parallelism < 1:
+        raise ValueError(
+            f"max_parallelism must be greater than 0, but got {max_parallelism}"
+        )
+    logger.info(f"Using max_parallelism={max_parallelism} for read operation")
+
+    return max_parallelism
+
+
+def _handle_schemaless_table_read(
+    qualified_deltas: List[Delta],
+    read_as: DatasetType,
+    **kwargs,
+) -> Dataset:
+    """Handle reading schemaless tables by flattening manifest entries."""
+    # Create a PyArrow table for each delta
+    # TODO(pdames): More efficient implementation for tables with millions/billions of entries
+    tables = []
+    for delta in qualified_deltas:
+        # Get the manifest for this delta
+        if delta.manifest:
+            manifest = delta.manifest
+        else:
+            # Fetch manifest from storage
+            manifest = _get_storage(**kwargs).get_delta_manifest(
+                delta.locator,
+                transaction=kwargs.get("transaction"),
+                **kwargs,
+            )
+        # Create flattened table from this delta's manifest
+        table = pa_utils.delta_manifest_to_table(
+            manifest,
+            delta,
+        )
+        tables.append(table)
+
+    # Concatenate all PyArrow tables
+    final_table = pa_utils.concat_tables(tables)
+
+    # Convert from PyArrow to the requested dataset type
+    return from_pyarrow(final_table, read_as)
+
+
+def _download_and_process_table_data(
+    namespace: str,
+    table: str,
+    qualified_deltas: List[Delta],
+    read_as: DatasetType,
+    max_parallelism: Optional[int],
+    columns: Optional[List[str]],
+    file_path_column: Optional[str],
+    table_version_obj: Optional[TableVersion],
+    **kwargs,
+) -> Dataset:
+    """Download delta data and process result based on storage type."""
+
+    # Handle NUMPY read requests by translating to PANDAS internally
+    original_read_as = read_as
+    effective_read_as = read_as
+    if read_as == DatasetType.NUMPY:
+        effective_read_as = DatasetType.PANDAS
+        logger.debug("Translating NUMPY read request to PANDAS for internal processing")
+
+    # Merge deltas and download data
+    if not qualified_deltas:
+        # Return empty table with original read_as type
+        return empty_table(original_read_as)
+
+    # Special handling for non-empty schemaless tables
+    if table_version_obj.schema is None:
+        result = _handle_schemaless_table_read(
+            qualified_deltas,
+            effective_read_as,
+            **kwargs,
+        )
+        # Convert to numpy if original request was for numpy
+        if original_read_as == DatasetType.NUMPY:
+            return _convert_pandas_to_numpy(result)
+        return result
+
+    # Get schemas for each manifest entry
+    entry_index_to_schema = _build_entry_index_to_schema_mapping(
+        qualified_deltas, table_version_obj, **kwargs
+    )
+    # Standard non-empty schema table read path - merge deltas and download data
+    merged_delta = Delta.merge_deltas(qualified_deltas)
+
+    # Convert read parameters to download parameters
+    table_type = (
+        effective_read_as
+        if effective_read_as in DatasetType.local()
+        else (kwargs.pop("table_type", None) or DatasetType.PYARROW)
+    )
+    distributed_dataset_type = (
+        effective_read_as if effective_read_as in DatasetType.distributed() else None
     )

-
-
+    # Validate input parameters
+    _validate_read_table_input(
+        namespace,
+        table,
+        table_version_obj.schema,
+        table_type,
+        distributed_dataset_type,
    )

+    # Determine max parallelism
+    max_parallelism = _get_max_parallelism(
+        max_parallelism,
+        distributed_dataset_type,
+    )
+    # Filter out parameters that are already passed as keyword arguments
+    # to avoid "multiple values for argument" errors
+    filtered_kwargs = {
+        k: v
+        for k, v in kwargs.items()
+        if k
+        not in [
+            "delta_like",
+            "table_type",
+            "storage_type",
+            "max_parallelism",
+            "columns",
+            "distributed_dataset_type",
+            "file_path_column",
+        ]
+    }
+    result = _get_storage(**kwargs).download_delta(
+        merged_delta,
+        table_type=effective_read_as,
+        storage_type=StorageType.DISTRIBUTED
+        if distributed_dataset_type
+        else StorageType.LOCAL,
+        max_parallelism=max_parallelism,
+        columns=columns,
+        distributed_dataset_type=distributed_dataset_type,
+        file_path_column=file_path_column,
+        **filtered_kwargs,
+    )
+
+    # Handle local storage table concatenation and PYARROW_PARQUET lazy materialization
+    if not distributed_dataset_type and table_type and isinstance(result, list):
+        if table_type == DatasetType.PYARROW_PARQUET:
+            # For PYARROW_PARQUET, preserve lazy materialization:
+            return result[0] if len(result) == 1 else result
+        else:
+            # For other types, perform normal concatenation
+            result = _handle_local_table_concatenation(
+                result,
+                table_type,
+                table_version_obj.schema,
+                entry_index_to_schema,
+                file_path_column,
+                columns,
+            )
+    # Convert to numpy if original request was for numpy
+    if original_read_as == DatasetType.NUMPY:
+        return _convert_pandas_to_numpy(result)
+
+    return result
+
+
+def _convert_pandas_to_numpy(dataset: Dataset):
+    """Convert pandas DataFrame to numpy ndarray."""
+    if not isinstance(dataset, pd.DataFrame):
+        raise ValueError(f"Expected pandas DataFrame but found {type(dataset)}")
+    return dataset.to_numpy()
+
+
+def _coerce_dataset_to_schema(
+    dataset: Dataset, target_schema: pa.Schema, manifest_entry_schema: Schema
+) -> Dataset:
+    """Coerce a dataset to match the target PyArrow schema using DeltaCAT Schema.coerce method."""
+    # Convert target PyArrow schema to DeltaCAT schema and use its coerce method
+    deltacat_schema = Schema.of(schema=target_schema)
+    return deltacat_schema.coerce(dataset, manifest_entry_schema)
+
+
+def _coerce_results_to_schema(
+    results: Dataset, target_schema: pa.Schema, entry_index_to_schema: List[Schema]
+) -> List[Dataset]:
+    """Coerce all table results to match the target schema."""
+    coerced_results = []
+    for i, table_result in enumerate(results):
+        coerced_result = _coerce_dataset_to_schema(
+            table_result, target_schema, entry_index_to_schema[i]
+        )
+        coerced_results.append(coerced_result)
+        logger.debug(f"Coerced table {i} to unified schema")
+    return coerced_results
+
+
+def _create_target_schema(
+    arrow_schema: pa.Schema,
+    columns: Optional[List[str]] = None,
+    file_path_column: Optional[str] = None,
+) -> pa.Schema:
+    """Create target schema for concatenation with optional column selection and file_path_column."""
+    if columns is not None:
+        # Column selection - use only specified columns
+        field_map = {field.name: field for field in arrow_schema}
+        selected_fields = []
+
+        for col_name in columns:
+            if col_name in field_map:
+                selected_fields.append(field_map[col_name])
+        arrow_schema = pa.schema(selected_fields)
+    if file_path_column and file_path_column not in arrow_schema.names:
+        arrow_schema = arrow_schema.append(pa.field(file_path_column, pa.string()))
+    return arrow_schema
+
+
+def _create_entry_schemas_for_concatenation(
+    entry_index_to_schema: List[Schema],
+    columns: Optional[List[str]] = None,
+    file_path_column: Optional[str] = None,
+) -> List[Schema]:
+    """Create entry schemas for concatenation, optionally filtered by column selection."""
+    if columns is None:
+        # No column selection - return original schemas as-is
+        return entry_index_to_schema
+
+    # Column selection - filter each entry schema
+    modified_schemas = []
+    for entry_schema in entry_index_to_schema:
+        if entry_schema and entry_schema.arrow:
+            filtered_schema = _create_target_schema(
+                entry_schema.arrow, columns, file_path_column
+            )
+            modified_schemas.append(Schema.of(schema=filtered_schema))
+        else:
+            modified_schemas.append(entry_schema)
+
+    return modified_schemas
+
+
+def _handle_local_table_concatenation(
+    results: Dataset,
+    table_type: DatasetType,
+    table_schema: Optional[Schema],
+    entry_index_to_schema: List[Schema],
+    file_path_column: Optional[str] = None,
+    columns: Optional[List[str]] = None,
+) -> Dataset:
+    """Handle concatenation of local table results with schema coercion."""
+    logger.debug(f"Target table schema for concatenation: {table_schema}")
+
+    # Create target schema for coercion, respecting column selection
+    target_schema = _create_target_schema(table_schema.arrow, columns, file_path_column)
+    logger.debug(f"Created target schema: {target_schema.names}")
+
+    # Filter entry schemas to match column selection and file_path_column
+    modified_entry_schemas = _create_entry_schemas_for_concatenation(
+        entry_index_to_schema, columns, file_path_column
+    )
+
+    # Coerce results to unified schema
+    coerced_results = _coerce_results_to_schema(
+        results, target_schema, modified_entry_schemas
+    )
+
+    # Second step: concatenate the coerced results
+    logger.debug(
+        f"Concatenating {len(coerced_results)} local tables of type {table_type} with unified schemas"
+    )
+    concatenated_result = concat_tables(coerced_results, table_type)
+    logger.debug(f"Concatenation complete, result type: {type(concatenated_result)}")
+    return concatenated_result
+
+
+def read_table(
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    read_as: DatasetType = DatasetType.DAFT,
+    partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
+    max_parallelism: Optional[int] = None,
+    columns: Optional[List[str]] = None,
+    file_path_column: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Dataset:
+    """Read a table into a dataset.
+
+    Args:
+        table: Name of the table to read.
+        namespace: Optional namespace of the table. Uses default if not specified.
+        table_version: Optional specific version of the table to read.
+        read_as: Dataset type to use for reading table files. Defaults to DatasetType.DAFT.
+        partition_filter: Optional list of partitions to read from.
+        max_parallelism: Optional maximum parallelism for data download. Defaults to the number of
+            available CPU cores for local dataset type reads (i.e., members of DatasetType.local())
+            and 100 for distributed dataset type reads (i.e., members of DatasetType.distributed()).
+        columns: Optional list of columns to include in the result.
+        file_path_column: Optional column name to add file paths to the result.
+        transaction: Optional transaction to chain this read operation to. If provided, uncommitted
+            changes from the transaction will be visible to this read operation.
+        **kwargs: Additional keyword arguments.
+
+    Returns:
+        Dataset containing the table data.
+    """
+    # Set up transaction handling
+    read_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = read_transaction
+
+    try:
+        # Resolve namespace and get table metadata
+        namespace = namespace or default_namespace()
+
+        table_version_obj = _get_latest_active_or_given_table_version(
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version,
+            **kwargs,
+        )
+
+        # Get partitions and deltas to read
+        qualified_deltas = _get_qualified_deltas_for_read(
+            table,
+            namespace,
+            table_version_obj.table_version,
+            partition_filter,
+            **kwargs,
+        )
+
+        # Download and process the data
+        # TODO(pdames): Remove once we implement a custom SerDe for pa.ParquetFile
+        if read_as == DatasetType.PYARROW_PARQUET:
+            max_parallelism = 1
+            logger.warning(
+                f"Forcing max_parallelism to 1 for PyArrow Parquet reads to avoid serialization errors."
+            )
+        result = _download_and_process_table_data(
+            namespace,
+            table,
+            qualified_deltas,
+            read_as,
+            max_parallelism,
+            columns,
+            file_path_column,
+            table_version_obj,
+            **kwargs,
+        )
+        return result
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during read_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            read_transaction.seal()
+

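A hedged usage sketch of the new `read_table` signature (the table, namespace, and column names are illustrative, and it assumes the top-level `deltacat` package re-exports `read_table` and `DatasetType` as suggested by the `deltacat/__init__.py` changes in this release):

```python
# Illustrative read_table call against the signature above; names are examples and
# catalog initialization is assumed to have happened elsewhere.
import deltacat as dc
from deltacat import DatasetType  # assumed top-level re-export

df = dc.read_table(
    "orders",
    namespace="sales",
    read_as=DatasetType.PANDAS,        # a local dataset type
    columns=["order_id", "amount"],
    file_path_column="source_file",    # adds the originating file path per row
)
```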

 def alter_table(
     table: str,
     *args,
     namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
     lifecycle_state: Optional[LifecycleState] = None,
-    schema_updates: Optional[
+    schema_updates: Optional[SchemaUpdate] = None,
     partition_updates: Optional[Dict[str, Any]] = None,
-
-
-
+    sort_scheme: Optional[SortScheme] = None,
+    table_description: Optional[str] = None,
+    table_version_description: Optional[str] = None,
+    table_properties: Optional[TableProperties] = None,
+    table_version_properties: Optional[TableVersionProperties] = None,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """Alter deltacat table/table_version definition.
@@ -199,61 +1800,169 @@ def alter_table(
     Args:
         table: Name of the table to alter.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to alter. Defaults to the latest active version.
         lifecycle_state: New lifecycle state for the table.
-        schema_updates:
-        partition_updates:
-
-
-
+        schema_updates: Schema updates to apply.
+        partition_updates: Partition scheme updates to apply.
+        sort_scheme: New sort scheme.
+        table_description: New description for the table.
+        table_version_description: New description for the table version. Defaults to `table_description` if not specified.
+        table_properties: New table properties.
+        table_version_properties: New table version properties. Defaults to the current parent table properties if not specified.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None

     Raises:
         TableNotFoundError: If the table does not already exist.
+        TableVersionNotFoundError: If the specified table version or active table version does not exist.
     """
+    resolved_table_properties = None
+    if table_properties is not None:
+        resolved_table_properties = _add_default_table_properties(table_properties)
+        _validate_table_properties(resolved_table_properties)
+
     namespace = namespace or default_namespace()

-
-
-
-        table_name=table,
-        description=description,
-        properties=properties,
-        lifecycle_state=lifecycle_state,
-        **kwargs,
-    )
+    # Set up transaction handling
+    alter_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = alter_transaction

-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        if partition_updates:
+            raise NotImplementedError("Partition updates are not yet supported.")
+        if sort_scheme:
+            raise NotImplementedError("Sort scheme updates are not yet supported.")
+
+        new_table: Table = _get_storage(**kwargs).update_table(
+            *args,
+            namespace=namespace,
+            table_name=table,
+            description=table_description,
+            properties=resolved_table_properties,
+            **kwargs,
+        )
+
+        if table_version is None:
+            table_version: Optional[TableVersion] = _get_storage(
+                **kwargs
+            ).get_latest_active_table_version(namespace, table, **kwargs)
+            if table_version is None:
+                raise TableVersionNotFoundError(
+                    f"No active table version found for table {namespace}.{table}. "
+                    "Please specify a table_version parameter."
+                )
+        else:
+            table_version = _get_storage(**kwargs).get_table_version(
+                namespace, table, table_version, **kwargs
+            )
+            if table_version is None:
+                raise TableVersionNotFoundError(
+                    f"Table version '{table_version}' not found for table {namespace}.{table}"
+                )
+
+        # Get table properties for schema evolution
+        schema_evolution_mode = table_version.read_table_property(
+            TableProperty.SCHEMA_EVOLUTION_MODE
+        )
+        if schema_updates and schema_evolution_mode == SchemaEvolutionMode.DISABLED:
+            raise TableValidationError(
+                "Schema evolution is disabled for this table. Please enable schema evolution or remove schema updates."
+            )
+
+        # Only update table version properties if they are explicitly provided
+        resolved_tv_properties = None
+        if table_version_properties is not None:
+            # inherit properties from the parent table if not specified
+            default_tv_properties = new_table.properties
+            if table_version.schema is None:
+                # schemaless tables don't validate reader compatibility by default
+                default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+            resolved_tv_properties = _add_default_table_properties(
+                table_version_properties,
+                default_tv_properties,
+            )
+            _validate_table_properties(resolved_tv_properties)
+
+        # Apply schema updates if provided
+        updated_schema = None
+        if schema_updates is not None:
+            # Get the current schema from the table version
+            current_schema = table_version.schema
+            if current_schema != schema_updates.base_schema:
+                raise ValueError(
+                    f"Schema updates are not compatible with the current schema for table `{namespace}.{table}`. Current schema: {current_schema}, Schema update base schema: {schema_updates.base_schema}"
+                )
+
+            # Apply all the updates to get the final schema
+            updated_schema = schema_updates.apply()
+
+        _get_storage(**kwargs).update_table_version(
+            *args,
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version.id,
+            lifecycle_state=lifecycle_state,
+            description=table_version_description or table_description,
+            schema=updated_schema,
+            properties=resolved_tv_properties,  # This will be None if table_version_properties was not provided
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during alter_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            alter_transaction.seal()
+
+
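A hedged usage sketch of `alter_table` as declared above (names and descriptions are examples, and catalog initialization is assumed elsewhere; schema, partition, and sort-scheme updates are omitted here since the latter two raise NotImplementedError in this release):

```python
# Illustrative alter_table call; only parameters from the signature above are used.
import deltacat as dc

dc.alter_table(
    "orders",
    namespace="sales",
    table_description="Order events, deduplicated nightly",
    table_version_description="v2: updated table version properties",
)
```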
+def _add_default_table_properties(
+    table_properties: Optional[TableProperties],
+    default_table_properties: TableProperties = TablePropertyDefaultValues,
+) -> TableProperties:
+    if table_properties is None:
+        table_properties = {}
+    for k, v in default_table_properties.items():
+        if k not in table_properties:
+            table_properties[k] = v
+    return table_properties
+
+
+def _validate_table_properties(
+    table_properties: TableProperties,
+) -> None:
+    read_optimization_level = table_properties.get(
+        TableProperty.READ_OPTIMIZATION_LEVEL,
+        TablePropertyDefaultValues[TableProperty.READ_OPTIMIZATION_LEVEL],
     )
+    if read_optimization_level != TableReadOptimizationLevel.MAX:
+        raise NotImplementedError(
+            f"Table read optimization level `{read_optimization_level} is not yet supported. Please use {TableReadOptimizationLevel.MAX}"
+        )

 def create_table(
-
+    table: str,
     *args,
     namespace: Optional[str] = None,
-
+    table_version: Optional[str] = None,
     lifecycle_state: Optional[LifecycleState] = LifecycleState.ACTIVE,
     schema: Optional[Schema] = None,
     partition_scheme: Optional[PartitionScheme] = None,
     sort_keys: Optional[SortScheme] = None,
-
+    table_description: Optional[str] = None,
+    table_version_description: Optional[str] = None,
     table_properties: Optional[TableProperties] = None,
+    table_version_properties: Optional[TableVersionProperties] = None,
     namespace_properties: Optional[NamespaceProperties] = None,
     content_types: Optional[List[ContentType]] = None,
     fail_if_exists: bool = True,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> TableDefinition:
     """Create an empty table in the catalog.
@@ -261,20 +1970,22 @@ def create_table(
     If a namespace isn't provided, the table will be created within the default deltacat namespace.
     Additionally if the provided namespace does not exist, it will be created for you.

-
     Args:
-
+        table: Name of the table to create.
         namespace: Optional namespace for the table. Uses default namespace if not specified.
         version: Optional version identifier for the table.
         lifecycle_state: Lifecycle state of the new table. Defaults to ACTIVE.
         schema: Schema definition for the table.
         partition_scheme: Optional partitioning scheme for the table.
         sort_keys: Optional sort keys for the table.
-
+        table_description: Optional description of the table.
+        table_version_description: Optional description for the table version.
         table_properties: Optional properties for the table.
+        table_version_properties: Optional properties for the table version. Defaults to the current parent table properties if not specified.
         namespace_properties: Optional properties for the namespace if it needs to be created.
         content_types: Optional list of allowed content types for the table.
         fail_if_exists: If True, raises an error if table already exists. If False, returns existing table.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         TableDefinition object for the created or existing table.
@@ -283,56 +1994,133 @@ def create_table(
         TableAlreadyExistsError: If the table already exists and fail_if_exists is True.
         NamespaceNotFoundError: If the provided namespace does not exist.
     """
+    resolved_table_properties = _add_default_table_properties(table_properties)
+    # Note: resolved_tv_properties will be set after checking existing table
+
     namespace = namespace or default_namespace()

-
-
-
-        raise TableAlreadyExistsError(f"Table {namespace}.{name} already exists")
-        return table
+    # Set up transaction handling
+    create_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = create_transaction

-
-
-
+    try:
+        existing_table = (
+            get_table(
+                table,
+                namespace=namespace,
+                table_version=table_version,
+                *args,
+                **kwargs,
+            )
+            if "existing_table_definition" not in kwargs
+            else kwargs["existing_table_definition"]
         )
+        if existing_table is not None:
+            if existing_table.table_version and existing_table.stream:
+                if fail_if_exists:
+                    table_identifier = (
+                        f"{namespace}.{table}"
+                        if not table_version
+                        else f"{namespace}.{table}.{table_version}"
+                    )
+                    raise TableAlreadyExistsError(
+                        f"Table {table_identifier} already exists"
+                    )
+                return existing_table
+            # the table exists but the table version doesn't - inherit the existing table properties
+            # Also ensure table properties are inherited when not explicitly provided
+            if table_properties is None:
+                resolved_table_properties = existing_table.table.properties
+
+            # Set up table version properties based on existing table or explicit properties
+            default_tv_properties = resolved_table_properties
+            if schema is None:
+                default_tv_properties = dict(
+                    default_tv_properties
+                )  # Make a copy to avoid modifying original
+                default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+            resolved_tv_properties = _add_default_table_properties(
+                table_version_properties, default_tv_properties
+            )
+        else:
+            # create the namespace if it doesn't exist
+            if not namespace_exists(namespace, **kwargs):
+                create_namespace(
+                    namespace=namespace,
+                    properties=namespace_properties,
+                    *args,
+                    **kwargs,
+                )
+
+            # Set up table version properties for new table
+            default_tv_properties = resolved_table_properties
+            if schema is None:
+                default_tv_properties = dict(
+                    default_tv_properties
+                )  # Make a copy to avoid modifying original
+                default_tv_properties[TableProperty.SUPPORTED_READER_TYPES] = None
+            resolved_tv_properties = _add_default_table_properties(
+                table_version_properties, default_tv_properties
+            )

-
-        *args,
-        namespace=namespace,
-        table_name=name,
-        table_version=version,
-        schema=schema,
-        partition_scheme=partition_scheme,
-        sort_keys=sort_keys,
-        table_version_description=description,
-        table_description=description,
-        table_properties=table_properties,
-        lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
-        supported_content_types=content_types,
-        **kwargs,
-    )
+        _validate_table_properties(resolved_tv_properties)

-
-
-
-
-
+        (table, table_version, stream) = _get_storage(**kwargs).create_table_version(
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version,
+            schema=schema,
+            partition_scheme=partition_scheme,
+            sort_keys=sort_keys,
+            table_version_description=table_version_description
+            if table_version_description is not None
+            else table_description,
+            table_description=table_description,
+            table_properties=resolved_table_properties,
+            table_version_properties=resolved_tv_properties,
+            lifecycle_state=lifecycle_state or LifecycleState.ACTIVE,
+            supported_content_types=content_types,
+            *args,
+            **kwargs,
+        )
+
+        result = TableDefinition.of(
+            table=table,
+            table_version=table_version,
+            stream=stream,
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during create_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            create_transaction.seal()

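A hedged example of the updated `create_table` call (the namespace, table name, and schema are illustrative; `Schema.of(schema=...)` over a PyArrow schema is used the same way as elsewhere in this release, and the top-level `dc.create_table` export is assumed):

```python
# Illustrative create_table call against the signature above; names are examples
# and catalog initialization is assumed elsewhere.
import deltacat as dc
import pyarrow as pa
from deltacat import Schema  # assumed top-level re-export

table_def = dc.create_table(
    "orders",
    namespace="sales",
    schema=Schema.of(
        schema=pa.schema([("order_id", pa.int64()), ("amount", pa.float64())])
    ),
    table_description="Raw order events",
    fail_if_exists=False,  # return the existing table instead of raising
)
```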
 def drop_table(
-
+    table: str,
     *args,
     namespace: Optional[str] = None,
     table_version: Optional[str] = None,
     purge: bool = False,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """Drop a table from the catalog and optionally purges underlying data.

     Args:
-
+        table: Name of the table to drop.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional table version of the table to drop. If not specified, the parent table of all
+            table versions will be dropped.
         purge: If True, permanently delete the table data. If False, only remove from catalog.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -347,17 +2135,56 @@ def drop_table(
         raise NotImplementedError("Purge flag is not currently supported.")

     namespace = namespace or default_namespace()
-
-
-    )
+
+    # Set up transaction handling
+    drop_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = drop_transaction
+
+    try:
+        if not table_version:
+            _get_storage(**kwargs).delete_table(
+                namespace=namespace,
+                table_name=table,
+                purge=purge,
+                *args,
+                **kwargs,
+            )
+        else:
+            _get_storage(**kwargs).update_table_version(
+                namespace=namespace,
+                table_name=table,
+                table_version=table_version,
+                lifecycle_state=LifecycleState.DELETED,
+                *args,
+                **kwargs,
+            )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during drop_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            drop_transaction.seal()

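A hedged usage sketch of the version-aware drop above (names are examples): passing `table_version` marks only that version as DELETED, while omitting it drops the parent table.

```python
# Example of the version-aware drop above; "orders"/"sales"/"1" are illustrative.
import deltacat as dc

dc.drop_table("orders", namespace="sales", table_version="1")  # soft-deletes only version 1
dc.drop_table("orders", namespace="sales")                     # drops the parent table
```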
-def refresh_table(
+def refresh_table(
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> None:
     """Refresh metadata cached on the Ray cluster for the given table.

     Args:
         table: Name of the table to refresh.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to refresh.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -366,32 +2193,79 @@ def refresh_table(table: str, *args, namespace: Optional[str] = None, **kwargs)


 def list_tables(
-    *args,
+    *args,
+    namespace: Optional[str] = None,
+    table: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> ListResult[TableDefinition]:
     """List a page of table definitions.

     Args:
         namespace: Optional namespace to list tables from. Uses default namespace if not specified.
+        table: Optional table to list its table versions. If not specified, lists the latest active version of each table in the namespace.
+        transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.

     Returns:
         ListResult containing TableDefinition objects for tables in the namespace.
     """
     namespace = namespace or default_namespace()
-    tables = _get_storage(**kwargs).list_tables(*args, namespace=namespace, **kwargs)
-    table_definitions = [
-        get_table(*args, table.table_name, namespace, **kwargs)
-        for table in tables.all_items()
-    ]

-
+    # Set up transaction handling
+    list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = list_transaction
+
+    try:
+        if not table:
+            tables = _get_storage(**kwargs).list_tables(
+                namespace=namespace,
+                *args,
+                **kwargs,
+            )
+            table_definitions = [
+                get_table(table.table_name, namespace=namespace, *args, **kwargs)
+                for table in tables.all_items()
+            ]
+        else:
+            table_versions = _get_storage(**kwargs).list_table_versions(
+                namespace=namespace,
+                table_name=table,
+                *args,
+                **kwargs,
+            )
+            table_definitions = [
+                get_table(
+                    table,
+                    namespace=namespace,
+                    table_version=table_version.id,
+                    *args,
+                    **kwargs,
+                )
+                for table_version in table_versions.all_items()
+            ]
+
+        result = ListResult(items=table_definitions)
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during list_tables: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            list_transaction.seal()

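A hedged example of the two `list_tables` modes added above (the namespace and table names are illustrative, and `all_items()` is called on the returned ListResult as elsewhere in this diff):

```python
# Example of the two listing modes above; names are illustrative.
import deltacat as dc

all_tables = dc.list_tables(namespace="sales").all_items()            # latest active version of each table
order_versions = dc.list_tables(namespace="sales", table="orders").all_items()  # every version of one table
```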
|
 def get_table(
-    name: str,
+    table: str,
     *args,
     namespace: Optional[str] = None,
     table_version: Optional[str] = None,
     stream_format: StreamFormat = StreamFormat.DELTACAT,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> Optional[TableDefinition]:
     """Get table definition metadata.
@@ -399,64 +2273,84 @@ def get_table(
     Args:
         name: Name of the table to retrieve.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
-        table_version: Optional specific version of the table to retrieve.
-
-
-            format if not specified.
+        table_version: Optional specific version of the table to retrieve. Defaults to the latest active version.
+        stream_format: Optional stream format to retrieve. Defaults to DELTACAT.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
-        Deltacat TableDefinition if the table exists, None otherwise.
-
-
-        TableVersionNotFoundError: If the table version does not exist.
-        StreamNotFoundError: If the stream does not exist.
+        Deltacat TableDefinition if the table exists, None otherwise. The table definition's table version will be
+        None if the requested version is not found. The table definition's stream will be None if the requested stream
+        format is not found.
     """
     namespace = namespace or default_namespace()
-    table: Optional[Table] = _get_storage(**kwargs).get_table(
-        *args, table_name=name, namespace=namespace, **kwargs
-    )

-
-
-
-    table_version: Optional[TableVersion] = _get_storage(**kwargs).get_table_version(
-        *args, namespace, name, table_version or table.latest_table_version, **kwargs
-    )
+    # Set up transaction handling
+    get_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = get_transaction

-
-
-
+    try:
+        table_obj: Optional[Table] = _get_storage(**kwargs).get_table(
+            table_name=table,
+            namespace=namespace,
+            *args,
+            **kwargs,
         )

-
-
-        namespace=namespace,
-        table_name=name,
-        table_version=table_version.id,
-        stream_format=stream_format,
-        **kwargs,
-    )
+        if table_obj is None:
+            return None

-
-
-
+        table_version_obj: Optional[TableVersion] = _get_storage(
+            **kwargs
+        ).get_table_version(
+            namespace,
+            table,
+            table_version or table_obj.latest_active_table_version,
+            *args,
+            **kwargs,
         )

-
-
-
-
-
+        stream = None
+        if table_version_obj:
+            stream = _get_storage(**kwargs).get_stream(
+                namespace=namespace,
+                table_name=table,
+                table_version=table_version_obj.id,
+                stream_format=stream_format,
+                *args,
+                **kwargs,
+            )
+
+        return TableDefinition.of(
+            table=table_obj,
+            table_version=table_version_obj,
+            stream=stream,
+        )
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during get_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            get_transaction.seal()

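The contract change here is worth calling out: `get_table` no longer raises `TableVersionNotFoundError` or `StreamNotFoundError`; it returns `None` for a missing table, and otherwise a `TableDefinition` whose `table_version` or `stream` field is `None` when the requested version or stream format is absent. A defensive-handling sketch, assuming an initialized catalog and that `TableDefinition` exposes the fields it is constructed from above:

```python
# Sketch only: hypothetical names.
from deltacat.catalog.main import impl as catalog

definition = catalog.get_table("events", namespace="analytics")
if definition is None:
    print("table does not exist")
elif definition.table_version is None:
    print("table exists, but no matching table version was found")
elif definition.stream is None:
    print("table version exists, but no stream in the requested format was found")
else:
    print(f"resolved table version {definition.table_version.id}")
```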
 def truncate_table(
-    table: str,
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> None:
     """Truncate table data.

     Args:
         table: Name of the table to truncate.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to truncate. Defaults to the latest active version.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -465,7 +2359,12 @@ def truncate_table(

 def rename_table(
-    table: str,
+    table: str,
+    new_name: str,
+    *args,
+    namespace: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> None:
     """Rename an existing table.

@@ -473,6 +2372,7 @@ def rename_table(
         table: Current name of the table.
         new_name: New name for the table.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -481,71 +2381,219 @@ def rename_table(
         TableNotFoundError: If the table does not exist.
     """
     namespace = namespace or default_namespace()
-    _get_storage(**kwargs).update_table(
-        *args, table_name=table, new_table_name=new_name, namespace=namespace, **kwargs
-    )

+    # Set up transaction handling
+    rename_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = rename_transaction
+
+    try:
+        _get_storage(**kwargs).update_table(
+            table_name=table,
+            new_table_name=new_name,
+            namespace=namespace,
+            *args,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during rename_table: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            rename_transaction.seal()

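Every function in this module now threads an optional `transaction` through `setup_transaction`, which is what makes multi-call atomic workflows possible: when the caller passes its own interactive transaction, the call joins it instead of creating and sealing a private one, and the caller seals once at the end. A sketch of that pattern, assuming a caller-supplied transaction is left unsealed by the callee (as the `commit_transaction` flag above suggests) and that reads joining the transaction see its uncommitted changes, as the read-path docstrings state:

```python
# Sketch only: hypothetical names; `txn` is an interactive deltacat Transaction
# created by the caller (construction is outside this diff).
from deltacat.catalog.main import impl as catalog

def rename_and_verify(txn) -> None:
    catalog.rename_table("events", "events_v2", namespace="analytics", transaction=txn)
    # A read joining the same transaction should already observe the rename.
    assert catalog.table_exists("events_v2", namespace="analytics", transaction=txn)
    # Sealing commits both operations atomically; nothing is visible before this.
    txn.seal()
```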
-
+
+def table_exists(
+    table: str,
+    *args,
+    namespace: Optional[str] = None,
+    table_version: Optional[str] = None,
+    stream_format: StreamFormat = StreamFormat.DELTACAT,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> bool:
     """Check if a table exists in the catalog.

     Args:
         table: Name of the table to check.
         namespace: Optional namespace of the table. Uses default namespace if not specified.
+        table_version: Optional specific version of the table to check. Defaults to the latest active version.
+        stream_format: Optional stream format to check. Defaults to DELTACAT.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         True if the table exists, False otherwise.
     """
     namespace = namespace or default_namespace()
-    return _get_storage(**kwargs).table_exists(
-        *args, table_name=table, namespace=namespace, **kwargs
-    )

+    # Set up transaction handling
+    exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = exists_transaction

-
+    try:
+        table_obj = _get_storage(**kwargs).get_table(
+            namespace=namespace,
+            table_name=table,
+            *args,
+            **kwargs,
+        )
+        if table_obj is None:
+            return False
+        table_version = table_version or table_obj.latest_active_table_version
+        if not table_version:
+            return False
+        table_version_exists = _get_storage(**kwargs).table_version_exists(
+            namespace,
+            table,
+            table_version,
+            *args,
+            **kwargs,
+        )
+        if not table_version_exists:
+            return False
+        stream_exists = _get_storage(**kwargs).stream_exists(
+            namespace=namespace,
+            table_name=table,
+            table_version=table_version,
+            stream_format=stream_format,
+            *args,
+            **kwargs,
+        )
+        return stream_exists
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during table_exists: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            exists_transaction.seal()
+
+
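`table_exists` is no longer a single storage call: it now resolves the table, then the given (or latest active) table version, then a stream in the requested format, short-circuiting to `False` at the first missing piece. A usage sketch, assuming an initialized catalog and hypothetical names; with the defaults it checks the latest active version and the DELTACAT stream format:

```python
# Sketch only: hypothetical names.
from deltacat.catalog.main import impl as catalog

if not catalog.table_exists("events", namespace="analytics"):
    print("missing table, or no active version, or no DeltaCAT stream for it")
```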
+def list_namespaces(
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> ListResult[Namespace]:
     """List a page of table namespaces.

     Args:
-
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         ListResult containing Namespace objects.
     """
-
+    # Set up transaction handling
+    list_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = list_transaction
+
+    try:
+        result = _get_storage(**kwargs).list_namespaces(*args, **kwargs)
+
+        return result

+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during list_namespaces: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            list_transaction.seal()

-
+
+def get_namespace(
+    namespace: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> Optional[Namespace]:
     """Get metadata for a specific table namespace.

     Args:
         namespace: Name of the namespace to retrieve.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         Namespace object if the namespace exists, None otherwise.
     """
-
+    # Set up transaction handling
+    get_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = get_ns_transaction
+
+    try:
+        result = _get_storage(**kwargs).get_namespace(
+            *args, namespace=namespace, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during get_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            get_ns_transaction.seal()

-def namespace_exists(
+def namespace_exists(
+    namespace: str,
+    *args,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> bool:
     """Check if a namespace exists.

     Args:
         namespace: Name of the namespace to check.
+        transaction: Optional transaction to use for reading. If provided, will see uncommitted changes.

     Returns:
         True if the namespace exists, False otherwise.
     """
-
+    # Set up transaction handling
+    exists_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = exists_transaction
+
+    try:
+        result = _get_storage(**kwargs).namespace_exists(
+            *args, namespace=namespace, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during namespace_exists: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            exists_transaction.seal()

 def create_namespace(
-    namespace: str,
+    namespace: str,
+    *args,
+    properties: Optional[NamespaceProperties] = None,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
 ) -> Namespace:
     """Create a new namespace.

     Args:
         namespace: Name of the namespace to create.
         properties: Optional properties for the namespace.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         Created Namespace object.
@@ -553,12 +2601,29 @@ def create_namespace(
     Raises:
         NamespaceAlreadyExistsError: If the namespace already exists.
     """
-
-
+    # Set up transaction handling
+    namespace_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = namespace_transaction

-
-
-
+    try:
+        if namespace_exists(namespace, **kwargs):
+            raise NamespaceAlreadyExistsError(f"Namespace {namespace} already exists")
+
+        result = _get_storage(**kwargs).create_namespace(
+            *args, namespace=namespace, properties=properties, **kwargs
+        )
+
+        return result
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during create_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            namespace_transaction.seal()

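`create_namespace` now pre-checks `namespace_exists` inside the same transaction and raises `NamespaceAlreadyExistsError` before touching storage. A small get-or-create sketch, assuming an initialized catalog; the `deltacat.exceptions` import path for the error class is an assumption:

```python
# Sketch only: "analytics" is a hypothetical namespace name.
from deltacat.catalog.main import impl as catalog
from deltacat.exceptions import NamespaceAlreadyExistsError  # assumed import path

try:
    ns = catalog.create_namespace("analytics")
except NamespaceAlreadyExistsError:
    ns = catalog.get_namespace("analytics")
```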
 def alter_namespace(
@@ -566,6 +2631,7 @@ def alter_namespace(
     *args,
     properties: Optional[NamespaceProperties] = None,
     new_namespace: Optional[str] = None,
+    transaction: Optional[Transaction] = None,
     **kwargs,
 ) -> None:
     """Alter a namespace definition.
@@ -574,26 +2640,49 @@ def alter_namespace(
         namespace: Name of the namespace to alter.
         properties: Optional new properties for the namespace.
         new_namespace: Optional new name for the namespace.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
     """
-
-
-
-
-
-        **kwargs
-
+    # Set up transaction handling
+    alter_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = alter_ns_transaction
+
+    try:
+        _get_storage(**kwargs).update_namespace(
+            namespace=namespace,
+            properties=properties,
+            new_namespace=new_namespace,
+            *args,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during alter_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            alter_ns_transaction.seal()

-def drop_namespace(
+def drop_namespace(
+    namespace: str,
+    *args,
+    purge: bool = False,
+    transaction: Optional[Transaction] = None,
+    **kwargs,
+) -> None:
     """Drop a namespace and all of its tables from the catalog.

     Args:
         namespace: Name of the namespace to drop.
-        purge: If True, permanently delete all
-            If False, only
+        purge: If True, permanently delete all table data in the namespace.
+            If False, only removes the namespace from the catalog.
+        transaction: Optional transaction to use. If None, creates a new transaction.

     Returns:
         None
@@ -603,50 +2692,39 @@ def drop_namespace(namespace: str, *args, purge: bool = False, **kwargs) -> None
     if purge:
         raise NotImplementedError("Purge flag is not currently supported.")

-
-
-
+    # Set up transaction handling
+    drop_ns_transaction, commit_transaction = setup_transaction(transaction, **kwargs)
+    kwargs["transaction"] = drop_ns_transaction
+
+    try:
+        _get_storage(**kwargs).delete_namespace(
+            *args,
+            namespace=namespace,
+            purge=purge,
+            **kwargs,
+        )
+
+    except Exception as e:
+        # If any error occurs, the transaction remains uncommitted
+        commit_transaction = False
+        logger.error(f"Error during drop_namespace: {e}")
+        raise
+    finally:
+        if commit_transaction:
+            # Seal the interactive transaction to commit all operations atomically
+            drop_ns_transaction.seal()

 def default_namespace(*args, **kwargs) -> str:
     """Return the default namespace for the catalog.

     Returns:
-
+        Name of the default namespace.
     """
-    return DEFAULT_NAMESPACE
-
-
-def _validate_read_table_args(
-    namespace: Optional[str] = None,
-    table_type: Optional[TableType] = None,
-    distributed_dataset_type: Optional[DistributedDatasetType] = None,
-    merge_on_read: Optional[bool] = None,
-    **kwargs,
-):
-    storage = _get_storage(**kwargs)
-    if storage is None:
-        raise ValueError(
-            "Catalog not initialized. Did you miss calling "
-            "initialize(ds=<deltacat_storage>)?"
-        )
-
-    if merge_on_read:
-        raise ValueError("Merge on read not supported currently.")
-
-    if table_type is not TableType.PYARROW:
-        raise ValueError("Only PYARROW table type is supported as of now")
-
-    if distributed_dataset_type is not DistributedDatasetType.DAFT:
-        raise ValueError("Only DAFT dataset type is supported as of now")
+    return DEFAULT_NAMESPACE

-    if namespace is None:
-        raise ValueError(
-            "namespace must be passed to uniquely identify a table in the catalog."
-        )

-
-def _get_latest_or_given_table_version(
+def _get_latest_active_or_given_table_version(
     namespace: str,
     table_name: str,
     table_version: Optional[str] = None,
@@ -655,9 +2733,16 @@ def _get_latest_or_given_table_version(
 ) -> TableVersion:
     table_version_obj = None
     if table_version is None:
-        table_version_obj = _get_storage(**kwargs).
-            namespace=namespace,
+        table_version_obj = _get_storage(**kwargs).get_latest_active_table_version(
+            namespace=namespace,
+            table_name=table_name,
+            *args,
+            **kwargs,
         )
+        if table_version_obj is None:
+            raise TableVersionNotFoundError(
+                f"No active table version found for table {namespace}.{table_name}"
+            )
         table_version = table_version_obj.table_version
     else:
         table_version_obj = _get_storage(**kwargs).get_table_version(
@@ -671,18 +2756,82 @@ def _get_latest_or_given_table_version(
     return table_version_obj


+def _get_all_committed_partitions(
+    table: str,
+    namespace: str,
+    table_version: str,
+    **kwargs,
+) -> List[Union[Partition, PartitionLocator]]:
+    """Get all committed partitions for a table and validate uniqueness."""
+    logger.info(
+        f"Reading all partitions metadata in the table={table} "
+        "as partition_filter was None."
+    )
+
+    all_partitions = (
+        _get_storage(**kwargs)
+        .list_partitions(
+            table_name=table,
+            namespace=namespace,
+            table_version=table_version,
+            **kwargs,
+        )
+        .all_items()
+    )
+
+    committed_partitions = [
+        partition
+        for partition in all_partitions
+        if partition.state == CommitState.COMMITTED
+    ]
+
+    logger.info(
+        f"Found {len(committed_partitions)} committed partitions for "
+        f"table={namespace}/{table}/{table_version}"
+    )
+
+    _validate_partition_uniqueness(
+        committed_partitions, namespace, table, table_version
+    )
+    return committed_partitions
+
+
+def _validate_partition_uniqueness(
+    partitions: List[Partition], namespace: str, table: str, table_version: str
+) -> None:
+    """Validate that there are no duplicate committed partitions for the same partition values."""
+    commit_count_per_partition_value = defaultdict(int)
+    for partition in partitions:
+        # Normalize partition values: both None and [] represent unpartitioned data
+        normalized_values = (
+            None
+            if (
+                partition.partition_values is None
+                or (
+                    isinstance(partition.partition_values, list)
+                    and len(partition.partition_values) == 0
+                )
+            )
+            else partition.partition_values
+        )
+        commit_count_per_partition_value[normalized_values] += 1
+
+    # Check for multiple committed partitions for the same partition values
+    for partition_values, commit_count in commit_count_per_partition_value.items():
+        if commit_count > 1:
+            raise RuntimeError(
+                f"Multiple committed partitions found for table={namespace}/{table}/{table_version}. "
+                f"Partition values: {partition_values}. Commit count: {commit_count}. "
+                f"This should not happen."
+            )
+
+
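The duplicate-partition check above hinges on one normalization rule: `None` and an empty list both denote unpartitioned data, so they must collapse to the same key before commits are counted per partition value. A standalone illustration of that rule (the helper keys its `defaultdict` on the raw partition values; the tuple conversion below is only so this sketch has hashable keys):

```python
from collections import defaultdict

def normalize(partition_values):
    # None and [] both mean "unpartitioned"; collapse them to a single key.
    if partition_values is None or (
        isinstance(partition_values, (list, tuple)) and len(partition_values) == 0
    ):
        return None
    # Tuples are used here purely so the value can serve as a dict key.
    return tuple(partition_values)

commit_counts = defaultdict(int)
for values in [None, [], ["2024-01-01"]]:  # hypothetical partition values
    commit_counts[normalize(values)] += 1

assert commit_counts[None] == 2              # None and [] collide, as intended
assert commit_counts[("2024-01-01",)] == 1   # a real partition value stands alone
```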
 def _get_deltas_from_partition_filter(
     partition_filter: Optional[List[Union[Partition, PartitionLocator]]] = None,
-    stream_position_range_inclusive: Optional[Tuple[int, int]] = None,
     *args,
     **kwargs,
 ):
-
     result_deltas = []
-    start_stream_position, end_stream_position = stream_position_range_inclusive or (
-        None,
-        None,
-    )
     for partition_like in partition_filter:
         deltas = (
             _get_storage(**kwargs)
@@ -690,26 +2839,33 @@ def _get_deltas_from_partition_filter(
                 partition_like=partition_like,
                 ascending_order=True,
                 include_manifest=True,
-                start_stream_position=start_stream_position,
-                last_stream_position=end_stream_position,
                 *args,
                 **kwargs,
             )
             .all_items()
         )

-
-
-
-
-
-
-
-
-
-
-
-
+        # Validate that all qualified deltas are append type - merge-on-read not yet implemented
+        # TODO(pdames): Run compaction minus materialize for MoR of each partition.
+        if deltas:
+            non_append_deltas = []
+            for delta in deltas:
+                if delta.type != DeltaType.APPEND:
+                    non_append_deltas.append(delta)
+                else:
+                    result_deltas.append(delta)
+            if non_append_deltas:
+                delta_types = {delta.type for delta in non_append_deltas}
+                delta_info = [
+                    (str(delta.locator), delta.type) for delta in non_append_deltas[:5]
+                ]  # Show first 5
+                raise NotImplementedError(
+                    f"Merge-on-read is not yet implemented. Found {len(non_append_deltas)} non-append deltas "
+                    f"with types {delta_types}. All deltas must be APPEND type for read operations. "
+                    f"Examples: {delta_info}. Please run compaction first to merge non-append deltas."
+                )
+
+            logger.info(f"Validated {len(deltas)} qualified deltas are all APPEND type")
     return result_deltas
