deltacat-2.0-py3-none-any.whl → deltacat-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/storage/model/schema.py
CHANGED
@@ -3,21 +3,45 @@ from __future__ import annotations
 
 import logging
 import copy
+import base64
 
 import msgpack
-from typing import Optional, Any, Dict, Union, List, Callable, Tuple
+from typing import Optional, Any, Dict, Union, List, Callable, Tuple, TYPE_CHECKING
 
 import pyarrow as pa
 from pyarrow import ArrowInvalid
+import pandas as pd
+import numpy as np
+
+# Daft DataFrame support - required for core functionality
+import daft
+from daft import DataFrame as DaftDataFrame
 
 from deltacat.constants import BYTES_PER_KIBIBYTE
+from deltacat.exceptions import (
+    SchemaCompatibilityError,
+    SchemaValidationError,
+)
 from deltacat.storage.model.types import (
     SchemaConsistencyType,
     SortOrder,
     NullOrder,
 )
+from deltacat.types.tables import (
+    get_table_length,
+    to_pyarrow,
+    from_pyarrow,
+    get_dataset_type,
+    SchemaEvolutionMode,
+)
+from deltacat.types.media import DatasetType
+
+if TYPE_CHECKING:
+    from deltacat.storage.model.sort_key import SortKey
+
 from deltacat import logs
 
+
 # PyArrow Field Metadata Key used to set the Field ID when writing to Parquet.
 # See: https://arrow.apache.org/docs/cpp/parquet.html#parquet-field-id
 PARQUET_FIELD_ID_KEY_NAME = b"PARQUET:field_id"
@@ -53,6 +77,52 @@ SUBSCHEMAS_KEY_NAME = b"DELTACAT:subschemas"
 # Apache Iceberg, which sets aside this range for reserved fields
 MAX_FIELD_ID_EXCLUSIVE = 2147483447
 
+
+def _encode_metadata_value(value: Any) -> bytes:
+    """
+    Encode a value for storage in PyArrow field metadata.
+
+    Uses msgpack for efficient serialization, then base64 encoding to ensure
+    UTF-8 compatibility with all Parquet readers (Polars, Daft, etc.).
+
+    Args:
+        value: The value to encode
+
+    Returns:
+        Base64-encoded msgpack bytes that are UTF-8 safe
+    """
+    msgpack_bytes = msgpack.dumps(value)
+    return base64.b64encode(msgpack_bytes)
+
+
+def _decode_metadata_value(encoded_bytes: bytes) -> Any:
+    """
+    Decode a value from PyArrow field metadata.
+
+    Handles both new base64-encoded format and legacy raw msgpack format
+    for backward compatibility.
+
+    Args:
+        encoded_bytes: The encoded bytes from field metadata
+
+    Returns:
+        The decoded value
+
+    Raises:
+        ValueError: If the data cannot be decoded
+    """
+    try:
+        # Try new base64-encoded format first
+        msgpack_bytes = base64.b64decode(encoded_bytes)
+        return msgpack.loads(msgpack_bytes)
+    except Exception:
+        try:
+            # Fall back to legacy raw msgpack format
+            return msgpack.loads(encoded_bytes)
+        except Exception as e:
+            raise ValueError(f"Failed to decode metadata value: {e}") from e
+
+
 # Default name assigned to the base, unnamed single schema when a new named
 # subschema is first added.
 BASE_SCHEMA_NAME = "_base"
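The helpers added above define the new on-disk convention for DeltaCAT field metadata: values are msgpack-serialized and then base64-encoded so the stored bytes stay UTF-8 safe for Parquet readers, with a raw-msgpack fallback for metadata written by older versions. A minimal standalone sketch of that round trip (illustrative only; it assumes nothing beyond the msgpack package and mirrors, rather than imports, the private helpers shown above):

    import base64
    import msgpack

    def encode_metadata_value(value):
        # msgpack-serialize, then base64-encode so the bytes stay UTF-8 safe
        return base64.b64encode(msgpack.dumps(value))

    def decode_metadata_value(encoded):
        try:
            # new format: base64-wrapped msgpack
            return msgpack.loads(base64.b64decode(encoded))
        except Exception:
            # legacy format: raw msgpack bytes
            return msgpack.loads(encoded)

    value = {"ordering": ["ascending", "at_end"]}
    assert decode_metadata_value(encode_metadata_value(value)) == value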
@@ -64,6 +134,86 @@ FieldName = str
|
|
64
134
|
NestedFieldName = List[str]
|
65
135
|
FieldLocator = Union[FieldName, NestedFieldName, FieldId]
|
66
136
|
|
137
|
+
|
138
|
+
class SchemaUpdateOperation(tuple):
|
139
|
+
"""
|
140
|
+
Represents a single schema update operation (add, remove, or update field).
|
141
|
+
|
142
|
+
This class inherits from tuple and stores:
|
143
|
+
- operation: str ("add", "remove", "update")
|
144
|
+
- field_locator: Optional[FieldLocator] (name, path, or ID)
|
145
|
+
- field: Optional[Field] (the field data for add/update operations)
|
146
|
+
"""
|
147
|
+
|
148
|
+
@staticmethod
|
149
|
+
def add_field(field: Field) -> SchemaUpdateOperation:
|
150
|
+
"""Create an operation to add a new field."""
|
151
|
+
return SchemaUpdateOperation(("add", None, field))
|
152
|
+
|
153
|
+
@staticmethod
|
154
|
+
def remove_field(field_locator: FieldLocator) -> SchemaUpdateOperation:
|
155
|
+
"""Create an operation to remove an existing field."""
|
156
|
+
return SchemaUpdateOperation(("remove", field_locator, None))
|
157
|
+
|
158
|
+
@staticmethod
|
159
|
+
def update_field(
|
160
|
+
field_locator: FieldLocator, field: Field
|
161
|
+
) -> SchemaUpdateOperation:
|
162
|
+
"""Create an operation to update an existing field."""
|
163
|
+
return SchemaUpdateOperation(("update", field_locator, field))
|
164
|
+
|
165
|
+
@property
|
166
|
+
def operation(self) -> str:
|
167
|
+
"""The operation type: 'add', 'remove', or 'update'."""
|
168
|
+
return self[0]
|
169
|
+
|
170
|
+
@property
|
171
|
+
def field_locator(self) -> Optional[FieldLocator]:
|
172
|
+
"""The field locator (name, path, or ID)."""
|
173
|
+
return self[1]
|
174
|
+
|
175
|
+
@property
|
176
|
+
def field(self) -> Optional[Field]:
|
177
|
+
"""The field data (None for remove operations)."""
|
178
|
+
return self[2]
|
179
|
+
|
180
|
+
def field_locator_matches(self, other_locator: FieldLocator) -> bool:
|
181
|
+
"""Check if this operation's field_locator matches the given field_locator."""
|
182
|
+
return SchemaUpdate._field_locators_match(self.field_locator, other_locator)
|
183
|
+
|
184
|
+
|
185
|
+
class SchemaUpdateOperations(List[SchemaUpdateOperation]):
|
186
|
+
"""
|
187
|
+
A list of schema update operations that can be applied to a schema.
|
188
|
+
|
189
|
+
This class inherits from List[SchemaUpdateOperation] and provides convenience
|
190
|
+
methods for creating and managing schema update operations.
|
191
|
+
"""
|
192
|
+
|
193
|
+
@staticmethod
|
194
|
+
def of(operations: List[SchemaUpdateOperation]) -> SchemaUpdateOperations:
|
195
|
+
"""Create a SchemaUpdateOperations list from a list of operations."""
|
196
|
+
typed_operations = SchemaUpdateOperations()
|
197
|
+
for operation in operations:
|
198
|
+
if operation is not None and not isinstance(
|
199
|
+
operation, SchemaUpdateOperation
|
200
|
+
):
|
201
|
+
operation = SchemaUpdateOperation(operation)
|
202
|
+
typed_operations.append(operation)
|
203
|
+
return typed_operations
|
204
|
+
|
205
|
+
def __getitem__(self, item):
|
206
|
+
"""Override to ensure items are properly typed as SchemaUpdateOperation."""
|
207
|
+
val = super().__getitem__(item)
|
208
|
+
if val is not None and not isinstance(val, SchemaUpdateOperation):
|
209
|
+
self[item] = val = SchemaUpdateOperation(val)
|
210
|
+
return val
|
211
|
+
|
212
|
+
def __iter__(self):
|
213
|
+
for i in range(len(self)):
|
214
|
+
yield self[i] # This triggers __getitem__ conversion
|
215
|
+
|
216
|
+
|
67
217
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
68
218
|
|
69
219
|
|
@@ -224,6 +374,10 @@ class Field(dict):
|
|
224
374
|
def merge_order(self) -> Optional[MergeOrder]:
|
225
375
|
return Field._merge_order(self.arrow)
|
226
376
|
|
377
|
+
@property
|
378
|
+
def is_event_time(self) -> Optional[bool]:
|
379
|
+
return Field._is_event_time(self.arrow)
|
380
|
+
|
227
381
|
@property
|
228
382
|
def doc(self) -> Optional[str]:
|
229
383
|
return Field._doc(self.arrow)
|
@@ -273,7 +427,7 @@ class Field(dict):
|
|
273
427
|
merge_order = None
|
274
428
|
if field.metadata:
|
275
429
|
bytes_val = field.metadata.get(FIELD_MERGE_ORDER_KEY_NAME)
|
276
|
-
merge_order =
|
430
|
+
merge_order = _decode_metadata_value(bytes_val) if bytes_val else None
|
277
431
|
return merge_order
|
278
432
|
|
279
433
|
@staticmethod
|
@@ -289,7 +443,7 @@ class Field(dict):
|
|
289
443
|
default = None
|
290
444
|
if field.metadata:
|
291
445
|
bytes_val = field.metadata.get(FIELD_PAST_DEFAULT_KEY_NAME)
|
292
|
-
default =
|
446
|
+
default = _decode_metadata_value(bytes_val) if bytes_val else None
|
293
447
|
return default
|
294
448
|
|
295
449
|
@staticmethod
|
@@ -297,7 +451,7 @@ class Field(dict):
|
|
297
451
|
default = None
|
298
452
|
if field.metadata:
|
299
453
|
bytes_val = field.metadata.get(FIELD_FUTURE_DEFAULT_KEY_NAME)
|
300
|
-
default =
|
454
|
+
default = _decode_metadata_value(bytes_val) if bytes_val else None
|
301
455
|
return default
|
302
456
|
|
303
457
|
@staticmethod
|
@@ -309,19 +463,53 @@ class Field(dict):
|
|
309
463
|
return t
|
310
464
|
|
311
465
|
@staticmethod
|
312
|
-
def _validate_merge_key(
|
313
|
-
|
314
|
-
|
466
|
+
def _validate_merge_key(
|
467
|
+
field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
|
468
|
+
):
|
469
|
+
# Note: large_strings were explicitly allowed for compatibility with PyIceberg Iceberg Schema to PyArrow converter
|
470
|
+
if not (
|
471
|
+
pa.types.is_string(field.type)
|
472
|
+
or pa.types.is_primitive(field.type)
|
473
|
+
or pa.types.is_large_string(field.type)
|
474
|
+
):
|
475
|
+
raise ValueError(
|
476
|
+
f"Merge key {field} must be a primitive type or large string."
|
477
|
+
)
|
478
|
+
|
479
|
+
# Merge key fields must have VALIDATE consistency type to prevent type promotion
|
480
|
+
if (
|
481
|
+
consistency_type is not None
|
482
|
+
and consistency_type != SchemaConsistencyType.VALIDATE
|
483
|
+
):
|
484
|
+
raise ValueError(
|
485
|
+
f"Merge key field '{field.name}' must have VALIDATE consistency type, "
|
486
|
+
f"got {consistency_type}. Type promotion is not allowed for merge keys."
|
487
|
+
)
|
488
|
+
|
315
489
|
if pa.types.is_floating(field.type):
|
316
490
|
raise ValueError(f"Merge key {field} cannot be floating point.")
|
317
491
|
|
318
492
|
@staticmethod
|
319
|
-
def _validate_merge_order(
|
493
|
+
def _validate_merge_order(
|
494
|
+
field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
|
495
|
+
):
|
320
496
|
if not pa.types.is_primitive(field.type):
|
321
497
|
raise ValueError(f"Merge order {field} must be a primitive type.")
|
322
498
|
|
499
|
+
# Merge order fields must have VALIDATE consistency type to prevent type promotion
|
500
|
+
if (
|
501
|
+
consistency_type is not None
|
502
|
+
and consistency_type != SchemaConsistencyType.VALIDATE
|
503
|
+
):
|
504
|
+
raise ValueError(
|
505
|
+
f"Merge order field '{field.name}' must have VALIDATE consistency type, "
|
506
|
+
f"got {consistency_type}. Type promotion is not allowed for merge order fields."
|
507
|
+
)
|
508
|
+
|
323
509
|
@staticmethod
|
324
|
-
def _validate_event_time(
|
510
|
+
def _validate_event_time(
|
511
|
+
field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
|
512
|
+
):
|
325
513
|
if (
|
326
514
|
not pa.types.is_integer(field.type)
|
327
515
|
and not pa.types.is_floating(field.type)
|
@@ -329,6 +517,16 @@ class Field(dict):
|
|
329
517
|
):
|
330
518
|
raise ValueError(f"Event time {field} must be numeric or date type.")
|
331
519
|
|
520
|
+
# Event time fields must have VALIDATE consistency type to prevent type promotion
|
521
|
+
if (
|
522
|
+
consistency_type is not None
|
523
|
+
and consistency_type != SchemaConsistencyType.VALIDATE
|
524
|
+
):
|
525
|
+
raise ValueError(
|
526
|
+
f"Event time field '{field.name}' must have VALIDATE consistency type, "
|
527
|
+
f"got {consistency_type}. Type promotion is not allowed for event time fields."
|
528
|
+
)
|
529
|
+
|
332
530
|
@staticmethod
|
333
531
|
def _validate_default(
|
334
532
|
default: Optional[Any],
|
@@ -354,22 +552,31 @@ class Field(dict):
|
|
354
552
|
future_default: Optional[Any],
|
355
553
|
consistency_type: Optional[SchemaConsistencyType],
|
356
554
|
) -> pa.Field:
|
555
|
+
# Auto-set future_default to past_default if past_default exists but future_default doesn't
|
556
|
+
if past_default is not None and future_default is None:
|
557
|
+
future_default = past_default
|
558
|
+
|
559
|
+
# Default critical columns (merge key, merge order, event time) to VALIDATE consistency type
|
560
|
+
# to prevent type promotion which could break merge semantics
|
561
|
+
if consistency_type is None and (is_merge_key or merge_order or is_event_time):
|
562
|
+
consistency_type = SchemaConsistencyType.VALIDATE
|
563
|
+
|
357
564
|
meta = {}
|
358
565
|
if is_merge_key:
|
359
|
-
Field._validate_merge_key(field)
|
566
|
+
Field._validate_merge_key(field, consistency_type)
|
360
567
|
meta[FIELD_MERGE_KEY_NAME] = str(is_merge_key)
|
361
568
|
if merge_order:
|
362
|
-
Field._validate_merge_order(field)
|
363
|
-
meta[FIELD_MERGE_ORDER_KEY_NAME] =
|
569
|
+
Field._validate_merge_order(field, consistency_type)
|
570
|
+
meta[FIELD_MERGE_ORDER_KEY_NAME] = _encode_metadata_value(merge_order)
|
364
571
|
if is_event_time:
|
365
|
-
Field._validate_event_time(field)
|
572
|
+
Field._validate_event_time(field, consistency_type)
|
366
573
|
meta[FIELD_EVENT_TIME_KEY_NAME] = str(is_event_time)
|
367
574
|
if past_default is not None:
|
368
575
|
Field._validate_default(past_default, field)
|
369
|
-
meta[FIELD_PAST_DEFAULT_KEY_NAME] =
|
576
|
+
meta[FIELD_PAST_DEFAULT_KEY_NAME] = _encode_metadata_value(past_default)
|
370
577
|
if future_default is not None:
|
371
578
|
Field._validate_default(future_default, field)
|
372
|
-
meta[FIELD_FUTURE_DEFAULT_KEY_NAME] =
|
579
|
+
meta[FIELD_FUTURE_DEFAULT_KEY_NAME] = _encode_metadata_value(future_default)
|
373
580
|
if field_id is not None:
|
374
581
|
meta[PARQUET_FIELD_ID_KEY_NAME] = str(field_id)
|
375
582
|
if doc is not None:
|
@@ -383,6 +590,217 @@ class Field(dict):
|
|
383
590
|
metadata=meta,
|
384
591
|
)
|
385
592
|
|
593
|
+
def validate(
|
594
|
+
self,
|
595
|
+
column_type: pa.DataType,
|
596
|
+
) -> None:
|
597
|
+
"""Validate that data in a column matches this field's type and constraints.
|
598
|
+
|
599
|
+
Args:
|
600
|
+
column_datatype: PyArrow DataType containing the column data to validate
|
601
|
+
|
602
|
+
Raises:
|
603
|
+
ValueError: If data doesn't match field requirements.
|
604
|
+
"""
|
605
|
+
# Check if the data type matches the field type
|
606
|
+
if not column_type.equals(self.arrow.type):
|
607
|
+
raise SchemaValidationError(
|
608
|
+
f"Data type mismatch for field '{self.arrow.name}': "
|
609
|
+
f"expected {self.arrow.type}, got {column_type}"
|
610
|
+
)
|
611
|
+
|
612
|
+
def coerce(
|
613
|
+
self,
|
614
|
+
column_data: pa.Array,
|
615
|
+
) -> pa.Array:
|
616
|
+
"""Coerce data in a column to match this field's type.
|
617
|
+
|
618
|
+
Args:
|
619
|
+
column_data: PyArrow Array containing the column data to coerce
|
620
|
+
|
621
|
+
Returns:
|
622
|
+
pa.Array: Coerced data matching this field's type
|
623
|
+
|
624
|
+
Raises:
|
625
|
+
ValueError: If data cannot be coerced to the field type
|
626
|
+
"""
|
627
|
+
try:
|
628
|
+
return pa.compute.cast(column_data, self.arrow.type)
|
629
|
+
except (pa.ArrowTypeError, pa.ArrowInvalid) as e:
|
630
|
+
raise SchemaValidationError(
|
631
|
+
f"Cannot coerce data for field '{self.arrow.name}' "
|
632
|
+
f"from {column_data.type} to {self.arrow.type}: {e}"
|
633
|
+
)
|
634
|
+
|
635
|
+
def coerce_daft(
|
636
|
+
self,
|
637
|
+
dataframe: DaftDataFrame,
|
638
|
+
column_name: str,
|
639
|
+
target_type: Optional[pa.DataType] = None,
|
640
|
+
) -> DaftDataFrame:
|
641
|
+
"""Coerce a Daft DataFrame column to match this field's type.
|
642
|
+
|
643
|
+
Args:
|
644
|
+
dataframe: Daft DataFrame containing the column to coerce
|
645
|
+
column_name: Name of the column to coerce
|
646
|
+
target_type: Optional target type to coerce to (defaults to self.arrow.type)
|
647
|
+
|
648
|
+
Returns:
|
649
|
+
DaftDataFrame: DataFrame with the coerced column
|
650
|
+
|
651
|
+
Raises:
|
652
|
+
SchemaValidationError: If data cannot be coerced to the field type
|
653
|
+
"""
|
654
|
+
target_arrow_type = target_type or self.arrow.type
|
655
|
+
target_daft_type = daft.DataType.from_arrow_type(target_arrow_type)
|
656
|
+
|
657
|
+
try:
|
658
|
+
# Use Daft's cast expression to coerce the column
|
659
|
+
coerced_dataframe = dataframe.with_column(
|
660
|
+
column_name, daft.col(column_name).cast(target_daft_type)
|
661
|
+
)
|
662
|
+
return coerced_dataframe
|
663
|
+
except Exception as e:
|
664
|
+
raise SchemaValidationError(
|
665
|
+
f"Cannot coerce Daft column '{column_name}' for field '{self.arrow.name}' "
|
666
|
+
f"to type {target_arrow_type}: {e}"
|
667
|
+
)
|
668
|
+
|
669
|
+
def promote_type_if_needed(
|
670
|
+
self,
|
671
|
+
column_data: pa.Array,
|
672
|
+
) -> Tuple[pa.Array, bool]:
|
673
|
+
"""Promote field type to accommodate new data when consistency type is NONE.
|
674
|
+
Use PyArrow's unify_schemas to find the most permissive type that can accommodate both
|
675
|
+
the current and new data types.
|
676
|
+
|
677
|
+
Args:
|
678
|
+
column_data: PyArrow Array containing the column data
|
679
|
+
|
680
|
+
Returns:
|
681
|
+
Tuple[pa.Array, bool]: (data, type_was_promoted)
|
682
|
+
- data: Either original data or data cast to promoted type
|
683
|
+
- type_was_promoted: True if field type should be updated
|
684
|
+
|
685
|
+
Raises:
|
686
|
+
SchemaValidationError: If column data cannot be promoted to a unified type
|
687
|
+
"""
|
688
|
+
current_type = self.arrow.type
|
689
|
+
data_type = column_data.type
|
690
|
+
|
691
|
+
# Early return if types are already compatible
|
692
|
+
if current_type.equals(data_type):
|
693
|
+
return column_data, False
|
694
|
+
|
695
|
+
# Find the promoted type that can accommodate both types
|
696
|
+
promoted_type = self._find_promoted_type(current_type, data_type)
|
697
|
+
|
698
|
+
# Handle type coercion vs promotion
|
699
|
+
if promoted_type.equals(current_type):
|
700
|
+
return self._coerce_to_current_type(column_data, current_type)
|
701
|
+
else:
|
702
|
+
return self._promote_to_new_type(column_data, promoted_type)
|
703
|
+
|
704
|
+
def _coerce_to_current_type(
|
705
|
+
self,
|
706
|
+
column_data: pa.Array,
|
707
|
+
current_type: pa.DataType,
|
708
|
+
) -> Tuple[pa.Array, bool]:
|
709
|
+
"""Try to coerce data to current type without promoting the field type."""
|
710
|
+
try:
|
711
|
+
coerced_data = pa.compute.cast(column_data, current_type)
|
712
|
+
return coerced_data, False
|
713
|
+
except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
|
714
|
+
return column_data, False
|
715
|
+
|
716
|
+
def _promote_to_new_type(
|
717
|
+
self,
|
718
|
+
column_data: pa.Array,
|
719
|
+
promoted_type: pa.DataType,
|
720
|
+
) -> Tuple[pa.Array, bool]:
|
721
|
+
"""Try to cast data to the promoted type."""
|
722
|
+
try:
|
723
|
+
promoted_data = pa.compute.cast(column_data, promoted_type)
|
724
|
+
return promoted_data, True
|
725
|
+
except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
|
726
|
+
# If direct cast fails, the promotion is not valid
|
727
|
+
raise SchemaValidationError(
|
728
|
+
f"Cannot cast data for field '{self.arrow.name}' from type {column_data.type} "
|
729
|
+
f"to promoted type {promoted_type}"
|
730
|
+
)
|
731
|
+
|
732
|
+
def _cast_default_to_promoted_type(
|
733
|
+
self,
|
734
|
+
default_value: Any,
|
735
|
+
promoted_type: pa.DataType,
|
736
|
+
) -> Optional[Any]:
|
737
|
+
"""Cast a default value to match a promoted type.
|
738
|
+
|
739
|
+
Args:
|
740
|
+
default_value: The original default value
|
741
|
+
promoted_type: The new promoted type
|
742
|
+
|
743
|
+
Returns:
|
744
|
+
The default value cast to the promoted type.
|
745
|
+
|
746
|
+
Raises:
|
747
|
+
SchemaValidationError: If the default value cannot be cast to the promoted type
|
748
|
+
"""
|
749
|
+
if default_value is None:
|
750
|
+
return None
|
751
|
+
|
752
|
+
try:
|
753
|
+
# Create a scalar with the original default value
|
754
|
+
original_scalar = pa.scalar(default_value)
|
755
|
+
# Cast to the promoted type
|
756
|
+
promoted_scalar = pa.compute.cast(original_scalar, promoted_type)
|
757
|
+
# Return the Python value
|
758
|
+
return promoted_scalar.as_py()
|
759
|
+
except (
|
760
|
+
pa.ArrowTypeError,
|
761
|
+
pa.ArrowInvalid,
|
762
|
+
pa.ArrowNotImplementedError,
|
763
|
+
TypeError,
|
764
|
+
ValueError,
|
765
|
+
):
|
766
|
+
raise SchemaValidationError(
|
767
|
+
f"Cannot cast default value `{default_value}` to promoted type {promoted_type}"
|
768
|
+
)
|
769
|
+
|
770
|
+
def _find_promoted_type(
|
771
|
+
self,
|
772
|
+
current_type: pa.DataType,
|
773
|
+
new_type: pa.DataType,
|
774
|
+
) -> Optional[pa.DataType]:
|
775
|
+
"""Find the most specific type that can accommodate both current and new types
|
776
|
+
using PyArrow's unify_schemas with permissive promotion options.
|
777
|
+
|
778
|
+
Returns:
|
779
|
+
The promoted type.
|
780
|
+
|
781
|
+
Raises:
|
782
|
+
SchemaValidationError: If the types cannot be unified.
|
783
|
+
"""
|
784
|
+
try:
|
785
|
+
# Create schemas with the same field name but different types
|
786
|
+
schema1 = pa.schema([("field", current_type)])
|
787
|
+
schema2 = pa.schema([("field", new_type)])
|
788
|
+
|
789
|
+
# Use PyArrow's built-in permissive type promotion
|
790
|
+
unified_schema = pa.unify_schemas(
|
791
|
+
[schema1, schema2], promote_options="permissive"
|
792
|
+
)
|
793
|
+
|
794
|
+
# Return the promoted type
|
795
|
+
return unified_schema.field("field").type
|
796
|
+
|
797
|
+
except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
|
798
|
+
# If unification fails, no promotion is possible
|
799
|
+
raise SchemaValidationError(
|
800
|
+
f"Cannot unify types for field '{self.arrow.name}': "
|
801
|
+
f"current type {current_type} incompatible with new data type {new_type}"
|
802
|
+
)
|
803
|
+
|
386
804
|
|
387
805
|
SingleSchema = Union[List[Field], pa.Schema]
|
388
806
|
MultiSchema = Union[Dict[SchemaName, List[Field]], Dict[SchemaName, pa.Schema]]
|
@@ -432,6 +850,8 @@ class Schema(dict):
|
|
432
850
|
Returns:
|
433
851
|
A new DeltaCAT Schema.
|
434
852
|
"""
|
853
|
+
if schema_id and schema_id < 0:
|
854
|
+
raise ValueError(f"Schema ID must be non-negative, got {schema_id}")
|
435
855
|
# normalize the input as a unified pyarrow schema
|
436
856
|
# if the input included multiple subschemas, then also save a mapping
|
437
857
|
# from each subschema to its unique field names
|
@@ -454,6 +874,8 @@ class Schema(dict):
|
|
454
874
|
visit=Schema._populate_fields,
|
455
875
|
visitor_dict=visitor_dict,
|
456
876
|
)
|
877
|
+
# recalculate max field ID after field population (in case new field IDs were assigned)
|
878
|
+
max_field_id = max(field_ids_to_fields.keys()) if field_ids_to_fields else 0
|
457
879
|
if schema.metadata:
|
458
880
|
schema_metadata.update(schema.metadata)
|
459
881
|
# populate merge keys
|
@@ -477,7 +899,9 @@ class Schema(dict):
|
|
477
899
|
schema_metadata[SCHEMA_ID_KEY_NAME] = str(schema_id)
|
478
900
|
if schema_metadata.get(SCHEMA_ID_KEY_NAME) is None:
|
479
901
|
schema_metadata[SCHEMA_ID_KEY_NAME] = str(0)
|
480
|
-
schema_metadata[SUBSCHEMAS_KEY_NAME] =
|
902
|
+
schema_metadata[SUBSCHEMAS_KEY_NAME] = _encode_metadata_value(
|
903
|
+
subschema_to_field_ids
|
904
|
+
)
|
481
905
|
final_schema = pyarrow_schema.with_metadata(schema_metadata)
|
482
906
|
return Schema(
|
483
907
|
{
|
@@ -546,6 +970,32 @@ class Schema(dict):
|
|
546
970
|
schema_id=self.id + 1,
|
547
971
|
)
|
548
972
|
|
973
|
+
def update(self, allow_incompatible_changes: bool = False) -> SchemaUpdate:
|
974
|
+
"""
|
975
|
+
Create a SchemaUpdate instance for safely evolving this schema.
|
976
|
+
|
977
|
+
This method provides a convenient way to create a SchemaUpdate for this schema
|
978
|
+
without needing to call SchemaUpdate.of() directly.
|
979
|
+
|
980
|
+
Args:
|
981
|
+
allow_incompatible_changes: If True, allows changes that may break
|
982
|
+
backward compatibility. If False (default), raises SchemaCompatibilityError
|
983
|
+
for incompatible changes.
|
984
|
+
|
985
|
+
Returns:
|
986
|
+
A new SchemaUpdate instance configured for this schema
|
987
|
+
|
988
|
+
Example:
|
989
|
+
>>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
|
990
|
+
>>> new_field = Field.of(pa.field("name", pa.string()))
|
991
|
+
>>> updated_schema = (schema.update()
|
992
|
+
... .add_field("name", new_field)
|
993
|
+
... .apply())
|
994
|
+
"""
|
995
|
+
return SchemaUpdate.of(
|
996
|
+
self, allow_incompatible_changes=allow_incompatible_changes
|
997
|
+
)
|
998
|
+
|
549
999
|
def field_id(self, name: Union[FieldName, NestedFieldName]) -> FieldId:
|
550
1000
|
return Schema._field_name_to_field_id(self.arrow, name)
|
551
1001
|
|
@@ -563,125 +1013,570 @@ class Schema(dict):
|
|
563
1013
|
)
|
564
1014
|
return self.field_ids_to_fields[field_id]
|
565
1015
|
|
566
|
-
|
567
|
-
|
568
|
-
field_ids_to_fields = self.field_ids_to_fields
|
569
|
-
return list(field_ids_to_fields.values())
|
1016
|
+
def merge_order_sort_keys(self) -> Optional[List[SortKey]]:
|
1017
|
+
"""Extract sort keys from fields with merge_order defined, or use event_time as fallback.
|
570
1018
|
|
571
|
-
|
572
|
-
|
573
|
-
|
1019
|
+
If explicit merge_order fields are defined, they take precedence.
|
1020
|
+
If no merge_order fields are defined but an event_time field exists, use event_time
|
1021
|
+
with DESCENDING merge_order (keep latest events by default).
|
574
1022
|
|
575
|
-
|
576
|
-
|
577
|
-
|
1023
|
+
Note: The sort order is inverted because deduplication keeps the "last" record
|
1024
|
+
after sorting. To keep the record with the smallest merge_order value, we need
|
1025
|
+
to sort in DESCENDING order so that record appears last.
|
578
1026
|
|
579
|
-
|
580
|
-
|
581
|
-
|
1027
|
+
Returns:
|
1028
|
+
List of SortKey objects constructed from fields with merge_order or event_time,
|
1029
|
+
or None if neither are defined.
|
1030
|
+
"""
|
1031
|
+
# First priority: explicit merge_order fields
|
1032
|
+
fields_with_merge_order = self._get_fields_with_merge_order()
|
1033
|
+
if fields_with_merge_order:
|
1034
|
+
return self._create_sort_keys_from_merge_order_fields(
|
1035
|
+
fields_with_merge_order
|
1036
|
+
)
|
582
1037
|
|
583
|
-
|
584
|
-
|
585
|
-
|
1038
|
+
# Second priority: event_time field as default merge_order key
|
1039
|
+
event_time_fields = self._get_event_time_fields()
|
1040
|
+
if event_time_fields:
|
1041
|
+
return self._create_sort_keys_from_event_time_fields(event_time_fields)
|
586
1042
|
|
587
|
-
|
588
|
-
def id(self) -> SchemaId:
|
589
|
-
return Schema._schema_id(self.arrow)
|
1043
|
+
return None
|
590
1044
|
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
1045
|
+
def _validate_and_coerce_table(
|
1046
|
+
self,
|
1047
|
+
table: pa.Table,
|
1048
|
+
schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
|
1049
|
+
default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
|
1050
|
+
) -> Tuple[pa.Table, Schema]:
|
1051
|
+
"""Validate and coerce a PyArrow table to match this schema's field types and constraints.
|
595
1052
|
|
596
|
-
|
597
|
-
|
598
|
-
# return cached subschemas first if they exist
|
599
|
-
subschemas = self.get("subschemas")
|
600
|
-
if not subschemas:
|
601
|
-
# retrieve any defined subschemas
|
602
|
-
subschemas_to_field_ids = self.subschemas_to_field_ids
|
603
|
-
# rebuild and return the subschema cache
|
604
|
-
if subschemas_to_field_ids:
|
605
|
-
subschemas = {
|
606
|
-
schema_name: Schema.of(
|
607
|
-
schema=pa.schema(
|
608
|
-
[self.field(field_id).arrow for field_id in field_ids]
|
609
|
-
),
|
610
|
-
schema_id=self.id,
|
611
|
-
native_object=self.native_object,
|
612
|
-
)
|
613
|
-
for schema_name, field_ids in subschemas_to_field_ids.items()
|
614
|
-
}
|
615
|
-
self["subschemas"] = subschemas
|
616
|
-
return subschemas or {}
|
1053
|
+
This method now uses SchemaUpdate for safe schema evolution, ensuring all field
|
1054
|
+
protection rules and validation are applied consistently.
|
617
1055
|
|
618
|
-
|
619
|
-
|
620
|
-
|
1056
|
+
Args:
|
1057
|
+
table: PyArrow Table to validate and coerce
|
1058
|
+
schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
|
1059
|
+
default_schema_consistency_type: Default consistency type for new fields in AUTO mode
|
621
1060
|
|
622
|
-
|
623
|
-
|
624
|
-
|
1061
|
+
Returns:
|
1062
|
+
Tuple[pa.Table, Schema]: Table with data validated/coerced according to schema consistency types,
|
1063
|
+
and the (potentially updated) schema
|
625
1064
|
|
626
|
-
|
627
|
-
|
628
|
-
|
1065
|
+
Raises:
|
1066
|
+
SchemaValidationError: If validation fails or coercion is not possible
|
1067
|
+
SchemaCompatibilityError: If schema evolution would break compatibility
|
1068
|
+
"""
|
1069
|
+
if not self.field_ids_to_fields:
|
1070
|
+
# No fields defined in schema, return original table
|
1071
|
+
return table, self
|
629
1072
|
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
return schema_id
|
1073
|
+
# Setup
|
1074
|
+
field_name_to_field = self._create_field_name_mapping()
|
1075
|
+
field_updates = {} # field_name -> updated_field
|
1076
|
+
new_fields = {} # field_name -> new_field
|
1077
|
+
new_columns = []
|
1078
|
+
new_schema_fields = []
|
637
1079
|
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
) -> Dict[SchemaName, List[FieldId]]:
|
642
|
-
subschemas = None
|
643
|
-
if schema.metadata:
|
644
|
-
bytes_val = schema.metadata.get(SUBSCHEMAS_KEY_NAME)
|
645
|
-
subschemas = msgpack.loads(bytes_val) if bytes_val else None
|
646
|
-
return subschemas
|
1080
|
+
# Process each column in the table
|
1081
|
+
for column_name in table.column_names:
|
1082
|
+
column_data = table.column(column_name)
|
647
1083
|
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
field = field[part]
|
661
|
-
return Field.of(field).id
|
662
|
-
raise ValueError(f"Unknown field name type: {type(name)}")
|
1084
|
+
(
|
1085
|
+
processed_data,
|
1086
|
+
schema_field,
|
1087
|
+
field_update,
|
1088
|
+
new_field,
|
1089
|
+
) = self._process_existing_table_column(
|
1090
|
+
column_name,
|
1091
|
+
column_data,
|
1092
|
+
field_name_to_field,
|
1093
|
+
schema_evolution_mode,
|
1094
|
+
default_schema_consistency_type,
|
1095
|
+
)
|
663
1096
|
|
664
|
-
|
665
|
-
|
666
|
-
current: Union[pa.Schema, pa.Field],
|
667
|
-
visit: Callable,
|
668
|
-
path: NestedFieldName = [],
|
669
|
-
*args,
|
670
|
-
**kwargs,
|
671
|
-
) -> None:
|
672
|
-
"""
|
673
|
-
Recursively visit all fields in a PyArrow schema, including nested
|
674
|
-
fields.
|
1097
|
+
new_columns.append(processed_data)
|
1098
|
+
new_schema_fields.append(schema_field)
|
675
1099
|
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
1100
|
+
if field_update:
|
1101
|
+
field_updates[column_name] = field_update
|
1102
|
+
if new_field:
|
1103
|
+
new_fields[column_name] = new_field
|
1104
|
+
|
1105
|
+
# Add any missing fields from schema
|
1106
|
+
table_column_names = set(table.column_names)
|
1107
|
+
self._add_missing_schema_fields(
|
1108
|
+
table, table_column_names, new_columns, new_schema_fields
|
1109
|
+
)
|
1110
|
+
|
1111
|
+
# Apply schema updates if any modifications were made
|
1112
|
+
updated_schema = self._apply_schema_updates(field_updates, new_fields)
|
1113
|
+
|
1114
|
+
return (
|
1115
|
+
pa.table(new_columns, schema=pa.schema(new_schema_fields)),
|
1116
|
+
updated_schema,
|
1117
|
+
)
|
1118
|
+
|
1119
|
+
def validate_and_coerce_dataset(
|
1120
|
+
self,
|
1121
|
+
dataset: Union[pa.Table, Any],
|
1122
|
+
schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
|
1123
|
+
default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
|
1124
|
+
) -> Tuple[Union[pa.Table, Any], Schema]:
|
1125
|
+
"""Validate and coerce a dataset to match this schema's field types and constraints.
|
1126
|
+
|
1127
|
+
Args:
|
1128
|
+
dataset: Dataset to validate and coerce (PyArrow Table, Daft DataFrame, etc.)
|
1129
|
+
schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
|
1130
|
+
default_schema_consistency_type: Default consistency type for new fields in AUTO mode
|
1131
|
+
|
1132
|
+
Returns:
|
1133
|
+
Tuple[Dataset, Schema]: Dataset with data validated/coerced according to schema consistency types,
|
1134
|
+
and the (potentially updated) schema
|
1135
|
+
|
1136
|
+
Raises:
|
1137
|
+
SchemaValidationError: If validation fails or coercion is not possible
|
1138
|
+
SchemaCompatibilityError: If schema evolution would break compatibility
|
1139
|
+
"""
|
1140
|
+
# Handle PyArrow tables using existing method
|
1141
|
+
if get_dataset_type(dataset) == DatasetType.PYARROW:
|
1142
|
+
return self._validate_and_coerce_table(
|
1143
|
+
dataset,
|
1144
|
+
schema_evolution_mode,
|
1145
|
+
default_schema_consistency_type,
|
1146
|
+
)
|
1147
|
+
|
1148
|
+
# Handle Daft DataFrames without collecting to memory
|
1149
|
+
if get_dataset_type(dataset) == DatasetType.DAFT:
|
1150
|
+
return self._validate_and_coerce_daft_dataframe(
|
1151
|
+
dataset,
|
1152
|
+
schema_evolution_mode,
|
1153
|
+
default_schema_consistency_type,
|
1154
|
+
)
|
1155
|
+
|
1156
|
+
# Handle Ray Datasets by converting to Daft
|
1157
|
+
if get_dataset_type(dataset) == DatasetType.RAY_DATASET:
|
1158
|
+
daft_dataframe = dataset.to_daft()
|
1159
|
+
return self._validate_and_coerce_daft_dataframe(
|
1160
|
+
daft_dataframe,
|
1161
|
+
schema_evolution_mode,
|
1162
|
+
default_schema_consistency_type,
|
1163
|
+
)
|
1164
|
+
|
1165
|
+
# For other types, convert to PyArrow and back
|
1166
|
+
# Don't pass schema during conversion as it may contain columns not yet in the dataset
|
1167
|
+
pa_table = to_pyarrow(dataset)
|
1168
|
+
coerced_table, updated_schema = self._validate_and_coerce_table(
|
1169
|
+
pa_table,
|
1170
|
+
schema_evolution_mode,
|
1171
|
+
default_schema_consistency_type,
|
1172
|
+
)
|
1173
|
+
return from_pyarrow(coerced_table, get_dataset_type(dataset)), updated_schema
|
1174
|
+
|
1175
|
+
def coerce(
|
1176
|
+
self,
|
1177
|
+
dataset: Union[pa.Table, pd.DataFrame, np.ndarray, Any],
|
1178
|
+
manifest_entry_schema: Optional[Schema] = None,
|
1179
|
+
) -> Union[pa.Table, pd.DataFrame, np.ndarray, Any]:
|
1180
|
+
"""Coerce a dataset to match this schema using field type promotion.
|
1181
|
+
|
1182
|
+
This method processes different dataset types and applies type promotion
|
1183
|
+
using the field's promote_type_if_needed method. It handles:
|
1184
|
+
- PyArrow Tables
|
1185
|
+
- Pandas DataFrames
|
1186
|
+
- NumPy arrays (1D and 2D)
|
1187
|
+
- Polars DataFrames (if available)
|
1188
|
+
- Daft DataFrames (if available)
|
1189
|
+
- Other types with to_arrow() method
|
1190
|
+
|
1191
|
+
For each column, it:
|
1192
|
+
- Fields that exist in both dataset and schema: applies type promotion
|
1193
|
+
- Fields in dataset but not in schema: preserves as-is
|
1194
|
+
- Fields in schema but not in dataset: adds with null or past default values
|
1195
|
+
- Reorders columns to match schema order
|
1196
|
+
|
1197
|
+
Args:
|
1198
|
+
dataset: Dataset to coerce to this schema
|
1199
|
+
manifest_entry_schema: Original manifest entry schema used to write the dataset.
|
1200
|
+
|
1201
|
+
Returns:
|
1202
|
+
Dataset of the same type, coerced to match this schema.
|
1203
|
+
|
1204
|
+
Raises:
|
1205
|
+
SchemaValidationError: If coercion fails
|
1206
|
+
"""
|
1207
|
+
if not self.field_ids_to_fields:
|
1208
|
+
# No fields defined in schema, return original dataset
|
1209
|
+
return dataset
|
1210
|
+
|
1211
|
+
# Convert dataset to PyArrow table for processing
|
1212
|
+
pa_table = to_pyarrow(
|
1213
|
+
dataset,
|
1214
|
+
schema=manifest_entry_schema.arrow if manifest_entry_schema else None,
|
1215
|
+
)
|
1216
|
+
|
1217
|
+
# Process columns using field coercion
|
1218
|
+
coerced_columns, coerced_fields = self._coerce_table_columns(pa_table)
|
1219
|
+
|
1220
|
+
# Reorder columns to match schema order
|
1221
|
+
reordered_columns, reordered_fields = self._reorder_columns_to_schema(
|
1222
|
+
coerced_columns, coerced_fields, pa_table
|
1223
|
+
)
|
1224
|
+
|
1225
|
+
# Create new table with processed columns
|
1226
|
+
coerced_table = pa.table(reordered_columns, schema=pa.schema(reordered_fields))
|
1227
|
+
|
1228
|
+
# Convert back to original dataset type
|
1229
|
+
return from_pyarrow(coerced_table, get_dataset_type(dataset))
|
1230
|
+
|
1231
|
+
+    def _validate_and_coerce_daft_dataframe(
+        self,
+        dataframe: Any,  # DaftDataFrame type
+        schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
+        default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
+    ) -> Tuple[Any, Schema]:
+        """Validate and coerce a Daft DataFrame without collecting to memory.
+
+        This method processes Daft DataFrames column by column using Daft expressions
+        for validation and coercion, avoiding memory collection.
+
+        Args:
+            dataframe: Daft DataFrame to validate and coerce
+            schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
+            default_schema_consistency_type: Default consistency type for new fields in AUTO mode
+
+        Returns:
+            Tuple[DaftDataFrame, Schema]: Processed DataFrame and updated schema
+
+        Raises:
+            SchemaValidationError: If validation fails or coercion is not possible
+            SchemaCompatibilityError: If schema evolution would break compatibility
+        """
+        if not self.field_ids_to_fields:
+            # No fields defined in schema, return original dataframe
+            return dataframe, self
+
+        # Setup
+        field_name_to_field = self._create_field_name_mapping()
+        field_updates = {}  # field_name -> updated_field
+        new_fields = {}  # field_name -> new_field
+        processed_dataframe = dataframe
+
+        # Process each column in the dataframe
+        for column_name in dataframe.column_names:
+            column_type = dataframe.schema()[column_name].dtype.to_arrow_dtype()
+
+            (
+                processed_dataframe,
+                schema_field,
+                field_update,
+                new_field,
+            ) = self._process_existing_daft_column(
+                processed_dataframe,
+                column_name,
+                column_type,
+                field_name_to_field,
+                schema_evolution_mode,
+                default_schema_consistency_type,
+            )
+
+            if field_update:
+                field_updates[column_name] = field_update
+            if new_field:
+                new_fields[column_name] = new_field
+
+        # Add any missing fields from schema
+        dataframe_column_names = set(dataframe.column_names)
+        processed_dataframe = self._add_missing_schema_fields_daft(
+            processed_dataframe, dataframe_column_names
+        )
+
+        # Apply schema updates if any modifications were made
+        updated_schema = self._apply_schema_updates(field_updates, new_fields)
+
+        return processed_dataframe, updated_schema
+
+    def _process_existing_daft_column(
+        self,
+        dataframe: Any,  # DaftDataFrame type
+        column_name: str,
+        column_type: pa.DataType,
+        field_name_to_field: Dict[str, Field],
+        schema_evolution_mode: Optional[SchemaEvolutionMode],
+        default_schema_consistency_type: Optional[SchemaConsistencyType],
+    ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
+        """Process a Daft DataFrame column that exists in the dataset.
+
+        Args:
+            dataframe: Daft DataFrame to process
+            column_name: Name of the column to process
+            column_type: PyArrow DataType of the column
+            field_name_to_field: Mapping from field names to Field objects
+            schema_evolution_mode: How to handle fields not in schema
+            default_schema_consistency_type: Default consistency type for new fields
+
+        Returns:
+            Tuple of (processed_dataframe, schema_field, field_update, new_field)
+        """
+        if column_name in field_name_to_field:
+            # Field exists in schema - validate/coerce according to consistency type
+            field = field_name_to_field[column_name]
+
+            if field.consistency_type == SchemaConsistencyType.VALIDATE:
+                field.validate(column_type)
+                return dataframe, field.arrow, None, None
+            elif field.consistency_type == SchemaConsistencyType.COERCE:
+                coerced_dataframe = field.coerce_daft(dataframe, column_name)
+                return coerced_dataframe, field.arrow, None, None
+            else:
+                # NONE or no consistency type - use type promotion
+                return self._handle_daft_type_promotion(
+                    dataframe, column_name, column_type, field
+                )
+        else:
+            # Field not in schema - handle based on evolution mode
+            return self._handle_new_daft_field(
+                dataframe,
+                column_name,
+                column_type,
+                schema_evolution_mode,
+                default_schema_consistency_type,
+            )
+
+    def _handle_daft_type_promotion(
+        self,
+        dataframe: Any,  # DaftDataFrame type
+        column_name: str,
+        column_type: pa.DataType,
+        field: Field,
+    ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
+        """Handle type promotion for a Daft column with NONE consistency type."""
+        # Create a dummy array to check type promotion
+        dummy_array = pa.array([None], type=column_type)
+        promoted_data, type_was_promoted = field.promote_type_if_needed(dummy_array)
+
+        if type_was_promoted:
+            # Cast the Daft column to the promoted type
+            promoted_dataframe = field.coerce_daft(
+                dataframe, column_name, promoted_data.type
+            )
+
+            # Cast default values to match the promoted type
+            promoted_past_default = (
+                field._cast_default_to_promoted_type(
+                    field.past_default, promoted_data.type
+                )
+                if field.past_default is not None
+                else None
+            )
+            promoted_future_default = (
+                field._cast_default_to_promoted_type(
+                    field.future_default, promoted_data.type
+                )
+                if field.future_default is not None
+                else None
+            )
+
+            # Create updated field with promoted type
+            promoted_field = pa.field(
+                field.arrow.name,
+                promoted_data.type,
+                field.arrow.nullable,
+                field.arrow.metadata,
+            )
+
+            updated_field = Field.of(
+                promoted_field,
+                field_id=field.id,
+                past_default=promoted_past_default,
+                future_default=promoted_future_default,
+                consistency_type=field.consistency_type,
+                path=field.path,
+                native_object=field.native_object,
+            )
+
+            return promoted_dataframe, promoted_field, updated_field, None
+        else:
+            return dataframe, field.arrow, None, None
+
+    def _handle_new_daft_field(
+        self,
+        dataframe: Any,  # DaftDataFrame type
+        column_name: str,
+        column_type: pa.DataType,
+        schema_evolution_mode: Optional[SchemaEvolutionMode],
+        default_schema_consistency_type: Optional[SchemaConsistencyType],
+    ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
+        """Handle a field that's not in the schema for Daft DataFrames."""
+        if schema_evolution_mode == SchemaEvolutionMode.AUTO:
+            # Create new field with default consistency type
+            next_field_id = self.max_field_id + 1
+            new_field = Field.of(
+                field=pa.field(column_name, column_type),
+                field_id=next_field_id,
+                consistency_type=default_schema_consistency_type
+                or SchemaConsistencyType.NONE,
+            )
+            return dataframe, new_field.arrow, None, new_field
+        else:
+            # MANUAL mode or not specified - raise error
+            raise SchemaValidationError(
+                f"Field '{column_name}' is not present in the schema and schema evolution mode is '{schema_evolution_mode}'"
+            )
+
+    def _add_missing_schema_fields_daft(
+        self,
+        dataframe: Any,  # DaftDataFrame type
+        dataframe_column_names: set,
+    ) -> Any:
+        """Add columns for fields that exist in schema but not in Daft DataFrame."""
+        processed_dataframe = dataframe
+
+        for field in self.field_ids_to_fields.values():
+            if field.arrow.name not in dataframe_column_names:
+                # Add column with null values or default value to Daft DataFrame
+                if field.future_default is not None:
+                    # Convert default value to Daft literal
+                    processed_dataframe = processed_dataframe.with_column(
+                        field.arrow.name,
+                        daft.lit(field.future_default).cast(
+                            daft.DataType.from_arrow_type(field.arrow.type)
+                        ),
+                    )
+                elif field.arrow.nullable:
+                    # Add null column
+                    processed_dataframe = processed_dataframe.with_column(
+                        field.arrow.name,
+                        daft.lit(None).cast(
+                            daft.DataType.from_arrow_type(field.arrow.type)
+                        ),
+                    )
+                else:
+                    raise SchemaValidationError(
+                        f"Field '{field.arrow.name}' is required but not present and no future_default is set"
+                    )
+
+        return processed_dataframe
+
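As an illustrative aside (not part of the package diff): the same backfill pattern used by _add_missing_schema_fields_daft, sketched standalone. Assumes the daft package is installed; the column and type here are hypothetical.

    import daft
    import pyarrow as pa

    df = daft.from_pydict({"id": [1, 2, 3]})
    # Backfill a missing nullable column with typed nulls, as done above:
    df = df.with_column(
        "age",
        daft.lit(None).cast(daft.DataType.from_arrow_type(pa.int32())),
    )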
+    @property
+    def fields(self) -> List[Field]:
+        field_ids_to_fields = self.field_ids_to_fields
+        return list(field_ids_to_fields.values())
+
+    @property
+    def merge_keys(self) -> Optional[List[FieldId]]:
+        return self.get("mergeKeys")
+
+    @property
+    def field_ids_to_fields(self) -> Dict[FieldId, Field]:
+        return self.get("fieldIdsToFields")
+
+    @property
+    def arrow(self) -> pa.Schema:
+        return self["arrow"]
+
+    @property
+    def max_field_id(self) -> FieldId:
+        return self["maxFieldId"]
+
+    @property
+    def id(self) -> SchemaId:
+        return Schema._schema_id(self.arrow)
+
+    @property
+    def subschema(self, name: SchemaName) -> Optional[Schema]:
+        subschemas = self.subschemas
+        return subschemas.get(name) if subschemas else None
+
+    @property
+    def subschemas(self) -> Dict[SchemaName, Schema]:
+        # return cached subschemas first if they exist
+        subschemas = self.get("subschemas")
+        if not subschemas:
+            # retrieve any defined subschemas
+            subschemas_to_field_ids = self.subschemas_to_field_ids
+            # rebuild and return the subschema cache
+            if subschemas_to_field_ids:
+                subschemas = {
+                    schema_name: Schema.of(
+                        schema=pa.schema(
+                            [self.field(field_id).arrow for field_id in field_ids]
+                        ),
+                        schema_id=self.id,
+                        native_object=self.native_object,
+                    )
+                    for schema_name, field_ids in subschemas_to_field_ids.items()
+                }
+                self["subschemas"] = subschemas
+        return subschemas or {}
+
+    @property
+    def subschema_field_ids(self, name: SchemaName) -> Optional[List[FieldId]]:
+        return self.subschemas_to_field_ids.get(name)
+
+    @property
+    def subschemas_to_field_ids(self) -> Dict[SchemaName, List[FieldId]]:
+        return Schema._subschemas(self.arrow)
+
+    @property
+    def native_object(self) -> Optional[Any]:
+        return self.get("nativeObject")
+
+    @staticmethod
+    def _schema_id(schema: pa.Schema) -> SchemaId:
+        schema_id = None
+        if schema.metadata:
+            bytes_val = schema.metadata.get(SCHEMA_ID_KEY_NAME)
+            schema_id = int(bytes_val.decode()) if bytes_val else None
+        return schema_id
+
+    @staticmethod
+    def _subschemas(
+        schema: pa.Schema,
+    ) -> Dict[SchemaName, List[FieldId]]:
+        subschemas = None
+        if schema.metadata:
+            bytes_val = schema.metadata.get(SUBSCHEMAS_KEY_NAME)
+            subschemas = _decode_metadata_value(bytes_val) if bytes_val else None
+        return subschemas
+
+    @staticmethod
+    def _field_name_to_field_id(
+        schema: pa.Schema,
+        name: Union[FieldName, NestedFieldName],
+    ) -> FieldId:
+        if isinstance(name, str):
+            return Field.of(schema.field(name)).id
+        if isinstance(name, List):
+            if not len(name):
+                raise ValueError(f"Nested field name `{name}` is empty.")
+            field = schema
+            for part in name:
+                field = field[part]
+            return Field.of(field).id
+        raise ValueError(f"Unknown field name type: {type(name)}")
+
+    @staticmethod
+    def _visit_fields(
+        current: Union[pa.Schema, pa.Field],
+        visit: Callable,
+        path: Optional[NestedFieldName] = None,
+        *args,
+        **kwargs,
+    ) -> None:
+        """
+        Recursively visit all fields in a PyArrow schema, including nested
+        fields.
+
+        Args:
+            current: The schema or field to visit.
+            visit: A function that visits the current field.
+            path: The current path to the field.
+            *args: Additional args to pass to the visit function.
             **kwargs: Additional keyword args to pass to the visit function.
         Returns:
             None
         """
+        path = [] if path is None else path
         if isinstance(current, pa.Schema):
             for field in current:
                 Schema._visit_fields(
@@ -750,14 +1645,13 @@ class Schema(dict):
         visitor_dict: Dict[str, Any],
     ) -> None:
         field_ids_to_fields = visitor_dict["fieldIdsToFields"]
-        max_field_id = (
-            visitor_dict["maxFieldId"] + len(field_ids_to_fields)
-        ) % MAX_FIELD_ID_EXCLUSIVE
         dc_field = Field.of(field)
         if dc_field is not None and dc_field.id is not None:
             field_id = dc_field.id
         else:
-            field_id =
+            field_id = (
+                visitor_dict["maxFieldId"] + len(field_ids_to_fields)
+            ) % MAX_FIELD_ID_EXCLUSIVE
 
         if (dupe := field_ids_to_fields.get(field_id)) is not None:
             raise ValueError(
@@ -846,47 +1740,1421 @@ class Schema(dict):
             return pa.unify_schemas(all_schemas), subschema_to_field_names
         return Schema._to_pyarrow_schema(schema), {}  # SingleSchema
 
-
-
-        name: SchemaName,
-        subschemas: Dict[SchemaName, Schema],
-    ) -> Dict[SchemaName, Schema]:
-        deleted_subschema = subschemas.pop(name, None)
-        if deleted_subschema is None:
-            raise ValueError(f"Subschema `{name}` does not exist.")
-        return subschemas
+    def _get_fields_with_merge_order(self) -> List[Field]:
+        """Get all fields that have merge_order defined.
 
-
-
-
-
-
-
-
-
-
-
+        Returns:
+            List of fields with merge_order defined, or empty list if none
+        """
+        return [field for field in self.fields if field.merge_order is not None]
+
+    def _create_sort_keys_from_merge_order_fields(
+        self, fields_with_merge_order: List[Field]
+    ) -> List[SortKey]:
+        """Create sort keys from fields with explicit merge_order.
+
+        Args:
+            fields_with_merge_order: List of fields with merge_order defined
+
+        Returns:
+            List of SortKey objects with inverted sort order for deduplication
+        """
+        from deltacat.storage.model.sort_key import SortKey
+
+        sort_keys = []
+        for field in fields_with_merge_order:
+            merge_order = field.merge_order
+            desired_sort_order = merge_order[0]
+
+            # Invert the sort order because deduplication keeps the "last" record
+            # ASCENDING merge_order (keep smallest) → DESCENDING sort (smallest appears last)
+            # DESCENDING merge_order (keep largest) → ASCENDING sort (largest appears last)
+            if desired_sort_order == SortOrder.ASCENDING:
+                actual_sort_order = SortOrder.DESCENDING
+            else:
+                actual_sort_order = SortOrder.ASCENDING
+
+            sort_key = SortKey.of(
+                key=[field.arrow.name],
+                sort_order=actual_sort_order,
+                null_order=merge_order[1],  # NullOrder (AT_START/AT_END)
             )
-
-
-        for key, val in subschemas.items():
-            subschemas[key] = val.arrow
-        subschemas[name] = schema
-        return subschemas
+            sort_keys.append(sort_key)
+        return sort_keys
 
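As an illustrative aside (not part of the package diff): a plain-Python sketch of the sort-order inversion described in the comments above; the names are hypothetical and this does not use the deltacat API.

    records = [("k1", 5), ("k1", 2), ("k1", 9)]
    # merge_order ASCENDING means "keep the smallest value per key". Sorting
    # DESCENDING puts that smallest value last, so a dedupe that keeps the last
    # record per key retains it:
    records.sort(key=lambda r: r[1], reverse=True)  # [("k1", 9), ("k1", 5), ("k1", 2)]
    kept = {k: v for k, v in records}               # {"k1": 2}: the smallest value survives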
+    def _get_event_time_fields(self) -> List[Field]:
+        """Get all fields marked as event_time.
 
-
-
-
-
-        for item in items:
-            if item is not None and not isinstance(item, Schema):
-                item = Schema(item)
-            typed_items.append(item)
-        return typed_items
+        Returns:
+            List of event_time fields, or empty list if none
+        """
+        return [field for field in self.fields if field.is_event_time]
 
-    def
-
-
-
-
+    def _create_sort_keys_from_event_time_fields(
+        self, event_time_fields: List[Field]
+    ) -> List:
+        """Create sort keys from event_time fields with default DESCENDING merge_order.
+
+        Args:
+            event_time_fields: List of event_time fields
+
+        Returns:
+            List of SortKey objects with ASCENDING sort order (inverted from DESCENDING merge_order)
+        """
+        from deltacat.storage.model.sort_key import SortKey
+
+        sort_keys = []
+        for field in event_time_fields:
+            sort_key = SortKey.of(
+                key=[field.arrow.name],
+                sort_order=SortOrder.ASCENDING,  # Inverted: DESCENDING merge_order → ASCENDING sort
+                null_order=NullOrder.AT_END,
+            )
+            sort_keys.append(sort_key)
+        return sort_keys
+
+    def _create_field_name_mapping(self) -> Dict[str, Field]:
+        """Create a mapping from field names to Field objects."""
+        field_name_to_field = {}
+        for field in self.field_ids_to_fields.values():
+            field_name_to_field[field.arrow.name] = field
+        return field_name_to_field
+
+    def _process_existing_table_column(
+        self,
+        column_name: str,
+        column_data: pa.Array,
+        field_name_to_field: Dict[str, Field],
+        schema_evolution_mode: Optional[SchemaEvolutionMode],
+        default_schema_consistency_type: Optional[SchemaConsistencyType],
+    ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
+        """Process a column that exists in the table.
+
+        Returns:
+            Tuple of (processed_column_data, schema_field, field_update, new_field)
+        """
+        if column_name in field_name_to_field:
+            # Field exists in schema - validate/coerce according to consistency type
+            field = field_name_to_field[column_name]
+
+            if field.consistency_type == SchemaConsistencyType.VALIDATE:
+                field.validate(column_data.type)
+                return column_data, field.arrow, None, None
+            elif field.consistency_type == SchemaConsistencyType.COERCE:
+                coerced_data = field.coerce(column_data)
+                return coerced_data, field.arrow, None, None
+            else:
+                # NONE or no consistency type - use type promotion
+                return self._handle_type_promotion(column_name, column_data, field)
+        else:
+            # Field not in schema - handle based on evolution mode
+            return self._handle_new_field(
+                column_name,
+                column_data,
+                schema_evolution_mode,
+                default_schema_consistency_type,
+            )
+
+    def _handle_type_promotion(
+        self, column_name: str, column_data: pa.Array, field: Field
+    ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
+        """Handle type promotion for a field with NONE consistency type."""
+        promoted_data, type_was_promoted = field.promote_type_if_needed(column_data)
+
+        if type_was_promoted:
+            # Cast default values to match the promoted type
+            promoted_past_default = (
+                field._cast_default_to_promoted_type(
+                    field.past_default, promoted_data.type
+                )
+                if field.past_default is not None
+                else None
+            )
+
+            promoted_future_default = (
+                field._cast_default_to_promoted_type(
+                    field.future_default, promoted_data.type
+                )
+                if field.future_default is not None
+                else None
+            )
+
+            # Create updated field with same properties but new type and cast defaults
+            promoted_field = pa.field(
+                field.arrow.name,
+                promoted_data.type,
+                nullable=field.arrow.nullable,
+                metadata=field.arrow.metadata,
+            )
+
+            updated_field = Field.of(
+                promoted_field,
+                field_id=field.id,
+                is_merge_key=field.is_merge_key,
+                merge_order=field.merge_order,
+                is_event_time=field.is_event_time,
+                doc=field.doc,
+                past_default=promoted_past_default,
+                future_default=promoted_future_default,
+                consistency_type=field.consistency_type,
+                path=field.path,
+                native_object=field.native_object,
+            )
+
+            return promoted_data, promoted_field, updated_field, None
+        else:
+            return promoted_data, field.arrow, None, None
+
+    def _handle_new_field(
+        self,
+        column_name: str,
+        column_data: pa.Array,
+        schema_evolution_mode: Optional[SchemaEvolutionMode],
+        default_schema_consistency_type: Optional[SchemaConsistencyType],
+    ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
+        """Handle a field that's not in the schema."""
+        if schema_evolution_mode == SchemaEvolutionMode.AUTO:
+            # Create new field with default consistency type
+            next_field_id = self.max_field_id + 1
+            new_field = Field.of(
+                pa.field(column_name, column_data.type, nullable=True),
+                field_id=next_field_id,
+                consistency_type=default_schema_consistency_type
+                or SchemaConsistencyType.NONE,
+            )
+            return column_data, new_field.arrow, None, new_field
+        else:
+            # MANUAL mode or disabled - raise error
+            raise SchemaValidationError(
+                f"Field '{column_name}' is not present in the schema and schema evolution mode is '{schema_evolution_mode}'"
+            )
+
+    def _add_missing_schema_fields(
+        self,
+        table: pa.Table,
+        table_column_names: set,
+        new_columns: List[pa.Array],
+        new_schema_fields: List[pa.Field],
+    ) -> None:
+        """Add columns for fields that exist in schema but not in table."""
+        for field in self.field_ids_to_fields.values():
+            if field.arrow.name not in table_column_names:
+                # Use future_default if available, otherwise check if nullable
+                if field.future_default is not None:
+                    # Create column with future_default value
+                    default_array = pa.array(
+                        [field.future_default] * get_table_length(table),
+                        type=field.arrow.type,
+                    )
+                    new_columns.append(default_array)
+                elif field.arrow.nullable:
+                    # Backfill with nulls if field is nullable
+                    null_column = pa.nulls(
+                        get_table_length(table), type=field.arrow.type
+                    )
+                    new_columns.append(null_column)
+                else:
+                    # Field is not nullable and no future_default - error
+                    raise SchemaValidationError(
+                        f"Field '{field.arrow.name}' is required but not present and no future_default is set"
+                    )
+                new_schema_fields.append(field.arrow)
+
+    def _apply_schema_updates(
+        self, field_updates: Dict[str, Field], new_fields: Dict[str, Field]
+    ) -> Schema:
+        """Apply collected schema updates and return the updated schema."""
+        if not field_updates and not new_fields:
+            return self
+
+        # Initialize schema update with allow_incompatible_changes=True for type promotion
+        schema_update = self.update(allow_incompatible_changes=True)
+
+        # Apply field updates
+        for field_name, updated_field in field_updates.items():
+            schema_update = schema_update._update_field(field_name, updated_field)
+
+        # Apply new fields
+        for field_name, new_field in new_fields.items():
+            schema_update = schema_update.add_field(new_field)
+
+        # Apply all updates
+        return schema_update.apply()
+
+    def _process_existing_columns_for_coercion(
+        self, pa_table: pa.Table, field_name_to_field: Dict[str, Field]
+    ) -> Tuple[List[pa.Array], List[pa.Field]]:
+        """Process columns that exist in the table for coercion.
+
+        Args:
+            pa_table: PyArrow table to process
+            field_name_to_field: Mapping from field names to Field objects
+
+        Returns:
+            Tuple of (processed columns, corresponding fields)
+        """
+        new_columns = []
+        new_schema_fields = []
+
+        for column_name in pa_table.column_names:
+            column_data = pa_table.column(column_name)
+
+            if column_name in field_name_to_field:
+                # Field exists in target schema - use promote_type_if_needed for coercion
+                field = field_name_to_field[column_name]
+                promoted_data, _ = field.promote_type_if_needed(column_data)
+                new_columns.append(promoted_data)
+                new_schema_fields.append(field.arrow)
+            else:
+                # Field not in target schema - preserve as-is
+                new_columns.append(column_data)
+                new_schema_fields.append(pa.field(column_name, column_data.type))
+
+        return new_columns, new_schema_fields
+
+    def _add_missing_fields_for_coercion(
+        self,
+        pa_table: pa.Table,
+        field_name_to_field: Dict[str, Field],
+        existing_columns: List[pa.Array],
+        existing_fields: List[pa.Field],
+    ) -> Tuple[List[pa.Array], List[pa.Field]]:
+        """Add columns for fields that exist in schema but not in table.
+
+        Args:
+            pa_table: Original PyArrow table
+            field_name_to_field: Mapping from field names to Field objects
+            existing_columns: Columns already processed
+            existing_fields: Fields already processed
+
+        Returns:
+            Tuple of (all columns including added ones, all corresponding fields)
+        """
+        all_columns = existing_columns.copy()
+        all_fields = existing_fields.copy()
+
+        # Add any missing fields from target schema with null values or past_default values
+        target_field_names = {
+            field.arrow.name for field in self.field_ids_to_fields.values()
+        }
+        table_field_names = set(pa_table.column_names)
+
+        for field_name in target_field_names - table_field_names:
+            field = field_name_to_field[field_name]
+
+            # Check if field has past_default value and use it instead of nulls
+            if field.past_default is not None:
+                # Create array filled with past_default value
+                default_column = pa.array(
+                    [field.past_default] * get_table_length(pa_table),
+                    type=field.arrow.type,
+                )
+                all_columns.append(default_column)
+            else:
+                # Use null values as before
+                null_column = pa.nulls(
+                    get_table_length(pa_table), type=field.arrow.type
+                )
+                all_columns.append(null_column)
+
+            all_fields.append(field.arrow)
+
+        return all_columns, all_fields
+
+    def _coerce_table_columns(
+        self, pa_table: pa.Table
+    ) -> Tuple[List[pa.Array], List[pa.Field]]:
+        """Process table columns using field coercion and add missing fields.
+
+        Args:
+            pa_table: PyArrow table to process
+
+        Returns:
+            Tuple of (list of coerced columns, list of corresponding fields)
+        """
+        # Create mapping from field names to Field objects
+        field_name_to_field = self._create_field_name_mapping()
+
+        # Process existing columns in the table
+        (
+            processed_columns,
+            processed_fields,
+        ) = self._process_existing_columns_for_coercion(pa_table, field_name_to_field)
+
+        # Add any missing fields from target schema
+        all_columns, all_fields = self._add_missing_fields_for_coercion(
+            pa_table, field_name_to_field, processed_columns, processed_fields
+        )
+
+        return all_columns, all_fields
+
+    def _reorder_columns_to_schema(
+        self, columns: List[pa.Array], fields: List[pa.Field], original_table: pa.Table
+    ) -> Tuple[List[pa.Array], List[pa.Field]]:
+        """Reorder columns to match schema order, preserving extra fields.
+
+        Args:
+            columns: List of processed columns
+            fields: List of corresponding field schemas
+            original_table: Original table for field name ordering
+
+        Returns:
+            Tuple of (reordered columns, reordered fields)
+        """
+        # Reorder columns to match schema order
+        reordered_columns = []
+        reordered_fields = []
+        schema_field_names = [
+            field.arrow.name for field in self.field_ids_to_fields.values()
+        ]
+
+        # Add schema fields in schema order
+        for field_name in schema_field_names:
+            for i, field in enumerate(fields):
+                if field.name == field_name:
+                    reordered_columns.append(columns[i])
+                    reordered_fields.append(field)
+                    break
+
+        # Add any extra fields that aren't in schema (preserve original order)
+        target_field_names = set(schema_field_names)
+        table_field_names = set(original_table.column_names)
+        extra_field_names = table_field_names - target_field_names
+
+        for field_name in original_table.column_names:
+            if field_name in extra_field_names:
+                for i, field in enumerate(fields):
+                    if field.name == field_name:
+                        reordered_columns.append(columns[i])
+                        reordered_fields.append(field)
+                        break
+
+        return reordered_columns, reordered_fields
+
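As an illustrative aside (not part of the package diff): a sketch of the expected end-to-end behavior of the coercion helpers above on a PyArrow table, assuming the Schema/Field APIs shown in this file; the field names, the past_default usage, and the expected promotion are illustrative assumptions rather than verified outputs.

    import pyarrow as pa
    from deltacat.storage.model.schema import Field, Schema  # assumed module path

    schema = Schema.of(
        [
            Field.of(pa.field("id", pa.int64())),
            Field.of(pa.field("status", pa.string()), past_default="unknown"),
        ]
    )
    table = pa.table({"extra": ["x"], "id": pa.array([1], pa.int32())})
    coerced = schema.coerce(table)
    # Per the helpers above one would expect: "id" kept (promoted if needed),
    # "status" backfilled with "unknown", and "extra" preserved after the schema columns.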
+    @staticmethod
+    def _del_subschema(
+        name: SchemaName,
+        subschemas: Dict[SchemaName, Schema],
+    ) -> Dict[SchemaName, Schema]:
+        deleted_subschema = subschemas.pop(name, None)
+        if deleted_subschema is None:
+            raise ValueError(f"Subschema `{name}` does not exist.")
+        return subschemas
+
+    @staticmethod
+    def _add_subschema(
+        name: SchemaName,
+        schema: SingleSchema,
+        subschemas: Dict[SchemaName, Schema],
+    ) -> Dict[SchemaName, Schema]:
+        Schema._validate_schema_name(name)
+        if name == BASE_SCHEMA_NAME:
+            raise ValueError(
+                f"Cannot add subschema with reserved name: {BASE_SCHEMA_NAME}"
+            )
+        if name in subschemas:
+            raise ValueError(f"Subschema `{name}` already exists.")
+        for key, val in subschemas.items():
+            subschemas[key] = val.arrow
+        subschemas[name] = schema
+        return subschemas
+
+
+class SchemaList(List[Schema]):
+    @staticmethod
+    def of(items: List[Schema]) -> SchemaList:
+        typed_items = SchemaList()
+        for item in items:
+            if item is not None and not isinstance(item, Schema):
+                item = Schema(item)
+            typed_items.append(item)
+        return typed_items
+
+    def __getitem__(self, item):
+        val = super().__getitem__(item)
+        if val is not None and not isinstance(val, Schema):
+            self[item] = val = Schema(val)
+        return val
+
+    def __iter__(self):
+        for i in range(len(self)):
+            yield self[i]  # This triggers __getitem__ conversion
+
+
+class SchemaUpdate(dict):
+    """
+    Provides safe schema evolution capabilities for DeltaCAT schemas.
+
+    SchemaUpdate allows users to:
+    1. Add new fields to a schema
+    2. Remove existing fields from a schema
+    3. Update existing fields with compatible changes
+    4. Validate schema compatibility to prevent breaking existing dataset consumers
+
+    The class enforces backward compatibility by default to ensure that table
+    consumer jobs written using PyArrow, Pandas, Polars, Ray Data, Daft, and other
+    dataset types continue to work after schema changes.
+
+    Example:
+        Using Schema.update():
+        >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
+        >>> new_field = Field.of(pa.field("name", pa.string()))
+        >>> updated_schema = (schema.update()
+        ...     .add_field("name", new_field)
+        ...     .apply())
+
+        Using SchemaUpdate.of():
+        >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
+        >>> update = SchemaUpdate.of(schema)
+        >>> new_field = Field.of(pa.field("name", pa.string()))
+        >>> updated_schema = update.add_field("name", new_field).apply()
+    """
+
+    @staticmethod
+    def of(
+        base_schema: Schema, allow_incompatible_changes: bool = False
+    ) -> SchemaUpdate:
+        """
+        Create a SchemaUpdate for the given base schema.
+
+        Args:
+            base_schema: The original schema to update
+            allow_incompatible_changes: If True, allows changes that may break
+                backward compatibility. If False (default), raises SchemaCompatibilityError
+                for incompatible changes.
+
+        Returns:
+            A new SchemaUpdate instance
+        """
+        return SchemaUpdate(
+            {
+                "baseSchema": base_schema,
+                "allowIncompatibleChanges": allow_incompatible_changes,
+                "operations": SchemaUpdateOperations.of([]),
+            }
+        )
+
+    @property
+    def base_schema(self) -> Schema:
+        """Get the base schema being updated."""
+        return self["baseSchema"]
+
+    @base_schema.setter
+    def base_schema(self, value: Schema) -> None:
+        """Set the base schema being updated."""
+        self["baseSchema"] = value
+
+    @property
+    def allow_incompatible_changes(self) -> bool:
+        """Get whether incompatible changes are allowed."""
+        return self["allowIncompatibleChanges"]
+
+    @allow_incompatible_changes.setter
+    def allow_incompatible_changes(self, value: bool) -> None:
+        """Set whether incompatible changes are allowed."""
+        self["allowIncompatibleChanges"] = value
+
+    @property
+    def operations(self) -> SchemaUpdateOperations:
+        """Get the list of pending operations."""
+        return self["operations"]
+
+    @operations.setter
+    def operations(self, value: SchemaUpdateOperations) -> None:
+        """Set the list of pending operations."""
+        self["operations"] = value
+
+    def add_field(
+        self,
+        new_field: Field,
+    ) -> SchemaUpdate:
+        """
+        Add a new field to the schema.
+
+        Args:
+            field_locator: Location identifier for the new field (name, nested path, or ID)
+            new_field: The Field object to add
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field already exists or addition would break compatibility
+        """
+        self.operations.append(SchemaUpdateOperation.add_field(new_field))
+        return self
+
+    def remove_field(self, field_locator: FieldLocator) -> SchemaUpdate:
+        """
+        Remove an existing field from the schema.
+
+        Args:
+            field_locator: Location identifier for the field to remove
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist or removal would break compatibility
+        """
+        self.operations.append(SchemaUpdateOperation.remove_field(field_locator))
+        return self
+
+    def rename_field(
+        self,
+        field_locator: FieldLocator,
+        new_name: str,
+    ) -> SchemaUpdate:
+        """
+        Rename an existing field while keeping all other properties the same.
+
+        Args:
+            field_locator: Location identifier for the field to rename
+            new_name: The new name for the field
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist or rename would break compatibility
+        """
+        # Get the existing field
+        existing_field = self._get_existing_field(field_locator)
+
+        # Create a deep copy of the field
+        updated_field = copy.deepcopy(existing_field)
+
+        # Update only the arrow field name
+        updated_field["arrow"] = pa.field(
+            new_name,
+            existing_field.arrow.type,
+            nullable=existing_field.arrow.nullable,
+            metadata=existing_field.arrow.metadata,
+        )
+
+        return self._update_field(field_locator, updated_field)
+
+    def update_field_type(
+        self, field_locator: FieldLocator, new_type: pa.DataType
+    ) -> SchemaUpdate:
+        """
+        Update the PyArrow data type of an existing field while keeping all other properties the same.
+
+        Args:
+            field_locator: Location identifier for the field to update
+            new_type: The new PyArrow data type for the field
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist or type change would break compatibility
+        """
+        # Get the existing field
+        existing_field = self._get_existing_field(field_locator)
+
+        # Create a deep copy of the field
+        updated_field = copy.deepcopy(existing_field)
+
+        # Update only the arrow field type
+        updated_field["arrow"] = pa.field(
+            existing_field.arrow.name,
+            new_type,
+            nullable=existing_field.arrow.nullable,
+            metadata=existing_field.arrow.metadata,
+        )
+
+        return self._update_field(field_locator, updated_field)
+
+    def update_field_doc(
+        self,
+        field_locator: FieldLocator,
+        new_doc: Optional[str],
+    ) -> SchemaUpdate:
+        """
+        Update the documentation of an existing field while keeping all other properties the same.
+
+        Args:
+            field_locator: Location identifier for the field to update
+            new_doc: The new documentation string for the field
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist
+        """
+        # Get the existing field
+        existing_field = self._get_existing_field(field_locator)
+
+        # Create a deep copy of the field
+        updated_field = copy.deepcopy(existing_field)
+
+        # Update the arrow field metadata to set/remove doc
+        new_metadata = copy.deepcopy(existing_field.arrow.metadata)
+        new_metadata.pop(FIELD_DOC_KEY_NAME, None)
+        if new_doc is not None:
+            new_metadata[FIELD_DOC_KEY_NAME] = new_doc
+
+        updated_field["arrow"] = pa.field(
+            existing_field.arrow.name,
+            existing_field.arrow.type,
+            nullable=existing_field.arrow.nullable,
+            metadata=new_metadata if new_metadata else None,
+        )
+
+        return self._update_field(field_locator, updated_field)
+
+    def update_field_nullability(
+        self, field_locator: FieldLocator, nullable: bool
+    ) -> SchemaUpdate:
+        """
+        Update the nullability of an existing field while keeping all other properties the same.
+
+        Args:
+            field_locator: Location identifier for the field to update
+            nullable: Whether the field should allow null values
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist or nullability change would break compatibility
+        """
+        # Get the existing field
+        existing_field = self._get_existing_field(field_locator)
+
+        # Create a deep copy of the field
+        updated_field = copy.deepcopy(existing_field)
+
+        # Update only the arrow field nullability
+        updated_field["arrow"] = pa.field(
+            existing_field.arrow.name,
+            existing_field.arrow.type,
+            nullable=nullable,
+            metadata=existing_field.arrow.metadata,
+        )
+
+        return self._update_field(field_locator, updated_field)
+
+    def update_field_consistency_type(
+        self,
+        field_locator: FieldLocator,
+        consistency_type: Optional[SchemaConsistencyType],
+    ) -> SchemaUpdate:
+        """
+        Update the schema consistency type of an existing field while keeping all other properties the same.
+
+        Args:
+            field_locator: Location identifier for the field to update
+            consistency_type: The new schema consistency type for the field
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist
+        """
+        # Get the existing field
+        existing_field = self._get_existing_field(field_locator)
+
+        # Create a deep copy of the field
+        updated_field = copy.deepcopy(existing_field)
+
+        # Update the arrow field metadata to set/remove consistency type
+        new_metadata = copy.deepcopy(existing_field.arrow.metadata)
+        new_metadata.pop(FIELD_CONSISTENCY_TYPE_KEY_NAME, None)
+
+        if consistency_type is not None:
+            new_metadata[FIELD_CONSISTENCY_TYPE_KEY_NAME] = consistency_type.value
+
+        updated_field["arrow"] = pa.field(
+            existing_field.arrow.name,
+            existing_field.arrow.type,
+            nullable=existing_field.arrow.nullable,
+            metadata=new_metadata if new_metadata else None,
+        )
+
+        return self._update_field(field_locator, updated_field)
+
+    def update_field_future_default(
+        self, field_locator: FieldLocator, future_default: Optional[Any]
+    ) -> SchemaUpdate:
+        """
+        Update the future default value of an existing field while keeping all other properties the same.
+        The future default is validated to ensure it's compatible with the field's data type.
+
+        Args:
+            field_locator: Location identifier for the field to update
+            future_default: The new future default value for the field
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist
+            ValueError: If future_default is not compatible with the field's data type
+        """
+        # Get the existing field
+        existing_field = self._get_existing_field(field_locator)
+
+        # Validate that the future_default is compatible with the field's type
+        if future_default is not None:
+            self._validate_default_value(existing_field.arrow.type, future_default)
+
+        # Create a deep copy of the field
+        updated_field = copy.deepcopy(existing_field)
+
+        # Update the arrow field metadata to set/remove future default
+        new_metadata = copy.deepcopy(existing_field.arrow.metadata)
+        new_metadata.pop(FIELD_FUTURE_DEFAULT_KEY_NAME, None)
+
+        if future_default is not None:
+            new_metadata[FIELD_FUTURE_DEFAULT_KEY_NAME] = _encode_metadata_value(
+                future_default
+            )
+
+        updated_field["arrow"] = pa.field(
+            existing_field.arrow.name,
+            existing_field.arrow.type,
+            nullable=existing_field.arrow.nullable,
+            metadata=new_metadata if new_metadata else None,
+        )
+
+        return self._update_field(field_locator, updated_field)
+
+    def _update_field(
+        self, field_locator: FieldLocator, updated_field: Field
+    ) -> SchemaUpdate:
+        """
+        Update an existing field with compatible changes. This is the protected method
+        that handles the general case of field updates.
+
+        Args:
+            field_locator: Location identifier for the field to update
+            updated_field: The new Field object to replace the existing field
+
+        Returns:
+            Self for method chaining
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist or update would break compatibility
+        """
+        self.operations.append(
+            SchemaUpdateOperation.update_field(field_locator, updated_field)
+        )
+        return self
+
+    def _get_existing_field(self, field_locator: FieldLocator) -> Field:
+        """
+        Helper method to retrieve an existing field, accounting for pending operations.
+
+        Args:
+            field_locator: Location identifier for the field to retrieve
+
+        Returns:
+            The existing Field object (with any pending updates applied)
+
+        Raises:
+            SchemaCompatibilityError: If field doesn't exist
+        """
+        field_name = self._get_field_name(field_locator)
+        # Search for the field in the base schema
+        base_field = None
+        for field in self.base_schema.fields:
+            field_field_name = field.path[0] if field.path else f"field_{field.id}"
+            if field_field_name == field_name:
+                base_field = field
+                break
+
+        if base_field is None:
+            # Field not found
+            raise SchemaCompatibilityError(
+                f"Field '{field_name}' does not exist in schema", field_locator
+            )
+
+        # Apply any pending operations that affect this field to get the current state
+        current_field = copy.deepcopy(base_field)
+
+        for operation in self.operations:
+            if operation.field_locator_matches(field_locator):
+                # Apply this operation to get the cumulative state
+                current_field = operation.field
+
+        return current_field
+
+    def _validate_default_value(
+        self, arrow_type: pa.DataType, default_value: Any
+    ) -> None:
+        """
+        Helper method to validate that a default value is compatible with a PyArrow data type.
+
+        Args:
+            arrow_type: The PyArrow data type to validate against
+            default_value: The default value to validate
+
+        Raises:
+            ValueError: If the default value is not compatible with the data type
+        """
+        try:
+            # Try to create a PyArrow array with the default value to validate compatibility
+            pa.array([default_value], type=arrow_type)
+        except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError, ValueError) as e:
+            raise ValueError(
+                f"Default value {default_value} is not compatible with type {arrow_type}: {e}"
+            )
+
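As an illustrative aside (not part of the package diff): the one-element-array check used by _validate_default_value, shown standalone with plain PyArrow.

    import pyarrow as pa

    pa.array([0], type=pa.int64())  # succeeds: 0 is representable as int64
    try:
        pa.array(["oops"], type=pa.int64())
    except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError, ValueError):
        pass  # incompatible default detected, mirroring the ValueError raised above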
+    def apply(self) -> Schema:
+        """
+        Apply all pending operations and return the updated schema.
+
+        Returns:
+            New Schema object with all updates applied
+
+        Raises:
+            SchemaCompatibilityError: If any operation would break backward compatibility
+                and allow_incompatible_changes is False
+        """
+        # Start with a copy of the base schema
+        updated_fields = list(self.base_schema.fields)
+        field_name_to_index = {
+            field.path[0] if field.path else f"field_{field.id}": i
+            for i, field in enumerate(updated_fields)
+        }
+
+        # Track next available field ID for new fields during schema evolution
+        next_available_field_id = self.base_schema.max_field_id + 1
+        if next_available_field_id >= MAX_FIELD_ID_EXCLUSIVE:
+            # Just raise an error instead of wrapping to 0, since this
+            # breaks our guarantee of unique field IDs across schema
+            # evolution history (e.g., we may overflow on a schema with IDs
+            # 0-1MM or 2, 10, etc. already assigned).
+            raise SchemaCompatibilityError(
+                f"Schema Field ID overflow: {next_available_field_id} >= {MAX_FIELD_ID_EXCLUSIVE}",
+            )
+
+        # Validate no conflicting operations before applying
+        self._validate_no_conflicting_operations()
+
+        # Apply operations in order
+        for operation in self.operations:
+            if operation.operation == "add":
+                next_available_field_id = self._apply_add_field(
+                    updated_fields,
+                    field_name_to_index,
+                    operation.field,
+                    next_available_field_id,
+                )
+            elif operation.operation == "remove":
+                self._apply_remove_field(
+                    updated_fields,
+                    field_name_to_index,
+                    operation.field_locator,
+                )
+            elif operation.operation == "update":
+                self._apply_update_field(
+                    updated_fields,
+                    field_name_to_index,
+                    operation.field_locator,
+                    operation.field,
+                )
+
+        # Create new schema from updated fields with incremented schema ID
+        new_schema = Schema.of(updated_fields, schema_id=self.base_schema.id + 1)
+
+        # Ensure max_field_id never decreases, even when fields are removed
+        # This prevents field ID reuse across schema evolution history
+        if new_schema.max_field_id < self.base_schema.max_field_id:
+            new_schema["maxFieldId"] = self.base_schema.max_field_id
+
+        return new_schema
+
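As an illustrative aside (not part of the package diff): a minimal chained-update sketch based on the SchemaUpdate methods above, assuming the module path deltacat.storage.model.schema and that these particular changes pass the default compatibility checks.

    import pyarrow as pa
    from deltacat.storage.model.schema import Field, Schema  # assumed module path

    schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
    evolved = (
        schema.update()  # presumably wraps SchemaUpdate.of(schema)
        .add_field(Field.of(pa.field("name", pa.string())))
        .update_field_doc("id", "primary key")
        .apply()
    )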
+    def _validate_no_conflicting_operations(self) -> None:
+        """Validate that operations don't conflict with each other."""
+        field_operations = {}  # field_name -> set of operations
+
+        for operation in self.operations:
+            field_name = None
+
+            if operation.operation == "add" and operation.field:
+                field_name = operation.field.arrow.name
+            elif (
+                operation.operation in ("remove", "update") and operation.field_locator
+            ):
+                # Extract field name from locator
+                if isinstance(operation.field_locator, str):
+                    field_name = operation.field_locator
+                elif hasattr(operation.field_locator, "name"):
+                    field_name = operation.field_locator.name
+                elif (
+                    isinstance(operation.field_locator, list)
+                    and operation.field_locator
+                ):
+                    field_name = operation.field_locator[0]
+
+            if field_name:
+                if field_name not in field_operations:
+                    field_operations[field_name] = set()
+                field_operations[field_name].add(operation.operation)
+
+        # Check for conflicting operations on same field
+        for field_name, operations in field_operations.items():
+            if len(operations) > 1:
+                unique_ops = set(operations)
+                # Allow multiple update operations on same field (they are cumulative)
+                if unique_ops == {"update"}:
+                    continue  # Multiple updates on same field are allowed
+                # Any other combination is conflicting
+                message_suffix = f"Cannot perform {', '.join(sorted(unique_ops))} operations on the same field"
+
+                raise ValueError(
+                    f"Conflicting operations detected on field '{field_name}': {sorted(operations)}. "
+                    f"{message_suffix}."
+                )
+
+    def _apply_add_field(
+        self,
+        fields: List[Field],
+        field_name_to_index: Dict[str, int],
+        new_field: Field,
+        next_available_field_id: int,
+    ) -> int:
+        """Apply add field operation with compatibility validation.
+
+        Args:
+            fields: List of existing fields to append to
+            field_name_to_index: Mapping of field names to indices
+            new_field: The field to add (user-specified field_id will be ignored)
+            next_available_field_id: The next available field ID to assign
+
+        Returns:
+            The next available field ID for subsequent operations
+        """
+        field_name = new_field.arrow.name
+
+        # Check if field already exists
+        if field_name in field_name_to_index:
+            raise SchemaCompatibilityError(
+                f"Field '{field_name}' already exists in schema",
+            )
+
+        # Validate compatibility for new field
+        if not self.allow_incompatible_changes:
+            self._validate_add_field_compatibility(new_field)
+
+        # For add operations, ignore user-specified field ID and auto-assign
+        auto_assigned_field_id = next_available_field_id
+
+        # Create a copy of the field with auto-assigned field ID and correct path
+        field_with_auto_id = Field.of(
+            new_field.arrow,
+            field_id=auto_assigned_field_id,
+            is_merge_key=new_field.is_merge_key,
+            merge_order=new_field.merge_order,
+            is_event_time=new_field.is_event_time,
+            doc=new_field.doc,
+            past_default=new_field.past_default,
+            future_default=new_field.future_default,
+            consistency_type=new_field.consistency_type,
+            path=[field_name],
+            native_object=new_field.native_object,
+        )
+
+        # Add the field
+        fields.append(field_with_auto_id)
+        field_name_to_index[field_name] = len(fields) - 1
+
+        # Return next available field ID
+        return next_available_field_id + 1
+
+    def _apply_remove_field(
+        self,
+        fields: List[Field],
+        field_name_to_index: Dict[str, int],
+        field_locator: FieldLocator,
+    ) -> None:
+        """Apply remove field operation with compatibility validation."""
+        field_name = self._get_field_name(field_locator)
+
+        # Check if field exists
+        if field_name not in field_name_to_index:
+            raise SchemaCompatibilityError(
+                f"Field '{field_name}' does not exist in schema", field_locator
+            )
+
+        # Validate compatibility for field removal
+        if not self.allow_incompatible_changes:
+            field_index = field_name_to_index[field_name]
+            self._validate_remove_field_compatibility(
+                fields[field_index], field_locator
+            )
+
+        # Remove the field
+        field_index = field_name_to_index[field_name]
+        fields.pop(field_index)
+
+        # Update indices
+        del field_name_to_index[field_name]
+        for name, index in field_name_to_index.items():
+            if index > field_index:
+                field_name_to_index[name] = index - 1
+
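After popping a field, every index greater than the removed position must be shifted down by one so `field_name_to_index` stays aligned with the field list. A standalone sketch of that re-indexing step (plain Python, illustrative names):

```python
fields = ["id", "email", "created_at"]
name_to_index = {"id": 0, "email": 1, "created_at": 2}

removed = name_to_index.pop("email")
fields.pop(removed)
# Shift every index that sat after the removed position.
for name, idx in name_to_index.items():
    if idx > removed:
        name_to_index[name] = idx - 1

print(fields, name_to_index)  # ['id', 'created_at'] {'id': 0, 'created_at': 1}
```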
+    def _apply_update_field(
+        self,
+        fields: List[Field],
+        field_name_to_index: Dict[str, int],
+        field_locator: FieldLocator,
+        updated_field: Field,
+    ) -> None:
+        """Apply update field operation with compatibility validation."""
+        field_name = self._get_field_name(field_locator)
+
+        # Check if field exists
+        if field_name not in field_name_to_index:
+            raise SchemaCompatibilityError(
+                f"Field '{field_name}' does not exist in schema", field_locator
+            )
+
+        field_index = field_name_to_index[field_name]
+        old_field = fields[field_index]
+
+        # Validate compatibility for field update
+        if not self.allow_incompatible_changes:
+            self._validate_update_field_compatibility(
+                old_field, updated_field, field_locator
+            )
+
+        # Get the new field name from the updated field
+        new_field_name = updated_field.arrow.name
+
+        # Create a copy of the updated field with the correct path
+        field_with_path = Field.of(
+            updated_field.arrow,
+            field_id=updated_field.id,
+            is_merge_key=updated_field.is_merge_key,
+            merge_order=updated_field.merge_order,
+            is_event_time=updated_field.is_event_time,
+            doc=updated_field.doc,
+            past_default=updated_field.past_default,
+            future_default=updated_field.future_default,
+            consistency_type=updated_field.consistency_type,
+            path=[new_field_name],
+            native_object=updated_field.native_object,
+        )
+
+        # Update the field
+        fields[field_index] = field_with_path
+
+        # If field name changed (rename), update the mapping
+        if field_name != new_field_name:
+            del field_name_to_index[field_name]
+            field_name_to_index[new_field_name] = field_index
+
+    def _get_field_name(self, field_locator: FieldLocator) -> str:
+        """Extract field name from various field locator types."""
+        if isinstance(field_locator, str):
+            return field_locator
+        elif isinstance(field_locator, list):
+            return field_locator[0] if field_locator else ""
+        elif isinstance(field_locator, int):
+            # For field ID, try to find the corresponding field
+            try:
+                field = self.base_schema.field(field_locator)
+                return field.path[0] if field.path else f"field_{field_locator}"
+            except Exception:
+                return f"field_{field_locator}"
+        else:
+            raise ValueError(f"Invalid field locator type: {type(field_locator)}")
+
+    @staticmethod
+    def _field_locators_match(locator1: FieldLocator, locator2: FieldLocator) -> bool:
+        """Check if two field locators refer to the same field."""
+        # For simplicity, convert both to string names and compare
+        # This works because we primarily use field names in our operations
+        if isinstance(locator1, str) and isinstance(locator2, str):
+            return locator1 == locator2
+        elif isinstance(locator1, list) and isinstance(locator2, list):
+            return locator1 == locator2
+        elif isinstance(locator1, int) and isinstance(locator2, int):
+            return locator1 == locator2
+        else:
+            # Convert to strings and compare (this is a simplified approach)
+            str1 = (
+                locator1
+                if isinstance(locator1, str)
+                else (
+                    locator1[0]
+                    if isinstance(locator1, list) and locator1
+                    else str(locator1)
+                )
+            )
+            str2 = (
+                locator2
+                if isinstance(locator2, str)
+                else (
+                    locator2[0]
+                    if isinstance(locator2, list) and locator2
+                    else str(locator2)
+                )
+            )
+            return str1 == str2
+
+    def _validate_add_field_compatibility(self, new_field: Field) -> None:
+        """Validate that adding a new field won't break compatibility."""
+        field_name = new_field.arrow.name
+        arrow_field = new_field.arrow
+
+        # Check if field is nullable or has default values
+        is_nullable = arrow_field.nullable
+        has_past_default = new_field.past_default is not None
+        has_future_default = new_field.future_default is not None
+
+        if not (is_nullable or has_past_default or has_future_default):
+            raise SchemaCompatibilityError(
+                f"Adding non-nullable field '{field_name}' without "
+                f"default values would break compatibility with existing data",
+            )
+
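The add-field check reduces to: a new column is only backward compatible if old rows can be given a value for it, i.e. it is nullable or carries a past/future default. A sketch of that predicate with plain PyArrow fields (the default arguments are illustrative stand-ins for `past_default`/`future_default`):

```python
import pyarrow as pa

def is_safe_to_add(arrow_field, past_default=None, future_default=None):
    # Nullable, or at least one default value, means existing data can be backfilled.
    return arrow_field.nullable or past_default is not None or future_default is not None

print(is_safe_to_add(pa.field("score", pa.float64())))                                    # True (nullable by default)
print(is_safe_to_add(pa.field("score", pa.float64(), nullable=False)))                    # False -> would raise
print(is_safe_to_add(pa.field("score", pa.float64(), nullable=False), past_default=0.0))  # True
```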
+    def _validate_remove_field_compatibility(
+        self, field: Field, field_locator: FieldLocator
+    ) -> None:
+        """Validate that removing a field won't break compatibility."""
+        field_name = self._get_field_name(field_locator)
+
+        # Check for protected field types that should never be removed
+        if field.is_merge_key:
+            raise SchemaCompatibilityError(
+                f"Cannot remove merge key field '{field_name}'. "
+                f"Merge keys are critical for data integrity and cannot be removed.",
+                field_locator,
+            )
+
+        if field.merge_order is not None:
+            raise SchemaCompatibilityError(
+                f"Cannot remove merge order field '{field_name}'. "
+                f"Fields with merge_order are critical for data ordering and cannot be removed.",
+                field_locator,
+            )
+
+        if field.is_event_time:
+            raise SchemaCompatibilityError(
+                f"Cannot remove event time field '{field_name}'. "
+                f"Event time fields are critical for temporal operations and cannot be removed.",
+                field_locator,
+            )
+
+        # Removing fields generally breaks compatibility for consumers expecting them
+        raise SchemaCompatibilityError(
+            f"Removing field '{field_name}' would break compatibility with existing consumers. "
+            f"Set allow_incompatible_changes=True to force removal.",
+            field_locator,
+        )
+
+    def _validate_update_field_compatibility(
+        self, old_field: Field, new_field: Field, field_locator: FieldLocator
+    ) -> None:
+        """Validate that updating a field won't break compatibility."""
+        old_arrow = old_field.arrow
+        new_arrow = new_field.arrow
+        field_name = self._get_field_name(field_locator)
+
+        # Protect critical field attributes that should never be changed
+        if old_field.is_merge_key != new_field.is_merge_key:
+            raise SchemaCompatibilityError(
+                f"Cannot change merge key status for field '{field_name}'. "
+                f"Merge key designation is critical for data integrity and cannot be modified.",
+                field_locator,
+            )
+
+        if old_field.merge_order != new_field.merge_order:
+            raise SchemaCompatibilityError(
+                f"Cannot change merge order for field '{field_name}'. "
+                f"Merge order is critical for data consistency and cannot be modified.",
+                field_locator,
+            )
+
+        if old_field.is_event_time != new_field.is_event_time:
+            raise SchemaCompatibilityError(
+                f"Cannot change event time status for field '{field_name}'. "
+                f"Event time designation is critical for temporal operations and cannot be modified.",
+                field_locator,
+            )
+
+        # Validate schema consistency type evolution rules
+        self._validate_consistency_type_evolution(old_field, new_field, field_locator)
+
+        # Protect past_default immutability
+        if old_field.past_default != new_field.past_default:
+            raise SchemaCompatibilityError(
+                f"Cannot change past_default for field '{field_name}'. "
+                f"The past_default value is immutable once set to maintain data consistency.",
+                field_locator,
+            )
+
+        # Check for duplicate field IDs (if field ID is being changed)
+        if old_field.id != new_field.id and new_field.id is not None:
+            existing_field_ids = {
+                f.id
+                for f in self.base_schema.fields
+                if f.id is not None and f != old_field
+            }
+            if new_field.id in existing_field_ids:
+                raise SchemaCompatibilityError(
+                    f"Cannot update field '{field_name}' to use duplicate field ID {new_field.id}. "
+                    f"Field IDs must be unique across all fields in the schema.",
+                    field_locator,
+                )
+
+        # Check data type compatibility
+        if not self._is_type_compatible(old_arrow.type, new_arrow.type):
+            raise SchemaCompatibilityError(
+                f"Cannot change field '{field_name}' from {old_arrow.type} to {new_arrow.type}. "
+                f"This change would break compatibility with PyArrow, Pandas, Polars, Ray Data, and Daft.",
+                field_locator,
+            )
+
+        # Check nullability - making a field non-nullable is incompatible
+        if old_arrow.nullable and not new_arrow.nullable:
+            # Only allow if we have past/future defaults to fill null values
+            has_past_default = new_field.past_default is not None
+            has_future_default = new_field.future_default is not None
+
+            if not (has_past_default and has_future_default):
+                raise SchemaCompatibilityError(
+                    f"Cannot make nullable field '{field_name}' non-nullable without "
+                    f"providing both past_default and future_default values",
+                    field_locator,
+                )
+
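Unlike the add-field rule, where nullability or any one default suffices, tightening an existing column from nullable to non-nullable requires both a past_default (to backfill existing nulls) and a future_default (to fill incoming writes). A quick truth-table sketch of that final condition:

```python
def can_drop_nullability(has_past_default, has_future_default):
    # Mirrors the last check in _validate_update_field_compatibility: both defaults required.
    return has_past_default and has_future_default

for past, future in [(False, False), (True, False), (False, True), (True, True)]:
    verdict = "allowed" if can_drop_nullability(past, future) else "SchemaCompatibilityError"
    print(f"past_default={past}, future_default={future} -> {verdict}")
```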
+    def _validate_consistency_type_evolution(
+        self, old_field: Field, new_field: Field, field_locator: FieldLocator
+    ) -> None:
+        """
+        Validate schema consistency type evolution rules.
+
+        Allowed transitions:
+        - COERCE -> VALIDATE
+        - VALIDATE -> COERCE
+        - COERCE -> NONE
+        - VALIDATE -> NONE
+
+        Forbidden transitions:
+        - NONE -> COERCE
+        - NONE -> VALIDATE
+        """
+        old_type = old_field.consistency_type
+        new_type = new_field.consistency_type
+        field_name = self._get_field_name(field_locator)
+
+        # If types are the same, no validation needed
+        if old_type == new_type:
+            return
+
+        # Handle None values (treat as no consistency type set)
+        if old_type is None and new_type is None:
+            return
+
+        # Allow transitions from any type to NONE (relaxing constraints)
+        if new_type == SchemaConsistencyType.NONE or new_type is None:
+            return
+
+        # Allow transitions between COERCE and VALIDATE (bidirectional)
+        if old_type in (
+            SchemaConsistencyType.COERCE,
+            SchemaConsistencyType.VALIDATE,
+        ) and new_type in (
+            SchemaConsistencyType.COERCE,
+            SchemaConsistencyType.VALIDATE,
+        ):
+            return
+
+        # Allow transitions from None to COERCE or VALIDATE (adding constraints)
+        if old_type is None and new_type in (
+            SchemaConsistencyType.COERCE,
+            SchemaConsistencyType.VALIDATE,
+        ):
+            return
+
+        # Forbid transitions from NONE to COERCE or VALIDATE (tightening constraints)
+        if old_type == SchemaConsistencyType.NONE and new_type in (
+            SchemaConsistencyType.COERCE,
+            SchemaConsistencyType.VALIDATE,
+        ):
+            raise SchemaCompatibilityError(
+                f"Cannot change consistency type for field '{field_name}' from {old_type.value} to {new_type.value}. "
+                f"Transitioning from NONE to {new_type.value} would tighten validation constraints "
+                f"and potentially break existing data processing.",
+                field_locator,
+            )
+
+        # If we get here, it's an unexpected combination
+        raise SchemaCompatibilityError(
+            f"Invalid consistency type transition for field '{field_name}' from "
+            f"{old_type.value if old_type else 'None'} to {new_type.value if new_type else 'None'}.",
+            field_locator,
+        )
+
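The transition rules amount to a small table: any type may relax to NONE (or unset), COERCE and VALIDATE are interchangeable, an unset type may adopt either, but NONE may never tighten back to COERCE or VALIDATE. A standalone sketch using a stand-in enum (not the actual deltacat `SchemaConsistencyType` import):

```python
from enum import Enum

class ConsistencyType(Enum):  # stand-in for deltacat's SchemaConsistencyType
    NONE = "none"
    COERCE = "coerce"
    VALIDATE = "validate"

def transition_allowed(old, new):
    if old == new or new in (None, ConsistencyType.NONE):
        return True                      # relaxing or unchanged
    if old is None:
        return True                      # unset -> COERCE/VALIDATE is fine
    return old in (ConsistencyType.COERCE, ConsistencyType.VALIDATE) and new in (
        ConsistencyType.COERCE,
        ConsistencyType.VALIDATE,
    )

print(transition_allowed(ConsistencyType.COERCE, ConsistencyType.VALIDATE))  # True
print(transition_allowed(ConsistencyType.NONE, ConsistencyType.VALIDATE))    # False -> would raise
```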
+    def _is_type_compatible(self, old_type: pa.DataType, new_type: pa.DataType) -> bool:
+        """
+        Check if changing from old_type to new_type is backward compatible.
+
+        Compatible changes include:
+        - Same type
+        - Widening numeric types (int32 -> int64, float32 -> float64)
+        - Making string/binary types longer
+        - Adding fields to struct types
+        - Making list/map value types more permissive
+        """
+        # Same type is always compatible
+        if old_type.equals(new_type):
+            return True
+
+        # Numeric type widening
+        if pa.types.is_integer(old_type) and pa.types.is_integer(new_type):
+            # Check bit width and signedness using string representation
+            old_signed = "int" in str(old_type) and "uint" not in str(old_type)
+            new_signed = "int" in str(new_type) and "uint" not in str(new_type)
+            return new_type.bit_width >= old_type.bit_width and old_signed == new_signed
+
+        if pa.types.is_floating(old_type) and pa.types.is_floating(new_type):
+            return new_type.bit_width >= old_type.bit_width
+
+        # Integer to float promotion
+        if pa.types.is_integer(old_type) and pa.types.is_floating(new_type):
+            return True
+
+        # String/binary type compatibility
+        if pa.types.is_string(old_type) and pa.types.is_string(new_type):
+            return True
+        if pa.types.is_binary(old_type) and pa.types.is_binary(new_type):
+            return True
+
+        # Struct type compatibility (new fields can be added)
+        if pa.types.is_struct(old_type) and pa.types.is_struct(new_type):
+            old_names = {field.name for field in old_type}
+            new_names = {field.name for field in new_type}
+
+            # All old fields must exist in new type
+            if not old_names.issubset(new_names):
+                return False
+
+            # Check compatibility of common fields
+            for old_field in old_type:
+                new_field = new_type.field(old_field.name)
+                if not self._is_type_compatible(old_field.type, new_field.type):
+                    return False
+
+            return True
+
+        # List type compatibility
+        if pa.types.is_list(old_type) and pa.types.is_list(new_type):
+            return self._is_type_compatible(old_type.value_type, new_type.value_type)
+
+        # Map type compatibility
+        if pa.types.is_map(old_type) and pa.types.is_map(new_type):
+            return self._is_type_compatible(
+                old_type.key_type, new_type.key_type
+            ) and self._is_type_compatible(old_type.item_type, new_type.item_type)
+
+        # Default: types are incompatible
+        return False