deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
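The largest single addition in this release is the new `deltacat/storage/model/schema.py` module, shown in the hunk below. For orientation only, here is a minimal sketch of how the `Field` and `Schema` constructors defined in that module might be used; the import path is assumed from the file listing above, and exact behavior should be confirmed against the released package:

```python
import pyarrow as pa

# Import path assumed from the file listing above (deltacat/storage/model/schema.py).
from deltacat.storage.model.schema import Field, Schema

# Wrap plain Arrow fields with DeltaCAT metadata (field IDs, merge keys, docs).
fields = [
    Field.of(pa.field("id", pa.int64()), field_id=1, is_merge_key=True),
    Field.of(pa.field("value", pa.string()), field_id=2, doc="payload column"),
]

# Build a DeltaCAT schema backed by a unified Arrow schema.
schema = Schema.of(fields)

# The underlying Arrow schema carries the DeltaCAT field metadata.
print(schema.arrow)
```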
deltacat/storage/model/schema.py (new file)
@@ -0,0 +1,3160 @@
|
|
1
|
+
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
|
+
from __future__ import annotations
|
3
|
+
|
4
|
+
import logging
|
5
|
+
import copy
|
6
|
+
import base64
|
7
|
+
|
8
|
+
import msgpack
|
9
|
+
from typing import Optional, Any, Dict, Union, List, Callable, Tuple, TYPE_CHECKING
|
10
|
+
|
11
|
+
import pyarrow as pa
|
12
|
+
from pyarrow import ArrowInvalid
|
13
|
+
import pandas as pd
|
14
|
+
import numpy as np
|
15
|
+
|
16
|
+
# Daft DataFrame support - required for core functionality
|
17
|
+
import daft
|
18
|
+
from daft import DataFrame as DaftDataFrame
|
19
|
+
|
20
|
+
from deltacat.constants import BYTES_PER_KIBIBYTE
|
21
|
+
from deltacat.exceptions import (
|
22
|
+
SchemaCompatibilityError,
|
23
|
+
SchemaValidationError,
|
24
|
+
)
|
25
|
+
from deltacat.storage.model.types import (
|
26
|
+
SchemaConsistencyType,
|
27
|
+
SortOrder,
|
28
|
+
NullOrder,
|
29
|
+
)
|
30
|
+
from deltacat.types.tables import (
|
31
|
+
get_table_length,
|
32
|
+
to_pyarrow,
|
33
|
+
from_pyarrow,
|
34
|
+
get_dataset_type,
|
35
|
+
SchemaEvolutionMode,
|
36
|
+
)
|
37
|
+
from deltacat.types.media import DatasetType
|
38
|
+
|
39
|
+
if TYPE_CHECKING:
|
40
|
+
from deltacat.storage.model.sort_key import SortKey
|
41
|
+
|
42
|
+
from deltacat import logs
|
43
|
+
|
44
|
+
|
45
|
+
# PyArrow Field Metadata Key used to set the Field ID when writing to Parquet.
|
46
|
+
# See: https://arrow.apache.org/docs/cpp/parquet.html#parquet-field-id
|
47
|
+
PARQUET_FIELD_ID_KEY_NAME = b"PARQUET:field_id"
|
48
|
+
|
49
|
+
# PyArrow Field Metadata Key used to store field documentation.
|
50
|
+
FIELD_DOC_KEY_NAME = b"DELTACAT:doc"
|
51
|
+
|
52
|
+
# PyArrow Field Metadata Key used to identify the field as a merge key.
|
53
|
+
FIELD_MERGE_KEY_NAME = b"DELTACAT:merge_key"
|
54
|
+
|
55
|
+
# PyArrow Field Metadata Key used to identify the field as a merge order key.
|
56
|
+
FIELD_MERGE_ORDER_KEY_NAME = b"DELTACAT:merge_order"
|
57
|
+
|
58
|
+
# PyArrow Field Metadata Key used to identify the field as an event time.
|
59
|
+
FIELD_EVENT_TIME_KEY_NAME = b"DELTACAT:event_time"
|
60
|
+
|
61
|
+
# PyArrow Field Metadata Key used to store field past default values.
|
62
|
+
FIELD_PAST_DEFAULT_KEY_NAME = b"DELTACAT:past_default"
|
63
|
+
|
64
|
+
# PyArrow Field Metadata Key used to store field future default values.
|
65
|
+
FIELD_FUTURE_DEFAULT_KEY_NAME = b"DELTACAT:future_default"
|
66
|
+
|
67
|
+
# PyArrow Field Metadata Key used to store field schema consistency type.
|
68
|
+
FIELD_CONSISTENCY_TYPE_KEY_NAME = b"DELTACAT:consistency_type"
|
69
|
+
|
70
|
+
# PyArrow Schema Metadata Key used to store schema ID value.
|
71
|
+
SCHEMA_ID_KEY_NAME = b"DELTACAT:schema_id"
|
72
|
+
|
73
|
+
# PyArrow Schema Metadata Key used to store named subschemas
|
74
|
+
SUBSCHEMAS_KEY_NAME = b"DELTACAT:subschemas"
|
75
|
+
|
76
|
+
# Set max field ID to INT32.MAX_VALUE - 200 for backwards-compatibility with
|
77
|
+
# Apache Iceberg, which sets aside this range for reserved fields
|
78
|
+
MAX_FIELD_ID_EXCLUSIVE = 2147483447
|
79
|
+
|
80
|
+
|
81
|
+
def _encode_metadata_value(value: Any) -> bytes:
|
82
|
+
"""
|
83
|
+
Encode a value for storage in PyArrow field metadata.
|
84
|
+
|
85
|
+
Uses msgpack for efficient serialization, then base64 encoding to ensure
|
86
|
+
UTF-8 compatibility with all Parquet readers (Polars, Daft, etc.).
|
87
|
+
|
88
|
+
Args:
|
89
|
+
value: The value to encode
|
90
|
+
|
91
|
+
Returns:
|
92
|
+
Base64-encoded msgpack bytes that are UTF-8 safe
|
93
|
+
"""
|
94
|
+
msgpack_bytes = msgpack.dumps(value)
|
95
|
+
return base64.b64encode(msgpack_bytes)
|
96
|
+
|
97
|
+
|
98
|
+
def _decode_metadata_value(encoded_bytes: bytes) -> Any:
|
99
|
+
"""
|
100
|
+
Decode a value from PyArrow field metadata.
|
101
|
+
|
102
|
+
Handles both new base64-encoded format and legacy raw msgpack format
|
103
|
+
for backward compatibility.
|
104
|
+
|
105
|
+
Args:
|
106
|
+
encoded_bytes: The encoded bytes from field metadata
|
107
|
+
|
108
|
+
Returns:
|
109
|
+
The decoded value
|
110
|
+
|
111
|
+
Raises:
|
112
|
+
ValueError: If the data cannot be decoded
|
113
|
+
"""
|
114
|
+
try:
|
115
|
+
# Try new base64-encoded format first
|
116
|
+
msgpack_bytes = base64.b64decode(encoded_bytes)
|
117
|
+
return msgpack.loads(msgpack_bytes)
|
118
|
+
except Exception:
|
119
|
+
try:
|
120
|
+
# Fall back to legacy raw msgpack format
|
121
|
+
return msgpack.loads(encoded_bytes)
|
122
|
+
except Exception as e:
|
123
|
+
raise ValueError(f"Failed to decode metadata value: {e}") from e
|
124
|
+
|
125
|
+
|
126
|
+
# Default name assigned to the base, unnamed single schema when a new named
|
127
|
+
# subschema is first added.
|
128
|
+
BASE_SCHEMA_NAME = "_base"
|
129
|
+
|
130
|
+
SchemaId = int
|
131
|
+
SchemaName = str
|
132
|
+
FieldId = int
|
133
|
+
FieldName = str
|
134
|
+
NestedFieldName = List[str]
|
135
|
+
FieldLocator = Union[FieldName, NestedFieldName, FieldId]
|
136
|
+
|
137
|
+
|
138
|
+
class SchemaUpdateOperation(tuple):
|
139
|
+
"""
|
140
|
+
Represents a single schema update operation (add, remove, or update field).
|
141
|
+
|
142
|
+
This class inherits from tuple and stores:
|
143
|
+
- operation: str ("add", "remove", "update")
|
144
|
+
- field_locator: Optional[FieldLocator] (name, path, or ID)
|
145
|
+
- field: Optional[Field] (the field data for add/update operations)
|
146
|
+
"""
|
147
|
+
|
148
|
+
@staticmethod
|
149
|
+
def add_field(field: Field) -> SchemaUpdateOperation:
|
150
|
+
"""Create an operation to add a new field."""
|
151
|
+
return SchemaUpdateOperation(("add", None, field))
|
152
|
+
|
153
|
+
@staticmethod
|
154
|
+
def remove_field(field_locator: FieldLocator) -> SchemaUpdateOperation:
|
155
|
+
"""Create an operation to remove an existing field."""
|
156
|
+
return SchemaUpdateOperation(("remove", field_locator, None))
|
157
|
+
|
158
|
+
@staticmethod
|
159
|
+
def update_field(
|
160
|
+
field_locator: FieldLocator, field: Field
|
161
|
+
) -> SchemaUpdateOperation:
|
162
|
+
"""Create an operation to update an existing field."""
|
163
|
+
return SchemaUpdateOperation(("update", field_locator, field))
|
164
|
+
|
165
|
+
@property
|
166
|
+
def operation(self) -> str:
|
167
|
+
"""The operation type: 'add', 'remove', or 'update'."""
|
168
|
+
return self[0]
|
169
|
+
|
170
|
+
@property
|
171
|
+
def field_locator(self) -> Optional[FieldLocator]:
|
172
|
+
"""The field locator (name, path, or ID)."""
|
173
|
+
return self[1]
|
174
|
+
|
175
|
+
@property
|
176
|
+
def field(self) -> Optional[Field]:
|
177
|
+
"""The field data (None for remove operations)."""
|
178
|
+
return self[2]
|
179
|
+
|
180
|
+
def field_locator_matches(self, other_locator: FieldLocator) -> bool:
|
181
|
+
"""Check if this operation's field_locator matches the given field_locator."""
|
182
|
+
return SchemaUpdate._field_locators_match(self.field_locator, other_locator)
|
183
|
+
|
184
|
+
|
185
|
+
class SchemaUpdateOperations(List[SchemaUpdateOperation]):
|
186
|
+
"""
|
187
|
+
A list of schema update operations that can be applied to a schema.
|
188
|
+
|
189
|
+
This class inherits from List[SchemaUpdateOperation] and provides convenience
|
190
|
+
methods for creating and managing schema update operations.
|
191
|
+
"""
|
192
|
+
|
193
|
+
@staticmethod
|
194
|
+
def of(operations: List[SchemaUpdateOperation]) -> SchemaUpdateOperations:
|
195
|
+
"""Create a SchemaUpdateOperations list from a list of operations."""
|
196
|
+
typed_operations = SchemaUpdateOperations()
|
197
|
+
for operation in operations:
|
198
|
+
if operation is not None and not isinstance(
|
199
|
+
operation, SchemaUpdateOperation
|
200
|
+
):
|
201
|
+
operation = SchemaUpdateOperation(operation)
|
202
|
+
typed_operations.append(operation)
|
203
|
+
return typed_operations
|
204
|
+
|
205
|
+
def __getitem__(self, item):
|
206
|
+
"""Override to ensure items are properly typed as SchemaUpdateOperation."""
|
207
|
+
val = super().__getitem__(item)
|
208
|
+
if val is not None and not isinstance(val, SchemaUpdateOperation):
|
209
|
+
self[item] = val = SchemaUpdateOperation(val)
|
210
|
+
return val
|
211
|
+
|
212
|
+
def __iter__(self):
|
213
|
+
for i in range(len(self)):
|
214
|
+
yield self[i] # This triggers __getitem__ conversion
|
215
|
+
|
216
|
+
|
217
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
218
|
+
|
219
|
+
|
220
|
+
class MergeOrder(tuple):
|
221
|
+
@staticmethod
|
222
|
+
def of(
|
223
|
+
sort_order: SortOrder = SortOrder.ASCENDING,
|
224
|
+
null_order: NullOrder = NullOrder.AT_END,
|
225
|
+
) -> MergeOrder:
|
226
|
+
return MergeOrder(
|
227
|
+
(
|
228
|
+
sort_order,
|
229
|
+
null_order,
|
230
|
+
)
|
231
|
+
)
|
232
|
+
|
233
|
+
@property
|
234
|
+
def sort_order(self) -> Optional[SortOrder]:
|
235
|
+
return SortOrder(self[0])
|
236
|
+
|
237
|
+
@property
|
238
|
+
def null_order(self) -> Optional[NullOrder]:
|
239
|
+
return NullOrder(self[1])
|
240
|
+
|
241
|
+
|
242
|
+
class Field(dict):
|
243
|
+
@staticmethod
|
244
|
+
def of(
|
245
|
+
field: pa.Field,
|
246
|
+
field_id: Optional[FieldId] = None,
|
247
|
+
is_merge_key: Optional[bool] = None,
|
248
|
+
merge_order: Optional[MergeOrder] = None,
|
249
|
+
is_event_time: Optional[bool] = None,
|
250
|
+
doc: Optional[str] = None,
|
251
|
+
past_default: Optional[Any] = None,
|
252
|
+
future_default: Optional[Any] = None,
|
253
|
+
consistency_type: Optional[SchemaConsistencyType] = None,
|
254
|
+
path: Optional[NestedFieldName] = None,
|
255
|
+
native_object: Optional[Any] = None,
|
256
|
+
) -> Field:
|
257
|
+
"""
|
258
|
+
Creates a DeltaCAT field from a PyArrow base field. The DeltaCAT
|
259
|
+
field contains a copy of the base field, but ensures that the
|
260
|
+
PyArrow Field's metadata is also populated with optional metadata
|
261
|
+
like documentation or metadata used within the context of a parent
|
262
|
+
schema like field ids, merge keys, and default values.
|
263
|
+
|
264
|
+
Args:
|
265
|
+
field (pa.Field): Arrow base field.
|
266
|
+
|
267
|
+
field_id (Optional[FieldId]): Unique ID of the field within its
|
268
|
+
parent schema, or None if this field has no parent schema. If not
|
269
|
+
given, then the field ID will be derived from the Arrow base
|
270
|
+
field's "PARQUET:field_id" metadata key.
|
271
|
+
|
272
|
+
is_merge_key (Optional[bool]): True if this Field is used as a merge
|
273
|
+
key within its parent schema, False or None if it is not a merge
|
274
|
+
key or has no parent schema. If not given, this will be derived from
|
275
|
+
the Arrow base field's "DELTACAT:merge_key" metadata key. Merge keys
|
276
|
+
are the default keys used to find matching records for equality
|
277
|
+
deletes, upserts, and other equality-key-based merge operations.
|
278
|
+
Must be a non-floating-point primitive type.
|
279
|
+
|
280
|
+
merge_order (Optional[MergeOrder]): Merge order for this field
|
281
|
+
within its parent schema. None if it is not used for merge order or
|
282
|
+
has no parent schema. If not given, this will be derived from
|
283
|
+
the Arrow base field's "DELTACAT:merge_order" metadata key. Merge
|
284
|
+
order is used to determine the record kept amongst all records
|
285
|
+
with matching merge keys for equality deletes, upserts, and other
|
286
|
+
equality-key-based merge operations. Must be a primitive type.
|
287
|
+
|
288
|
+
is_event_time (Optional[bool]): True if this Field is used to derive
|
289
|
+
event time within its parent schema, False or None if it is not used
|
290
|
+
or has no parent schema. If not given, this will be derived from
|
291
|
+
the Arrow base field's "DELTACAT:event_time" metadata key. Event
|
292
|
+
times are used to determine a stream's data completeness watermark.
|
293
|
+
Must be an integer, float, or date type.
|
294
|
+
|
295
|
+
doc (Optional[str]): Documentation for this field or None if this
|
296
|
+
field has no documentation. If not given, then docs will be derived
|
297
|
+
from the Arrow base field's "DELTACAT:doc" metadata key.
|
298
|
+
|
299
|
+
past_default (Optional[Any]): Past default values for records
|
300
|
+
written to the parent schema before this field was appended,
|
301
|
+
or None if this field has no parent schema. If not given, this will
|
302
|
+
be derived from the Arrow base field's "DELTACAT:past_default"
|
303
|
+
metadata key. Must be coercible to the field's base arrow type.
|
304
|
+
|
305
|
+
future_default (Optional[Any]): Future default values for records
|
306
|
+
that omit this field in the parent schema they're written to, or
|
307
|
+
None if this field has no parent schema. If not given, this will
|
308
|
+
be derived from the Arrow base field's "DELTACAT:future_default"
|
309
|
+
metadata key. Must be coercible to the field's base arrow type.
|
310
|
+
|
311
|
+
consistency_type (Optional[SchemaConsistencyType]): Schema
|
312
|
+
consistency type for records written to this field within the
|
313
|
+
context of a parent schema, or None if the field has no parent
|
314
|
+
schema. If not given, this will be derived from the Arrow base
|
315
|
+
field's "DELTACAT:consistency_type" metadata key.
|
316
|
+
|
317
|
+
path (Optional[NestedFieldName]): Fully qualified path of this
|
318
|
+
field within its parent schema. Any manually specified path will
|
319
|
+
be overwritten when this field is added to a schema.
|
320
|
+
|
321
|
+
native_object (Optional[Any]): The native object, if any, that this
|
322
|
+
field was originally derived from.
|
323
|
+
Returns:
|
324
|
+
A new DeltaCAT Field.
|
325
|
+
"""
|
326
|
+
final_field = Field._build(
|
327
|
+
field=field,
|
328
|
+
field_id=Field._field_id(field) if field_id is None else field_id,
|
329
|
+
is_merge_key=Field._is_merge_key(field)
|
330
|
+
if is_merge_key is None
|
331
|
+
else is_merge_key,
|
332
|
+
merge_order=Field._merge_order(field)
|
333
|
+
if merge_order is None
|
334
|
+
else merge_order,
|
335
|
+
is_event_time=Field._is_event_time(field)
|
336
|
+
if is_event_time is None
|
337
|
+
else is_event_time,
|
338
|
+
doc=Field._doc(field) if doc is None else doc,
|
339
|
+
past_default=Field._past_default(field)
|
340
|
+
if past_default is None
|
341
|
+
else past_default,
|
342
|
+
future_default=Field._future_default(field)
|
343
|
+
if future_default is None
|
344
|
+
else future_default,
|
345
|
+
consistency_type=Field._consistency_type(field)
|
346
|
+
if consistency_type is None
|
347
|
+
else consistency_type,
|
348
|
+
)
|
349
|
+
return Field(
|
350
|
+
{
|
351
|
+
"arrow": final_field,
|
352
|
+
"path": copy.deepcopy(path),
|
353
|
+
"nativeObject": native_object,
|
354
|
+
}
|
355
|
+
)
|
356
|
+
|
357
|
+
@property
|
358
|
+
def arrow(self) -> pa.Field:
|
359
|
+
return self["arrow"]
|
360
|
+
|
361
|
+
@property
|
362
|
+
def id(self) -> Optional[FieldId]:
|
363
|
+
return Field._field_id(self.arrow)
|
364
|
+
|
365
|
+
@property
|
366
|
+
def path(self) -> Optional[NestedFieldName]:
|
367
|
+
return self.get("path")
|
368
|
+
|
369
|
+
@property
|
370
|
+
def is_merge_key(self) -> Optional[bool]:
|
371
|
+
return Field._is_merge_key(self.arrow)
|
372
|
+
|
373
|
+
@property
|
374
|
+
def merge_order(self) -> Optional[MergeOrder]:
|
375
|
+
return Field._merge_order(self.arrow)
|
376
|
+
|
377
|
+
@property
|
378
|
+
def is_event_time(self) -> Optional[bool]:
|
379
|
+
return Field._is_event_time(self.arrow)
|
380
|
+
|
381
|
+
@property
|
382
|
+
def doc(self) -> Optional[str]:
|
383
|
+
return Field._doc(self.arrow)
|
384
|
+
|
385
|
+
@property
|
386
|
+
def past_default(self) -> Optional[Any]:
|
387
|
+
return Field._past_default(self.arrow)
|
388
|
+
|
389
|
+
@property
|
390
|
+
def future_default(self) -> Optional[Any]:
|
391
|
+
return Field._future_default(self.arrow)
|
392
|
+
|
393
|
+
@property
|
394
|
+
def consistency_type(self) -> Optional[SchemaConsistencyType]:
|
395
|
+
return Field._consistency_type(self.arrow)
|
396
|
+
|
397
|
+
@property
|
398
|
+
def native_object(self) -> Optional[Any]:
|
399
|
+
return self.get("nativeObject")
|
400
|
+
|
401
|
+
@staticmethod
|
402
|
+
def _field_id(field: pa.Field) -> Optional[FieldId]:
|
403
|
+
field_id = None
|
404
|
+
if field.metadata:
|
405
|
+
bytes_val = field.metadata.get(PARQUET_FIELD_ID_KEY_NAME)
|
406
|
+
field_id = int(bytes_val.decode()) if bytes_val else None
|
407
|
+
return field_id
|
408
|
+
|
409
|
+
@staticmethod
|
410
|
+
def _doc(field: pa.Field) -> Optional[str]:
|
411
|
+
doc = None
|
412
|
+
if field.metadata:
|
413
|
+
bytes_val = field.metadata.get(FIELD_DOC_KEY_NAME)
|
414
|
+
doc = bytes_val.decode() if bytes_val else None
|
415
|
+
return doc
|
416
|
+
|
417
|
+
@staticmethod
|
418
|
+
def _is_merge_key(field: pa.Field) -> Optional[bool]:
|
419
|
+
is_merge_key = None
|
420
|
+
if field.metadata:
|
421
|
+
bytes_val = field.metadata.get(FIELD_MERGE_KEY_NAME)
|
422
|
+
is_merge_key = bool(bytes_val.decode()) if bytes_val else None
|
423
|
+
return is_merge_key
|
424
|
+
|
425
|
+
@staticmethod
|
426
|
+
def _merge_order(field: pa.Field) -> Optional[MergeOrder]:
|
427
|
+
merge_order = None
|
428
|
+
if field.metadata:
|
429
|
+
bytes_val = field.metadata.get(FIELD_MERGE_ORDER_KEY_NAME)
|
430
|
+
merge_order = _decode_metadata_value(bytes_val) if bytes_val else None
|
431
|
+
return merge_order
|
432
|
+
|
433
|
+
@staticmethod
|
434
|
+
def _is_event_time(field: pa.Field) -> Optional[bool]:
|
435
|
+
is_event_time = None
|
436
|
+
if field.metadata:
|
437
|
+
bytes_val = field.metadata.get(FIELD_EVENT_TIME_KEY_NAME)
|
438
|
+
is_event_time = bool(bytes_val.decode()) if bytes_val else None
|
439
|
+
return is_event_time
|
440
|
+
|
441
|
+
@staticmethod
|
442
|
+
def _past_default(field: pa.Field) -> Optional[Any]:
|
443
|
+
default = None
|
444
|
+
if field.metadata:
|
445
|
+
bytes_val = field.metadata.get(FIELD_PAST_DEFAULT_KEY_NAME)
|
446
|
+
default = _decode_metadata_value(bytes_val) if bytes_val else None
|
447
|
+
return default
|
448
|
+
|
449
|
+
@staticmethod
|
450
|
+
def _future_default(field: pa.Field) -> Optional[Any]:
|
451
|
+
default = None
|
452
|
+
if field.metadata:
|
453
|
+
bytes_val = field.metadata.get(FIELD_FUTURE_DEFAULT_KEY_NAME)
|
454
|
+
default = _decode_metadata_value(bytes_val) if bytes_val else None
|
455
|
+
return default
|
456
|
+
|
457
|
+
@staticmethod
|
458
|
+
def _consistency_type(field: pa.Field) -> Optional[SchemaConsistencyType]:
|
459
|
+
t = None
|
460
|
+
if field.metadata:
|
461
|
+
bytes_val = field.metadata.get(FIELD_CONSISTENCY_TYPE_KEY_NAME)
|
462
|
+
t = SchemaConsistencyType(bytes_val.decode()) if bytes_val else None
|
463
|
+
return t
|
464
|
+
|
465
|
+
@staticmethod
|
466
|
+
def _validate_merge_key(
|
467
|
+
field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
|
468
|
+
):
|
469
|
+
# Note: large_strings were explicitly allowed for compatibility with PyIceberg Iceberg Schema to PyArrow converter
|
470
|
+
if not (
|
471
|
+
pa.types.is_string(field.type)
|
472
|
+
or pa.types.is_primitive(field.type)
|
473
|
+
or pa.types.is_large_string(field.type)
|
474
|
+
):
|
475
|
+
raise ValueError(
|
476
|
+
f"Merge key {field} must be a primitive type or large string."
|
477
|
+
)
|
478
|
+
|
479
|
+
# Merge key fields must have VALIDATE consistency type to prevent type promotion
|
480
|
+
if (
|
481
|
+
consistency_type is not None
|
482
|
+
and consistency_type != SchemaConsistencyType.VALIDATE
|
483
|
+
):
|
484
|
+
raise ValueError(
|
485
|
+
f"Merge key field '{field.name}' must have VALIDATE consistency type, "
|
486
|
+
f"got {consistency_type}. Type promotion is not allowed for merge keys."
|
487
|
+
)
|
488
|
+
|
489
|
+
if pa.types.is_floating(field.type):
|
490
|
+
raise ValueError(f"Merge key {field} cannot be floating point.")
|
491
|
+
|
492
|
+
@staticmethod
|
493
|
+
def _validate_merge_order(
|
494
|
+
field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
|
495
|
+
):
|
496
|
+
if not pa.types.is_primitive(field.type):
|
497
|
+
raise ValueError(f"Merge order {field} must be a primitive type.")
|
498
|
+
|
499
|
+
# Merge order fields must have VALIDATE consistency type to prevent type promotion
|
500
|
+
if (
|
501
|
+
consistency_type is not None
|
502
|
+
and consistency_type != SchemaConsistencyType.VALIDATE
|
503
|
+
):
|
504
|
+
raise ValueError(
|
505
|
+
f"Merge order field '{field.name}' must have VALIDATE consistency type, "
|
506
|
+
f"got {consistency_type}. Type promotion is not allowed for merge order fields."
|
507
|
+
)
|
508
|
+
|
509
|
+
@staticmethod
|
510
|
+
def _validate_event_time(
|
511
|
+
field: pa.Field, consistency_type: Optional[SchemaConsistencyType] = None
|
512
|
+
):
|
513
|
+
if (
|
514
|
+
not pa.types.is_integer(field.type)
|
515
|
+
and not pa.types.is_floating(field.type)
|
516
|
+
and not pa.types.is_date(field.type)
|
517
|
+
):
|
518
|
+
raise ValueError(f"Event time {field} must be numeric or date type.")
|
519
|
+
|
520
|
+
# Event time fields must have VALIDATE consistency type to prevent type promotion
|
521
|
+
if (
|
522
|
+
consistency_type is not None
|
523
|
+
and consistency_type != SchemaConsistencyType.VALIDATE
|
524
|
+
):
|
525
|
+
raise ValueError(
|
526
|
+
f"Event time field '{field.name}' must have VALIDATE consistency type, "
|
527
|
+
f"got {consistency_type}. Type promotion is not allowed for event time fields."
|
528
|
+
)
|
529
|
+
|
530
|
+
@staticmethod
|
531
|
+
def _validate_default(
|
532
|
+
default: Optional[Any],
|
533
|
+
field: pa.Field,
|
534
|
+
) -> pa.Scalar:
|
535
|
+
try:
|
536
|
+
return pa.scalar(default, field.type)
|
537
|
+
except ArrowInvalid:
|
538
|
+
raise ValueError(
|
539
|
+
f"Cannot treat default value `{default}` as type"
|
540
|
+
f"`{field.type}` for field: {field}"
|
541
|
+
)
|
542
|
+
|
543
|
+
@staticmethod
|
544
|
+
def _build(
|
545
|
+
field: pa.Field,
|
546
|
+
field_id: Optional[int],
|
547
|
+
is_merge_key: Optional[bool],
|
548
|
+
merge_order: Optional[MergeOrder],
|
549
|
+
is_event_time: Optional[bool],
|
550
|
+
doc: Optional[str],
|
551
|
+
past_default: Optional[Any],
|
552
|
+
future_default: Optional[Any],
|
553
|
+
consistency_type: Optional[SchemaConsistencyType],
|
554
|
+
) -> pa.Field:
|
555
|
+
# Auto-set future_default to past_default if past_default exists but future_default doesn't
|
556
|
+
if past_default is not None and future_default is None:
|
557
|
+
future_default = past_default
|
558
|
+
|
559
|
+
# Default critical columns (merge key, merge order, event time) to VALIDATE consistency type
|
560
|
+
# to prevent type promotion which could break merge semantics
|
561
|
+
if consistency_type is None and (is_merge_key or merge_order or is_event_time):
|
562
|
+
consistency_type = SchemaConsistencyType.VALIDATE
|
563
|
+
|
564
|
+
meta = {}
|
565
|
+
if is_merge_key:
|
566
|
+
Field._validate_merge_key(field, consistency_type)
|
567
|
+
meta[FIELD_MERGE_KEY_NAME] = str(is_merge_key)
|
568
|
+
if merge_order:
|
569
|
+
Field._validate_merge_order(field, consistency_type)
|
570
|
+
meta[FIELD_MERGE_ORDER_KEY_NAME] = _encode_metadata_value(merge_order)
|
571
|
+
if is_event_time:
|
572
|
+
Field._validate_event_time(field, consistency_type)
|
573
|
+
meta[FIELD_EVENT_TIME_KEY_NAME] = str(is_event_time)
|
574
|
+
if past_default is not None:
|
575
|
+
Field._validate_default(past_default, field)
|
576
|
+
meta[FIELD_PAST_DEFAULT_KEY_NAME] = _encode_metadata_value(past_default)
|
577
|
+
if future_default is not None:
|
578
|
+
Field._validate_default(future_default, field)
|
579
|
+
meta[FIELD_FUTURE_DEFAULT_KEY_NAME] = _encode_metadata_value(future_default)
|
580
|
+
if field_id is not None:
|
581
|
+
meta[PARQUET_FIELD_ID_KEY_NAME] = str(field_id)
|
582
|
+
if doc is not None:
|
583
|
+
meta[FIELD_DOC_KEY_NAME] = doc
|
584
|
+
if consistency_type is not None:
|
585
|
+
meta[FIELD_CONSISTENCY_TYPE_KEY_NAME] = consistency_type.value
|
586
|
+
return pa.field(
|
587
|
+
name=field.name,
|
588
|
+
type=field.type,
|
589
|
+
nullable=field.nullable,
|
590
|
+
metadata=meta,
|
591
|
+
)
|
592
|
+
|
593
|
+
def validate(
|
594
|
+
self,
|
595
|
+
column_type: pa.DataType,
|
596
|
+
) -> None:
|
597
|
+
"""Validate that data in a column matches this field's type and constraints.
|
598
|
+
|
599
|
+
Args:
|
600
|
+
column_datatype: PyArrow DataType containing the column data to validate
|
601
|
+
|
602
|
+
Raises:
|
603
|
+
ValueError: If data doesn't match field requirements.
|
604
|
+
"""
|
605
|
+
# Check if the data type matches the field type
|
606
|
+
if not column_type.equals(self.arrow.type):
|
607
|
+
raise SchemaValidationError(
|
608
|
+
f"Data type mismatch for field '{self.arrow.name}': "
|
609
|
+
f"expected {self.arrow.type}, got {column_type}"
|
610
|
+
)
|
611
|
+
|
612
|
+
def coerce(
|
613
|
+
self,
|
614
|
+
column_data: pa.Array,
|
615
|
+
) -> pa.Array:
|
616
|
+
"""Coerce data in a column to match this field's type.
|
617
|
+
|
618
|
+
Args:
|
619
|
+
column_data: PyArrow Array containing the column data to coerce
|
620
|
+
|
621
|
+
Returns:
|
622
|
+
pa.Array: Coerced data matching this field's type
|
623
|
+
|
624
|
+
Raises:
|
625
|
+
ValueError: If data cannot be coerced to the field type
|
626
|
+
"""
|
627
|
+
try:
|
628
|
+
return pa.compute.cast(column_data, self.arrow.type)
|
629
|
+
except (pa.ArrowTypeError, pa.ArrowInvalid) as e:
|
630
|
+
raise SchemaValidationError(
|
631
|
+
f"Cannot coerce data for field '{self.arrow.name}' "
|
632
|
+
f"from {column_data.type} to {self.arrow.type}: {e}"
|
633
|
+
)
|
634
|
+
|
635
|
+
def coerce_daft(
|
636
|
+
self,
|
637
|
+
dataframe: DaftDataFrame,
|
638
|
+
column_name: str,
|
639
|
+
target_type: Optional[pa.DataType] = None,
|
640
|
+
) -> DaftDataFrame:
|
641
|
+
"""Coerce a Daft DataFrame column to match this field's type.
|
642
|
+
|
643
|
+
Args:
|
644
|
+
dataframe: Daft DataFrame containing the column to coerce
|
645
|
+
column_name: Name of the column to coerce
|
646
|
+
target_type: Optional target type to coerce to (defaults to self.arrow.type)
|
647
|
+
|
648
|
+
Returns:
|
649
|
+
DaftDataFrame: DataFrame with the coerced column
|
650
|
+
|
651
|
+
Raises:
|
652
|
+
SchemaValidationError: If data cannot be coerced to the field type
|
653
|
+
"""
|
654
|
+
target_arrow_type = target_type or self.arrow.type
|
655
|
+
target_daft_type = daft.DataType.from_arrow_type(target_arrow_type)
|
656
|
+
|
657
|
+
try:
|
658
|
+
# Use Daft's cast expression to coerce the column
|
659
|
+
coerced_dataframe = dataframe.with_column(
|
660
|
+
column_name, daft.col(column_name).cast(target_daft_type)
|
661
|
+
)
|
662
|
+
return coerced_dataframe
|
663
|
+
except Exception as e:
|
664
|
+
raise SchemaValidationError(
|
665
|
+
f"Cannot coerce Daft column '{column_name}' for field '{self.arrow.name}' "
|
666
|
+
f"to type {target_arrow_type}: {e}"
|
667
|
+
)
|
668
|
+
|
669
|
+
def promote_type_if_needed(
|
670
|
+
self,
|
671
|
+
column_data: pa.Array,
|
672
|
+
) -> Tuple[pa.Array, bool]:
|
673
|
+
"""Promote field type to accommodate new data when consistency type is NONE.
|
674
|
+
Use PyArrow's unify_schemas to find the most permissive type that can accommodate both
|
675
|
+
the current and new data types.
|
676
|
+
|
677
|
+
Args:
|
678
|
+
column_data: PyArrow Array containing the column data
|
679
|
+
|
680
|
+
Returns:
|
681
|
+
Tuple[pa.Array, bool]: (data, type_was_promoted)
|
682
|
+
- data: Either original data or data cast to promoted type
|
683
|
+
- type_was_promoted: True if field type should be updated
|
684
|
+
|
685
|
+
Raises:
|
686
|
+
SchemaValidationError: If column data cannot be promoted to a unified type
|
687
|
+
"""
|
688
|
+
current_type = self.arrow.type
|
689
|
+
data_type = column_data.type
|
690
|
+
|
691
|
+
# Early return if types are already compatible
|
692
|
+
if current_type.equals(data_type):
|
693
|
+
return column_data, False
|
694
|
+
|
695
|
+
# Find the promoted type that can accommodate both types
|
696
|
+
promoted_type = self._find_promoted_type(current_type, data_type)
|
697
|
+
|
698
|
+
# Handle type coercion vs promotion
|
699
|
+
if promoted_type.equals(current_type):
|
700
|
+
return self._coerce_to_current_type(column_data, current_type)
|
701
|
+
else:
|
702
|
+
return self._promote_to_new_type(column_data, promoted_type)
|
703
|
+
|
704
|
+
def _coerce_to_current_type(
|
705
|
+
self,
|
706
|
+
column_data: pa.Array,
|
707
|
+
current_type: pa.DataType,
|
708
|
+
) -> Tuple[pa.Array, bool]:
|
709
|
+
"""Try to coerce data to current type without promoting the field type."""
|
710
|
+
try:
|
711
|
+
coerced_data = pa.compute.cast(column_data, current_type)
|
712
|
+
return coerced_data, False
|
713
|
+
except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
|
714
|
+
return column_data, False
|
715
|
+
|
716
|
+
def _promote_to_new_type(
|
717
|
+
self,
|
718
|
+
column_data: pa.Array,
|
719
|
+
promoted_type: pa.DataType,
|
720
|
+
) -> Tuple[pa.Array, bool]:
|
721
|
+
"""Try to cast data to the promoted type."""
|
722
|
+
try:
|
723
|
+
promoted_data = pa.compute.cast(column_data, promoted_type)
|
724
|
+
return promoted_data, True
|
725
|
+
except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
|
726
|
+
# If direct cast fails, the promotion is not valid
|
727
|
+
raise SchemaValidationError(
|
728
|
+
f"Cannot cast data for field '{self.arrow.name}' from type {column_data.type} "
|
729
|
+
f"to promoted type {promoted_type}"
|
730
|
+
)
|
731
|
+
|
732
|
+
def _cast_default_to_promoted_type(
|
733
|
+
self,
|
734
|
+
default_value: Any,
|
735
|
+
promoted_type: pa.DataType,
|
736
|
+
) -> Optional[Any]:
|
737
|
+
"""Cast a default value to match a promoted type.
|
738
|
+
|
739
|
+
Args:
|
740
|
+
default_value: The original default value
|
741
|
+
promoted_type: The new promoted type
|
742
|
+
|
743
|
+
Returns:
|
744
|
+
The default value cast to the promoted type.
|
745
|
+
|
746
|
+
Raises:
|
747
|
+
SchemaValidationError: If the default value cannot be cast to the promoted type
|
748
|
+
"""
|
749
|
+
if default_value is None:
|
750
|
+
return None
|
751
|
+
|
752
|
+
try:
|
753
|
+
# Create a scalar with the original default value
|
754
|
+
original_scalar = pa.scalar(default_value)
|
755
|
+
# Cast to the promoted type
|
756
|
+
promoted_scalar = pa.compute.cast(original_scalar, promoted_type)
|
757
|
+
# Return the Python value
|
758
|
+
return promoted_scalar.as_py()
|
759
|
+
except (
|
760
|
+
pa.ArrowTypeError,
|
761
|
+
pa.ArrowInvalid,
|
762
|
+
pa.ArrowNotImplementedError,
|
763
|
+
TypeError,
|
764
|
+
ValueError,
|
765
|
+
):
|
766
|
+
raise SchemaValidationError(
|
767
|
+
f"Cannot cast default value `{default_value}` to promoted type {promoted_type}"
|
768
|
+
)
|
769
|
+
|
770
|
+
def _find_promoted_type(
|
771
|
+
self,
|
772
|
+
current_type: pa.DataType,
|
773
|
+
new_type: pa.DataType,
|
774
|
+
) -> Optional[pa.DataType]:
|
775
|
+
"""Find the most specific type that can accommodate both current and new types
|
776
|
+
using PyArrow's unify_schemas with permissive promotion options.
|
777
|
+
|
778
|
+
Returns:
|
779
|
+
The promoted type.
|
780
|
+
|
781
|
+
Raises:
|
782
|
+
SchemaValidationError: If the types cannot be unified.
|
783
|
+
"""
|
784
|
+
try:
|
785
|
+
# Create schemas with the same field name but different types
|
786
|
+
schema1 = pa.schema([("field", current_type)])
|
787
|
+
schema2 = pa.schema([("field", new_type)])
|
788
|
+
|
789
|
+
# Use PyArrow's built-in permissive type promotion
|
790
|
+
unified_schema = pa.unify_schemas(
|
791
|
+
[schema1, schema2], promote_options="permissive"
|
792
|
+
)
|
793
|
+
|
794
|
+
# Return the promoted type
|
795
|
+
return unified_schema.field("field").type
|
796
|
+
|
797
|
+
except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError):
|
798
|
+
# If unification fails, no promotion is possible
|
799
|
+
raise SchemaValidationError(
|
800
|
+
f"Cannot unify types for field '{self.arrow.name}': "
|
801
|
+
f"current type {current_type} incompatible with new data type {new_type}"
|
802
|
+
)
|
803
|
+
|
804
|
+
|
805
|
+
SingleSchema = Union[List[Field], pa.Schema]
MultiSchema = Union[Dict[SchemaName, List[Field]], Dict[SchemaName, pa.Schema]]


class Schema(dict):
    @staticmethod
    def of(
        schema: Union[SingleSchema, MultiSchema],
        schema_id: Optional[SchemaId] = None,
        native_object: Optional[Any] = None,
    ) -> Schema:
        """
        Creates a DeltaCAT schema from either one or multiple Arrow base schemas
        or lists of DeltaCAT fields. All field names across all input schemas
        must be unique (case-insensitive). If a dict of named subschemas is
        given, then this DeltaCAT schema will be backed by a unified arrow
        schema created as a union of all input schemas in the natural iteration
        order of their dictionary keys. This unified schema saves all named
        subschema field mappings in its metadata to support DeltaCAT subschema
        retrieval by name after schema creation.

        Args:
            schema (Union[SingleSchema, MultiSchema]): For a single unnamed
                schema, either an Arrow base schema or list of DeltaCAT fields.
                If an Arrow base schema is given, then a copy of the base schema
                is made with each Arrow field populated with additional metadata.
                Field IDs, merge keys, docs, and default vals will be read from
                each Arrow field's metadata if they exist. Any field missing a
                field ID will be assigned a unique field ID, with assigned field
                IDs either starting from 0 or the max field ID + 1.
                For multiple named subschemas, a dictionary of schema names to an
                arrow base schema or list of DeltaCAT fields. These schemas will
                be copied into a unified Arrow schema representing a union of all
                of their fields in their natural iteration order. Any missing
                field IDs will be autoassigned starting from 0 or the max field ID
                + 1 across the natural iteration order of all schemas first, and
                all fields second.
                All fields across all schemas must have unique names
                (case-insensitive).

            schema_id (SchemaId): Unique ID of schema within its parent table
                version. Defaults to 0.

            native_object (Optional[Any]): The native object, if any, that this
                schema was converted from.
        Returns:
            A new DeltaCAT Schema.
        """
        if schema_id and schema_id < 0:
            raise ValueError(f"Schema ID must be non-negative, got {schema_id}")
        # normalize the input as a unified pyarrow schema
        # if the input included multiple subschemas, then also save a mapping
        # from each subschema to its unique field names
        schema, subschema_to_field_names = Schema._to_unified_pyarrow_schema(schema)
        # discover assigned field IDs in the given pyarrow schema
        field_ids_to_fields = {}
        schema_metadata = {}
        visitor_dict = {"maxFieldId": 0}
        # find and save the schema's max field ID in the visitor dictionary
        Schema._visit_fields(
            current=schema,
            visit=Schema._find_max_field_id,
            visitor_dict=visitor_dict,
        )
        max_field_id = visitor_dict["maxFieldId"]
        visitor_dict["fieldIdsToFields"] = field_ids_to_fields
        # populate map of field IDs to DeltaCAT fields w/ IDs, docs, etc.
        Schema._visit_fields(
            current=schema,
            visit=Schema._populate_fields,
            visitor_dict=visitor_dict,
        )
        # recalculate max field ID after field population (in case new field IDs were assigned)
        max_field_id = max(field_ids_to_fields.keys()) if field_ids_to_fields else 0
        if schema.metadata:
            schema_metadata.update(schema.metadata)
        # populate merge keys
        merge_keys = [
            field.id for field in field_ids_to_fields.values() if field.is_merge_key
        ]
        # create a new pyarrow schema with field ID, doc, etc. field metadata
        pyarrow_schema = pa.schema(
            fields=[field.arrow for field in field_ids_to_fields.values()],
        )
        # map subschema field names to IDs (for faster lookup and reduced size)
        subschema_to_field_ids = {
            schema_name: [
                Field.of(pyarrow_schema.field(field_name)).id
                for field_name in field_names
            ]
            for schema_name, field_names in subschema_to_field_names.items()
        }
        # create a final pyarrow schema with populated schema metadata
        if schema_id is not None:
            schema_metadata[SCHEMA_ID_KEY_NAME] = str(schema_id)
        if schema_metadata.get(SCHEMA_ID_KEY_NAME) is None:
            schema_metadata[SCHEMA_ID_KEY_NAME] = str(0)
        schema_metadata[SUBSCHEMAS_KEY_NAME] = _encode_metadata_value(
            subschema_to_field_ids
        )
        final_schema = pyarrow_schema.with_metadata(schema_metadata)
        return Schema(
            {
                "arrow": final_schema,
                "mergeKeys": merge_keys or None,
                "fieldIdsToFields": field_ids_to_fields,
                "maxFieldId": max_field_id,
                "nativeObject": native_object,
            }
        )

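    # Example: a minimal sketch of Schema.of() usage with a single unnamed
    # schema and with multiple named subschemas (assumes pyarrow is imported
    # as pa; the column and subschema names below are illustrative only):
    #
    #   single = Schema.of(pa.schema([("id", pa.int64()), ("name", pa.string())]))
    #   multi = Schema.of(
    #       {
    #           "events": pa.schema([("id", pa.int64()), ("ts", pa.timestamp("us"))]),
    #           "users": pa.schema([("user_id", pa.int64())]),
    #       }
    #   )
    #   # field IDs are auto-assigned across both subschemas, and each named
    #   # subschema remains retrievable from the unified schema
    #   multi.subschemas["users"]
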
    @staticmethod
    def deserialize(serialized: pa.Buffer) -> Schema:
        return Schema.of(schema=pa.ipc.read_schema(serialized))

    def serialize(self) -> pa.Buffer:
        return self.arrow.serialize()

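    # Example: a minimal round-trip sketch for serialize()/deserialize(),
    # assuming `schema` is an existing DeltaCAT Schema; the buffer holds the
    # Arrow IPC encoding of the backing pyarrow schema:
    #
    #   buf = schema.serialize()
    #   restored = Schema.deserialize(buf)
    #   restored.equivalent_to(schema)  # expected to be True
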
    def equivalent_to(self, other: Schema, check_metadata: bool = False):
        if other is None:
            return False
        if not isinstance(other, dict):
            return False
        if not isinstance(other, Schema):
            other = Schema(other)
        return self.arrow.equals(
            other.arrow,
            check_metadata,
        )

    def add_subschema(
        self,
        name: SchemaName,
        schema: SingleSchema,
    ) -> Schema:
        subschemas = copy.copy(self.subschemas)
        if not subschemas:  # self is SingleSchema
            subschemas = {BASE_SCHEMA_NAME: self}
        subschemas = Schema._add_subschema(name, schema, subschemas)
        return Schema.of(
            schema=subschemas,
            schema_id=self.id + 1,
        )

    def delete_subschema(self, name: SchemaName) -> Schema:
        subschemas = copy.copy(self.subschemas)
        subschemas = self._del_subschema(name, subschemas)
        if not subschemas:
            raise ValueError(f"Deleting `{name}` would leave the schema empty.")
        subschemas = {name: val.arrow for name, val in subschemas.items()}
        return Schema.of(
            schema=subschemas,
            schema_id=self.id + 1,
        )

    def replace_subschema(
        self,
        name: SchemaName,
        schema: SingleSchema,
    ) -> Schema:
        subschemas = copy.copy(self.subschemas)
        subschemas = Schema._del_subschema(name, subschemas)
        subschemas = Schema._add_subschema(name, schema, subschemas)
        return Schema.of(
            schema=subschemas,
            schema_id=self.id + 1,
        )

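    # Example: a hedged sketch of subschema evolution; each call returns a new
    # Schema with the schema ID incremented by one (the subschema and column
    # names below are illustrative):
    #
    #   base = Schema.of(pa.schema([("id", pa.int64())]))
    #   with_audit = base.add_subschema("audit", pa.schema([("who", pa.string())]))
    #   replaced = with_audit.replace_subschema(
    #       "audit", pa.schema([("who", pa.large_string())])
    #   )
    #   trimmed = replaced.delete_subschema("audit")  # at least one subschema must remain
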
    def update(self, allow_incompatible_changes: bool = False) -> SchemaUpdate:
        """
        Create a SchemaUpdate instance for safely evolving this schema.

        This method provides a convenient way to create a SchemaUpdate for this schema
        without needing to call SchemaUpdate.of() directly.

        Args:
            allow_incompatible_changes: If True, allows changes that may break
                backward compatibility. If False (default), raises SchemaCompatibilityError
                for incompatible changes.

        Returns:
            A new SchemaUpdate instance configured for this schema

        Example:
            >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
            >>> new_field = Field.of(pa.field("name", pa.string()))
            >>> updated_schema = (schema.update()
            ...     .add_field(new_field)
            ...     .apply())
        """
        return SchemaUpdate.of(
            self, allow_incompatible_changes=allow_incompatible_changes
        )

    def field_id(self, name: Union[FieldName, NestedFieldName]) -> FieldId:
        return Schema._field_name_to_field_id(self.arrow, name)

    def field_name(self, field_id: FieldId) -> Union[FieldName, NestedFieldName]:
        field = self.field_ids_to_fields[field_id]
        if len(field.path) == 1:
            return field.arrow.name
        return field.path

    def field(self, field_locator: FieldLocator) -> Field:
        field_id = (
            field_locator
            if isinstance(field_locator, FieldId)
            else self.field_id(field_locator)
        )
        return self.field_ids_to_fields[field_id]

    def merge_order_sort_keys(self) -> Optional[List[SortKey]]:
        """Extract sort keys from fields with merge_order defined, or use event_time as fallback.

        If explicit merge_order fields are defined, they take precedence.
        If no merge_order fields are defined but an event_time field exists, use event_time
        with DESCENDING merge_order (keep latest events by default).

        Note: The sort order is inverted because deduplication keeps the "last" record
        after sorting. To keep the record with the smallest merge_order value, we need
        to sort in DESCENDING order so that record appears last.

        Returns:
            List of SortKey objects constructed from fields with merge_order or event_time,
            or None if neither are defined.
        """
        # First priority: explicit merge_order fields
        fields_with_merge_order = self._get_fields_with_merge_order()
        if fields_with_merge_order:
            return self._create_sort_keys_from_merge_order_fields(
                fields_with_merge_order
            )

        # Second priority: event_time field as default merge_order key
        event_time_fields = self._get_event_time_fields()
        if event_time_fields:
            return self._create_sort_keys_from_event_time_fields(event_time_fields)

        return None

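    # Worked example of the merge_order inversion described above (a sketch,
    # assuming a field declared with an ASCENDING merge order, i.e. "keep the
    # smallest value" during deduplication):
    #
    #   merge_order = ASCENDING  -> sort key emitted as DESCENDING
    #   sorted column: [9, 5, 1] -> dedupe keeps the last record -> value 1 kept
    #
    #   merge_order = DESCENDING -> sort key emitted as ASCENDING
    #   sorted column: [1, 5, 9] -> dedupe keeps the last record -> value 9 kept
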
    def _validate_and_coerce_table(
        self,
        table: pa.Table,
        schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
        default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
    ) -> Tuple[pa.Table, Schema]:
        """Validate and coerce a PyArrow table to match this schema's field types and constraints.

        This method now uses SchemaUpdate for safe schema evolution, ensuring all field
        protection rules and validation are applied consistently.

        Args:
            table: PyArrow Table to validate and coerce
            schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
            default_schema_consistency_type: Default consistency type for new fields in AUTO mode

        Returns:
            Tuple[pa.Table, Schema]: Table with data validated/coerced according to schema consistency types,
                and the (potentially updated) schema

        Raises:
            SchemaValidationError: If validation fails or coercion is not possible
            SchemaCompatibilityError: If schema evolution would break compatibility
        """
        if not self.field_ids_to_fields:
            # No fields defined in schema, return original table
            return table, self

        # Setup
        field_name_to_field = self._create_field_name_mapping()
        field_updates = {}  # field_name -> updated_field
        new_fields = {}  # field_name -> new_field
        new_columns = []
        new_schema_fields = []

        # Process each column in the table
        for column_name in table.column_names:
            column_data = table.column(column_name)

            (
                processed_data,
                schema_field,
                field_update,
                new_field,
            ) = self._process_existing_table_column(
                column_name,
                column_data,
                field_name_to_field,
                schema_evolution_mode,
                default_schema_consistency_type,
            )

            new_columns.append(processed_data)
            new_schema_fields.append(schema_field)

            if field_update:
                field_updates[column_name] = field_update
            if new_field:
                new_fields[column_name] = new_field

        # Add any missing fields from schema
        table_column_names = set(table.column_names)
        self._add_missing_schema_fields(
            table, table_column_names, new_columns, new_schema_fields
        )

        # Apply schema updates if any modifications were made
        updated_schema = self._apply_schema_updates(field_updates, new_fields)

        return (
            pa.table(new_columns, schema=pa.schema(new_schema_fields)),
            updated_schema,
        )

    def validate_and_coerce_dataset(
        self,
        dataset: Union[pa.Table, Any],
        schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
        default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
    ) -> Tuple[Union[pa.Table, Any], Schema]:
        """Validate and coerce a dataset to match this schema's field types and constraints.

        Args:
            dataset: Dataset to validate and coerce (PyArrow Table, Daft DataFrame, etc.)
            schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
            default_schema_consistency_type: Default consistency type for new fields in AUTO mode

        Returns:
            Tuple[Dataset, Schema]: Dataset with data validated/coerced according to schema consistency types,
                and the (potentially updated) schema

        Raises:
            SchemaValidationError: If validation fails or coercion is not possible
            SchemaCompatibilityError: If schema evolution would break compatibility
        """
        # Handle PyArrow tables using existing method
        if get_dataset_type(dataset) == DatasetType.PYARROW:
            return self._validate_and_coerce_table(
                dataset,
                schema_evolution_mode,
                default_schema_consistency_type,
            )

        # Handle Daft DataFrames without collecting to memory
        if get_dataset_type(dataset) == DatasetType.DAFT:
            return self._validate_and_coerce_daft_dataframe(
                dataset,
                schema_evolution_mode,
                default_schema_consistency_type,
            )

        # Handle Ray Datasets by converting to Daft
        if get_dataset_type(dataset) == DatasetType.RAY_DATASET:
            daft_dataframe = dataset.to_daft()
            return self._validate_and_coerce_daft_dataframe(
                daft_dataframe,
                schema_evolution_mode,
                default_schema_consistency_type,
            )

        # For other types, convert to PyArrow and back
        # Don't pass schema during conversion as it may contain columns not yet in the dataset
        pa_table = to_pyarrow(dataset)
        coerced_table, updated_schema = self._validate_and_coerce_table(
            pa_table,
            schema_evolution_mode,
            default_schema_consistency_type,
        )
        return from_pyarrow(coerced_table, get_dataset_type(dataset)), updated_schema

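    # Example: a minimal sketch of validate_and_coerce_dataset() with automatic
    # schema evolution (the table and column names below are illustrative):
    #
    #   table = pa.table({"id": pa.array([1, 2], pa.int64()),
    #                     "extra": pa.array(["a", "b"])})
    #   coerced, evolved = schema.validate_and_coerce_dataset(
    #       table,
    #       schema_evolution_mode=SchemaEvolutionMode.AUTO,
    #       default_schema_consistency_type=SchemaConsistencyType.COERCE,
    #   )
    #   # `coerced` conforms to `evolved`, which now also contains "extra"
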
    def coerce(
        self,
        dataset: Union[pa.Table, pd.DataFrame, np.ndarray, Any],
        manifest_entry_schema: Optional[Schema] = None,
    ) -> Union[pa.Table, pd.DataFrame, np.ndarray, Any]:
        """Coerce a dataset to match this schema using field type promotion.

        This method processes different dataset types and applies type promotion
        using the field's promote_type_if_needed method. It handles:
        - PyArrow Tables
        - Pandas DataFrames
        - NumPy arrays (1D and 2D)
        - Polars DataFrames (if available)
        - Daft DataFrames (if available)
        - Other types with to_arrow() method

        For each column:
        - Fields that exist in both dataset and schema: applies type promotion
        - Fields in dataset but not in schema: preserves as-is
        - Fields in schema but not in dataset: adds with null or past default values
        - Reorders columns to match schema order

        Args:
            dataset: Dataset to coerce to this schema
            manifest_entry_schema: Original manifest entry schema used to write the dataset.

        Returns:
            Dataset of the same type, coerced to match this schema.

        Raises:
            SchemaValidationError: If coercion fails
        """
        if not self.field_ids_to_fields:
            # No fields defined in schema, return original dataset
            return dataset

        # Convert dataset to PyArrow table for processing
        pa_table = to_pyarrow(
            dataset,
            schema=manifest_entry_schema.arrow if manifest_entry_schema else None,
        )

        # Process columns using field coercion
        coerced_columns, coerced_fields = self._coerce_table_columns(pa_table)

        # Reorder columns to match schema order
        reordered_columns, reordered_fields = self._reorder_columns_to_schema(
            coerced_columns, coerced_fields, pa_table
        )

        # Create new table with processed columns
        coerced_table = pa.table(reordered_columns, schema=pa.schema(reordered_fields))

        # Convert back to original dataset type
        return from_pyarrow(coerced_table, get_dataset_type(dataset))

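    # Example: a hedged sketch of coerce() reading back data written with an
    # older manifest entry schema (the dataset and column names are illustrative):
    #
    #   old_data = pd.DataFrame({"id": [1, 2]})       # written before "name" existed
    #   aligned = current_schema.coerce(old_data)     # returns a pandas DataFrame
    #   # "name" is backfilled with nulls or its past_default, and the columns
    #   # are reordered to match current_schema
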
    def _validate_and_coerce_daft_dataframe(
        self,
        dataframe: Any,  # DaftDataFrame type
        schema_evolution_mode: Optional[SchemaEvolutionMode] = None,
        default_schema_consistency_type: Optional[SchemaConsistencyType] = None,
    ) -> Tuple[Any, Schema]:
        """Validate and coerce a Daft DataFrame without collecting to memory.

        This method processes Daft DataFrames column by column using Daft expressions
        for validation and coercion, avoiding memory collection.

        Args:
            dataframe: Daft DataFrame to validate and coerce
            schema_evolution_mode: How to handle fields not in schema (MANUAL or AUTO)
            default_schema_consistency_type: Default consistency type for new fields in AUTO mode

        Returns:
            Tuple[DaftDataFrame, Schema]: Processed DataFrame and updated schema

        Raises:
            SchemaValidationError: If validation fails or coercion is not possible
            SchemaCompatibilityError: If schema evolution would break compatibility
        """
        if not self.field_ids_to_fields:
            # No fields defined in schema, return original dataframe
            return dataframe, self

        # Setup
        field_name_to_field = self._create_field_name_mapping()
        field_updates = {}  # field_name -> updated_field
        new_fields = {}  # field_name -> new_field
        processed_dataframe = dataframe

        # Process each column in the dataframe
        for column_name in dataframe.column_names:
            column_type = dataframe.schema()[column_name].dtype.to_arrow_dtype()

            (
                processed_dataframe,
                schema_field,
                field_update,
                new_field,
            ) = self._process_existing_daft_column(
                processed_dataframe,
                column_name,
                column_type,
                field_name_to_field,
                schema_evolution_mode,
                default_schema_consistency_type,
            )

            if field_update:
                field_updates[column_name] = field_update
            if new_field:
                new_fields[column_name] = new_field

        # Add any missing fields from schema
        dataframe_column_names = set(dataframe.column_names)
        processed_dataframe = self._add_missing_schema_fields_daft(
            processed_dataframe, dataframe_column_names
        )

        # Apply schema updates if any modifications were made
        updated_schema = self._apply_schema_updates(field_updates, new_fields)

        return processed_dataframe, updated_schema

    def _process_existing_daft_column(
        self,
        dataframe: Any,  # DaftDataFrame type
        column_name: str,
        column_type: pa.DataType,
        field_name_to_field: Dict[str, Field],
        schema_evolution_mode: Optional[SchemaEvolutionMode],
        default_schema_consistency_type: Optional[SchemaConsistencyType],
    ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
        """Process a Daft DataFrame column that exists in the dataset.

        Args:
            dataframe: Daft DataFrame to process
            column_name: Name of the column to process
            column_type: PyArrow DataType of the column
            field_name_to_field: Mapping from field names to Field objects
            schema_evolution_mode: How to handle fields not in schema
            default_schema_consistency_type: Default consistency type for new fields

        Returns:
            Tuple of (processed_dataframe, schema_field, field_update, new_field)
        """
        if column_name in field_name_to_field:
            # Field exists in schema - validate/coerce according to consistency type
            field = field_name_to_field[column_name]

            if field.consistency_type == SchemaConsistencyType.VALIDATE:
                field.validate(column_type)
                return dataframe, field.arrow, None, None
            elif field.consistency_type == SchemaConsistencyType.COERCE:
                coerced_dataframe = field.coerce_daft(dataframe, column_name)
                return coerced_dataframe, field.arrow, None, None
            else:
                # NONE or no consistency type - use type promotion
                return self._handle_daft_type_promotion(
                    dataframe, column_name, column_type, field
                )
        else:
            # Field not in schema - handle based on evolution mode
            return self._handle_new_daft_field(
                dataframe,
                column_name,
                column_type,
                schema_evolution_mode,
                default_schema_consistency_type,
            )

    def _handle_daft_type_promotion(
        self,
        dataframe: Any,  # DaftDataFrame type
        column_name: str,
        column_type: pa.DataType,
        field: Field,
    ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
        """Handle type promotion for a Daft column with NONE consistency type."""
        # Create a dummy array to check type promotion
        dummy_array = pa.array([None], type=column_type)
        promoted_data, type_was_promoted = field.promote_type_if_needed(dummy_array)

        if type_was_promoted:
            # Cast the Daft column to the promoted type
            promoted_dataframe = field.coerce_daft(
                dataframe, column_name, promoted_data.type
            )

            # Cast default values to match the promoted type
            promoted_past_default = (
                field._cast_default_to_promoted_type(
                    field.past_default, promoted_data.type
                )
                if field.past_default is not None
                else None
            )
            promoted_future_default = (
                field._cast_default_to_promoted_type(
                    field.future_default, promoted_data.type
                )
                if field.future_default is not None
                else None
            )

            # Create updated field with promoted type
            promoted_field = pa.field(
                field.arrow.name,
                promoted_data.type,
                field.arrow.nullable,
                field.arrow.metadata,
            )

            updated_field = Field.of(
                promoted_field,
                field_id=field.id,
                past_default=promoted_past_default,
                future_default=promoted_future_default,
                consistency_type=field.consistency_type,
                path=field.path,
                native_object=field.native_object,
            )

            return promoted_dataframe, promoted_field, updated_field, None
        else:
            return dataframe, field.arrow, None, None

    def _handle_new_daft_field(
        self,
        dataframe: Any,  # DaftDataFrame type
        column_name: str,
        column_type: pa.DataType,
        schema_evolution_mode: Optional[SchemaEvolutionMode],
        default_schema_consistency_type: Optional[SchemaConsistencyType],
    ) -> Tuple[Any, pa.Field, Optional[Field], Optional[Field]]:
        """Handle a field that's not in the schema for Daft DataFrames."""
        if schema_evolution_mode == SchemaEvolutionMode.AUTO:
            # Create new field with default consistency type
            next_field_id = self.max_field_id + 1
            new_field = Field.of(
                field=pa.field(column_name, column_type),
                field_id=next_field_id,
                consistency_type=default_schema_consistency_type
                or SchemaConsistencyType.NONE,
            )
            return dataframe, new_field.arrow, None, new_field
        else:
            # MANUAL mode or not specified - raise error
            raise SchemaValidationError(
                f"Field '{column_name}' is not present in the schema and schema evolution mode is '{schema_evolution_mode}'"
            )

    def _add_missing_schema_fields_daft(
        self,
        dataframe: Any,  # DaftDataFrame type
        dataframe_column_names: set,
    ) -> Any:
        """Add columns for fields that exist in schema but not in Daft DataFrame."""
        processed_dataframe = dataframe

        for field in self.field_ids_to_fields.values():
            if field.arrow.name not in dataframe_column_names:
                # Add column with null values or default value to Daft DataFrame
                if field.future_default is not None:
                    # Convert default value to Daft literal
                    processed_dataframe = processed_dataframe.with_column(
                        field.arrow.name,
                        daft.lit(field.future_default).cast(
                            daft.DataType.from_arrow_type(field.arrow.type)
                        ),
                    )
                elif field.arrow.nullable:
                    # Add null column
                    processed_dataframe = processed_dataframe.with_column(
                        field.arrow.name,
                        daft.lit(None).cast(
                            daft.DataType.from_arrow_type(field.arrow.type)
                        ),
                    )
                else:
                    raise SchemaValidationError(
                        f"Field '{field.arrow.name}' is required but not present and no future_default is set"
                    )

        return processed_dataframe

    @property
    def fields(self) -> List[Field]:
        field_ids_to_fields = self.field_ids_to_fields
        return list(field_ids_to_fields.values())

    @property
    def merge_keys(self) -> Optional[List[FieldId]]:
        return self.get("mergeKeys")

    @property
    def field_ids_to_fields(self) -> Dict[FieldId, Field]:
        return self.get("fieldIdsToFields")

    @property
    def arrow(self) -> pa.Schema:
        return self["arrow"]

    @property
    def max_field_id(self) -> FieldId:
        return self["maxFieldId"]

    @property
    def id(self) -> SchemaId:
        return Schema._schema_id(self.arrow)

    def subschema(self, name: SchemaName) -> Optional[Schema]:
        subschemas = self.subschemas
        return subschemas.get(name) if subschemas else None

    @property
    def subschemas(self) -> Dict[SchemaName, Schema]:
        # return cached subschemas first if they exist
        subschemas = self.get("subschemas")
        if not subschemas:
            # retrieve any defined subschemas
            subschemas_to_field_ids = self.subschemas_to_field_ids
            # rebuild and return the subschema cache
            if subschemas_to_field_ids:
                subschemas = {
                    schema_name: Schema.of(
                        schema=pa.schema(
                            [self.field(field_id).arrow for field_id in field_ids]
                        ),
                        schema_id=self.id,
                        native_object=self.native_object,
                    )
                    for schema_name, field_ids in subschemas_to_field_ids.items()
                }
                self["subschemas"] = subschemas
        return subschemas or {}

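    # Example: a minimal sketch of subschema lookup on a multi-schema instance
    # (the subschema names below are illustrative):
    #
    #   multi = Schema.of({"events": pa.schema([("id", pa.int64())]),
    #                      "users": pa.schema([("user_id", pa.int64())])})
    #   multi.subschemas.keys()      # mapping of subschema name -> Schema
    #   multi.subschema("events")    # single subschema lookup, None if absent
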
    def subschema_field_ids(self, name: SchemaName) -> Optional[List[FieldId]]:
        return self.subschemas_to_field_ids.get(name)

    @property
    def subschemas_to_field_ids(self) -> Dict[SchemaName, List[FieldId]]:
        return Schema._subschemas(self.arrow)

    @property
    def native_object(self) -> Optional[Any]:
        return self.get("nativeObject")

    @staticmethod
    def _schema_id(schema: pa.Schema) -> SchemaId:
        schema_id = None
        if schema.metadata:
            bytes_val = schema.metadata.get(SCHEMA_ID_KEY_NAME)
            schema_id = int(bytes_val.decode()) if bytes_val else None
        return schema_id

    @staticmethod
    def _subschemas(
        schema: pa.Schema,
    ) -> Dict[SchemaName, List[FieldId]]:
        subschemas = None
        if schema.metadata:
            bytes_val = schema.metadata.get(SUBSCHEMAS_KEY_NAME)
            subschemas = _decode_metadata_value(bytes_val) if bytes_val else None
        return subschemas

    @staticmethod
    def _field_name_to_field_id(
        schema: pa.Schema,
        name: Union[FieldName, NestedFieldName],
    ) -> FieldId:
        if isinstance(name, str):
            return Field.of(schema.field(name)).id
        if isinstance(name, List):
            if not len(name):
                raise ValueError(f"Nested field name `{name}` is empty.")
            field = schema
            for part in name:
                field = field[part]
            return Field.of(field).id
        raise ValueError(f"Unknown field name type: {type(name)}")

    @staticmethod
    def _visit_fields(
        current: Union[pa.Schema, pa.Field],
        visit: Callable,
        path: Optional[NestedFieldName] = None,
        *args,
        **kwargs,
    ) -> None:
        """
        Recursively visit all fields in a PyArrow schema, including nested
        fields.

        Args:
            current: The schema or field to visit.
            visit: A function that visits the current field.
            path: The current path to the field.
            *args: Additional args to pass to the visit function.
            **kwargs: Additional keyword args to pass to the visit function.
        Returns:
            None
        """
        path = [] if path is None else path
        if isinstance(current, pa.Schema):
            for field in current:
                Schema._visit_fields(
                    field,
                    visit,
                    path,
                    *args,
                    **kwargs,
                )
        elif isinstance(current, pa.Field):
            path.append(current.name)
            visit(current, path, *args, **kwargs)
            if pa.types.is_nested(current.type):
                if isinstance(current.type, pa.StructType):
                    for field in current.type:
                        Schema._visit_fields(
                            field,
                            visit,
                            path,
                            *args,
                            **kwargs,
                        )
                elif isinstance(current.type, pa.ListType):
                    Schema._visit_fields(
                        current.type.value_field,
                        visit,
                        path,
                        *args,
                        **kwargs,
                    )
                elif isinstance(current.type, pa.MapType):
                    Schema._visit_fields(
                        current.type.key_field,
                        visit,
                        path,
                        *args,
                        **kwargs,
                    )
                    Schema._visit_fields(
                        current.type.item_field,
                        visit,
                        path,
                        *args,
                        **kwargs,
                    )
            path.pop()
        else:
            raise ValueError(f"Unexpected Schema Field Type: {type(current)}")

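    # Example: a hedged sketch of the _visit_fields() visitor collecting every
    # nested field path in a schema (the collector callback below is hypothetical):
    #
    #   paths = []
    #
    #   def collect(field, path, **kwargs):
    #       paths.append(list(path))
    #
    #   Schema._visit_fields(
    #       current=pa.schema([("point", pa.struct([("x", pa.float64())]))]),
    #       visit=collect,
    #   )
    #   # paths -> [["point"], ["point", "x"]]
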
    @staticmethod
    def _find_max_field_id(
        field: pa.Field,
        path: NestedFieldName,
        visitor_dict: Dict[str, Any],
    ) -> None:
        max_field_id = max(
            visitor_dict.get("maxFieldId", 0),
            Field.of(field).id or 0,
        )
        visitor_dict["maxFieldId"] = max_field_id

    @staticmethod
    def _populate_fields(
        field: pa.Field,
        path: NestedFieldName,
        visitor_dict: Dict[str, Any],
    ) -> None:
        field_ids_to_fields = visitor_dict["fieldIdsToFields"]
        dc_field = Field.of(field)
        if dc_field is not None and dc_field.id is not None:
            field_id = dc_field.id
        else:
            field_id = (
                visitor_dict["maxFieldId"] + len(field_ids_to_fields)
            ) % MAX_FIELD_ID_EXCLUSIVE

        if (dupe := field_ids_to_fields.get(field_id)) is not None:
            raise ValueError(
                f"Duplicate field id {field_id} for field: {field} "
                f"Already assigned to field: {dupe}"
            )
        field = Field.of(
            field=field,
            field_id=field_id,
            path=path,
        )
        field_ids_to_fields[field_id] = field

    @staticmethod
    def _get_lower_case_field_names(
        schema: SingleSchema,
    ) -> List[str]:
        if isinstance(schema, pa.Schema):
            return [name.lower() for name in schema.names]
        elif isinstance(schema, List):  # List[Field]
            names = [f.arrow.name.lower() for f in schema if isinstance(f, Field)]
            if len(names) == len(schema):
                return names  # all items in list are valid Field objects
        raise ValueError(f"Unsupported schema argument: {schema}")

    @staticmethod
    def _validate_schema_name(name: str) -> None:
        if not name:
            raise ValueError(f"Schema name cannot be empty.")
        if len(name) > BYTES_PER_KIBIBYTE:
            raise ValueError(
                f"Invalid schema name `{name}`. Schema names "
                f"cannot be greater than {BYTES_PER_KIBIBYTE} "
                f"characters."
            )

    @staticmethod
    def _validate_field_names(
        schema: Union[SingleSchema, MultiSchema],
    ) -> None:
        all_names = []
        if isinstance(schema, dict):  # MultiSchema
            for schema_name, val in schema.items():
                Schema._validate_schema_name(schema_name)
                all_names.extend(Schema._get_lower_case_field_names(val))
        else:  # SingleSchema
            all_names.extend(Schema._get_lower_case_field_names(schema))
        if not all_names:
            raise ValueError(f"Schema must contain at least one field.")
        name_set = set()
        dupes = []
        for name in all_names:
            dupes.append(name) if name in name_set else name_set.add(name)
        if dupes:
            raise ValueError(
                f"Expected all schema fields to have unique names "
                f"(case-insensitive), but found the following duplicates: "
                f"{dupes}"
            )

    @staticmethod
    def _to_pyarrow_schema(schema: SingleSchema) -> pa.Schema:
        if isinstance(schema, pa.Schema):
            return schema
        elif isinstance(schema, List):  # List[Field]
            return pa.schema(fields=[field.arrow for field in schema])
        else:
            raise ValueError(f"Unsupported schema base type: {schema}")

    @staticmethod
    def _to_unified_pyarrow_schema(
        schema: Union[SingleSchema, MultiSchema],
    ) -> Tuple[pa.Schema, Dict[SchemaName, List[FieldName]]]:
        # first, ensure all field names are valid and contain no duplicates
        Schema._validate_field_names(schema)
        # now union all schemas into a single schema
        subschema_to_field_names = {}
        if isinstance(schema, dict):  # MultiSchema
            all_schemas = []
            for schema_name, schema_val in schema.items():
                pyarrow_schema = Schema._to_pyarrow_schema(schema_val)
                all_schemas.append(pyarrow_schema)
                subschema_to_field_names[schema_name] = [
                    field.name for field in pyarrow_schema
                ]
            return pa.unify_schemas(all_schemas), subschema_to_field_names
        return Schema._to_pyarrow_schema(schema), {}  # SingleSchema

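    # Example: a minimal sketch of how multiple named subschemas are unified
    # (illustrative names; the union itself is delegated to pa.unify_schemas):
    #
    #   unified, names = Schema._to_unified_pyarrow_schema(
    #       {"a": pa.schema([("id", pa.int64())]),
    #        "b": pa.schema([("name", pa.string())])}
    #   )
    #   # unified.names -> ["id", "name"]
    #   # names -> {"a": ["id"], "b": ["name"]}
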
    def _get_fields_with_merge_order(self) -> List[Field]:
        """Get all fields that have merge_order defined.

        Returns:
            List of fields with merge_order defined, or empty list if none
        """
        return [field for field in self.fields if field.merge_order is not None]

    def _create_sort_keys_from_merge_order_fields(
        self, fields_with_merge_order: List[Field]
    ) -> List[SortKey]:
        """Create sort keys from fields with explicit merge_order.

        Args:
            fields_with_merge_order: List of fields with merge_order defined

        Returns:
            List of SortKey objects with inverted sort order for deduplication
        """
        from deltacat.storage.model.sort_key import SortKey

        sort_keys = []
        for field in fields_with_merge_order:
            merge_order = field.merge_order
            desired_sort_order = merge_order[0]

            # Invert the sort order because deduplication keeps the "last" record
            # ASCENDING merge_order (keep smallest) → DESCENDING sort (smallest appears last)
            # DESCENDING merge_order (keep largest) → ASCENDING sort (largest appears last)
            if desired_sort_order == SortOrder.ASCENDING:
                actual_sort_order = SortOrder.DESCENDING
            else:
                actual_sort_order = SortOrder.ASCENDING

            sort_key = SortKey.of(
                key=[field.arrow.name],
                sort_order=actual_sort_order,
                null_order=merge_order[1],  # NullOrder (AT_START/AT_END)
            )
            sort_keys.append(sort_key)
        return sort_keys

    def _get_event_time_fields(self) -> List[Field]:
        """Get all fields marked as event_time.

        Returns:
            List of event_time fields, or empty list if none
        """
        return [field for field in self.fields if field.is_event_time]

    def _create_sort_keys_from_event_time_fields(
        self, event_time_fields: List[Field]
    ) -> List:
        """Create sort keys from event_time fields with default DESCENDING merge_order.

        Args:
            event_time_fields: List of event_time fields

        Returns:
            List of SortKey objects with ASCENDING sort order (inverted from DESCENDING merge_order)
        """
        from deltacat.storage.model.sort_key import SortKey

        sort_keys = []
        for field in event_time_fields:
            sort_key = SortKey.of(
                key=[field.arrow.name],
                sort_order=SortOrder.ASCENDING,  # Inverted: DESCENDING merge_order → ASCENDING sort
                null_order=NullOrder.AT_END,
            )
            sort_keys.append(sort_key)
        return sort_keys

    def _create_field_name_mapping(self) -> Dict[str, Field]:
        """Create a mapping from field names to Field objects."""
        field_name_to_field = {}
        for field in self.field_ids_to_fields.values():
            field_name_to_field[field.arrow.name] = field
        return field_name_to_field

    def _process_existing_table_column(
        self,
        column_name: str,
        column_data: pa.Array,
        field_name_to_field: Dict[str, Field],
        schema_evolution_mode: Optional[SchemaEvolutionMode],
        default_schema_consistency_type: Optional[SchemaConsistencyType],
    ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
        """Process a column that exists in the table.

        Returns:
            Tuple of (processed_column_data, schema_field, field_update, new_field)
        """
        if column_name in field_name_to_field:
            # Field exists in schema - validate/coerce according to consistency type
            field = field_name_to_field[column_name]

            if field.consistency_type == SchemaConsistencyType.VALIDATE:
                field.validate(column_data.type)
                return column_data, field.arrow, None, None
            elif field.consistency_type == SchemaConsistencyType.COERCE:
                coerced_data = field.coerce(column_data)
                return coerced_data, field.arrow, None, None
            else:
                # NONE or no consistency type - use type promotion
                return self._handle_type_promotion(column_name, column_data, field)
        else:
            # Field not in schema - handle based on evolution mode
            return self._handle_new_field(
                column_name,
                column_data,
                schema_evolution_mode,
                default_schema_consistency_type,
            )

    def _handle_type_promotion(
        self, column_name: str, column_data: pa.Array, field: Field
    ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
        """Handle type promotion for a field with NONE consistency type."""
        promoted_data, type_was_promoted = field.promote_type_if_needed(column_data)

        if type_was_promoted:
            # Cast default values to match the promoted type
            promoted_past_default = (
                field._cast_default_to_promoted_type(
                    field.past_default, promoted_data.type
                )
                if field.past_default is not None
                else None
            )

            promoted_future_default = (
                field._cast_default_to_promoted_type(
                    field.future_default, promoted_data.type
                )
                if field.future_default is not None
                else None
            )

            # Create updated field with same properties but new type and cast defaults
            promoted_field = pa.field(
                field.arrow.name,
                promoted_data.type,
                nullable=field.arrow.nullable,
                metadata=field.arrow.metadata,
            )

            updated_field = Field.of(
                promoted_field,
                field_id=field.id,
                is_merge_key=field.is_merge_key,
                merge_order=field.merge_order,
                is_event_time=field.is_event_time,
                doc=field.doc,
                past_default=promoted_past_default,
                future_default=promoted_future_default,
                consistency_type=field.consistency_type,
                path=field.path,
                native_object=field.native_object,
            )

            return promoted_data, promoted_field, updated_field, None
        else:
            return promoted_data, field.arrow, None, None

    def _handle_new_field(
        self,
        column_name: str,
        column_data: pa.Array,
        schema_evolution_mode: Optional[SchemaEvolutionMode],
        default_schema_consistency_type: Optional[SchemaConsistencyType],
    ) -> Tuple[pa.Array, pa.Field, Optional[Field], Optional[Field]]:
        """Handle a field that's not in the schema."""
        if schema_evolution_mode == SchemaEvolutionMode.AUTO:
            # Create new field with default consistency type
            next_field_id = self.max_field_id + 1
            new_field = Field.of(
                pa.field(column_name, column_data.type, nullable=True),
                field_id=next_field_id,
                consistency_type=default_schema_consistency_type
                or SchemaConsistencyType.NONE,
            )
            return column_data, new_field.arrow, None, new_field
        else:
            # MANUAL mode or disabled - raise error
            raise SchemaValidationError(
                f"Field '{column_name}' is not present in the schema and schema evolution mode is '{schema_evolution_mode}'"
            )

    def _add_missing_schema_fields(
        self,
        table: pa.Table,
        table_column_names: set,
        new_columns: List[pa.Array],
        new_schema_fields: List[pa.Field],
    ) -> None:
        """Add columns for fields that exist in schema but not in table."""
        for field in self.field_ids_to_fields.values():
            if field.arrow.name not in table_column_names:
                # Use future_default if available, otherwise check if nullable
                if field.future_default is not None:
                    # Create column with future_default value
                    default_array = pa.array(
                        [field.future_default] * get_table_length(table),
                        type=field.arrow.type,
                    )
                    new_columns.append(default_array)
                elif field.arrow.nullable:
                    # Backfill with nulls if field is nullable
                    null_column = pa.nulls(
                        get_table_length(table), type=field.arrow.type
                    )
                    new_columns.append(null_column)
                else:
                    # Field is not nullable and no future_default - error
                    raise SchemaValidationError(
                        f"Field '{field.arrow.name}' is required but not present and no future_default is set"
                    )
                new_schema_fields.append(field.arrow)

    def _apply_schema_updates(
        self, field_updates: Dict[str, Field], new_fields: Dict[str, Field]
    ) -> Schema:
        """Apply collected schema updates and return the updated schema."""
        if not field_updates and not new_fields:
            return self

        # Initialize schema update with allow_incompatible_changes=True for type promotion
        schema_update = self.update(allow_incompatible_changes=True)

        # Apply field updates
        for field_name, updated_field in field_updates.items():
            schema_update = schema_update._update_field(field_name, updated_field)

        # Apply new fields
        for field_name, new_field in new_fields.items():
            schema_update = schema_update.add_field(new_field)

        # Apply all updates
        return schema_update.apply()

    def _process_existing_columns_for_coercion(
        self, pa_table: pa.Table, field_name_to_field: Dict[str, Field]
    ) -> Tuple[List[pa.Array], List[pa.Field]]:
        """Process columns that exist in the table for coercion.

        Args:
            pa_table: PyArrow table to process
            field_name_to_field: Mapping from field names to Field objects

        Returns:
            Tuple of (processed columns, corresponding fields)
        """
        new_columns = []
        new_schema_fields = []

        for column_name in pa_table.column_names:
            column_data = pa_table.column(column_name)

            if column_name in field_name_to_field:
                # Field exists in target schema - use promote_type_if_needed for coercion
                field = field_name_to_field[column_name]
                promoted_data, _ = field.promote_type_if_needed(column_data)
                new_columns.append(promoted_data)
                new_schema_fields.append(field.arrow)
            else:
                # Field not in target schema - preserve as-is
                new_columns.append(column_data)
                new_schema_fields.append(pa.field(column_name, column_data.type))

        return new_columns, new_schema_fields

    def _add_missing_fields_for_coercion(
        self,
        pa_table: pa.Table,
        field_name_to_field: Dict[str, Field],
        existing_columns: List[pa.Array],
        existing_fields: List[pa.Field],
    ) -> Tuple[List[pa.Array], List[pa.Field]]:
        """Add columns for fields that exist in schema but not in table.

        Args:
            pa_table: Original PyArrow table
            field_name_to_field: Mapping from field names to Field objects
            existing_columns: Columns already processed
            existing_fields: Fields already processed

        Returns:
            Tuple of (all columns including added ones, all corresponding fields)
        """
        all_columns = existing_columns.copy()
        all_fields = existing_fields.copy()

        # Add any missing fields from target schema with null values or past_default values
        target_field_names = {
            field.arrow.name for field in self.field_ids_to_fields.values()
        }
        table_field_names = set(pa_table.column_names)

        for field_name in target_field_names - table_field_names:
            field = field_name_to_field[field_name]

            # Check if field has past_default value and use it instead of nulls
            if field.past_default is not None:
                # Create array filled with past_default value
                default_column = pa.array(
                    [field.past_default] * get_table_length(pa_table),
                    type=field.arrow.type,
                )
                all_columns.append(default_column)
            else:
                # Use null values as before
                null_column = pa.nulls(
                    get_table_length(pa_table), type=field.arrow.type
                )
                all_columns.append(null_column)

            all_fields.append(field.arrow)

        return all_columns, all_fields

    def _coerce_table_columns(
        self, pa_table: pa.Table
    ) -> Tuple[List[pa.Array], List[pa.Field]]:
        """Process table columns using field coercion and add missing fields.

        Args:
            pa_table: PyArrow table to process

        Returns:
            Tuple of (list of coerced columns, list of corresponding fields)
        """
        # Create mapping from field names to Field objects
        field_name_to_field = self._create_field_name_mapping()

        # Process existing columns in the table
        (
            processed_columns,
            processed_fields,
        ) = self._process_existing_columns_for_coercion(pa_table, field_name_to_field)

        # Add any missing fields from target schema
        all_columns, all_fields = self._add_missing_fields_for_coercion(
            pa_table, field_name_to_field, processed_columns, processed_fields
        )

        return all_columns, all_fields

    def _reorder_columns_to_schema(
        self, columns: List[pa.Array], fields: List[pa.Field], original_table: pa.Table
    ) -> Tuple[List[pa.Array], List[pa.Field]]:
        """Reorder columns to match schema order, preserving extra fields.

        Args:
            columns: List of processed columns
            fields: List of corresponding field schemas
            original_table: Original table for field name ordering

        Returns:
            Tuple of (reordered columns, reordered fields)
        """
        # Reorder columns to match schema order
        reordered_columns = []
        reordered_fields = []
        schema_field_names = [
            field.arrow.name for field in self.field_ids_to_fields.values()
        ]

        # Add schema fields in schema order
        for field_name in schema_field_names:
            for i, field in enumerate(fields):
                if field.name == field_name:
                    reordered_columns.append(columns[i])
                    reordered_fields.append(field)
                    break

        # Add any extra fields that aren't in schema (preserve original order)
        target_field_names = set(schema_field_names)
        table_field_names = set(original_table.column_names)
        extra_field_names = table_field_names - target_field_names

        for field_name in original_table.column_names:
            if field_name in extra_field_names:
                for i, field in enumerate(fields):
                    if field.name == field_name:
                        reordered_columns.append(columns[i])
                        reordered_fields.append(field)
                        break

        return reordered_columns, reordered_fields

    @staticmethod
    def _del_subschema(
        name: SchemaName,
        subschemas: Dict[SchemaName, Schema],
    ) -> Dict[SchemaName, Schema]:
        deleted_subschema = subschemas.pop(name, None)
        if deleted_subschema is None:
            raise ValueError(f"Subschema `{name}` does not exist.")
        return subschemas

    @staticmethod
    def _add_subschema(
        name: SchemaName,
        schema: SingleSchema,
        subschemas: Dict[SchemaName, Schema],
    ) -> Dict[SchemaName, Schema]:
        Schema._validate_schema_name(name)
        if name == BASE_SCHEMA_NAME:
            raise ValueError(
                f"Cannot add subschema with reserved name: {BASE_SCHEMA_NAME}"
            )
        if name in subschemas:
            raise ValueError(f"Subschema `{name}` already exists.")
        for key, val in subschemas.items():
            subschemas[key] = val.arrow
        subschemas[name] = schema
        return subschemas


class SchemaList(List[Schema]):
    @staticmethod
    def of(items: List[Schema]) -> SchemaList:
        typed_items = SchemaList()
        for item in items:
            if item is not None and not isinstance(item, Schema):
                item = Schema(item)
            typed_items.append(item)
        return typed_items

    def __getitem__(self, item):
        val = super().__getitem__(item)
        if val is not None and not isinstance(val, Schema):
            self[item] = val = Schema(val)
        return val

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]  # This triggers __getitem__ conversion


class SchemaUpdate(dict):
    """
    Provides safe schema evolution capabilities for DeltaCAT schemas.

    SchemaUpdate allows users to:
    1. Add new fields to a schema
    2. Remove existing fields from a schema
    3. Update existing fields with compatible changes
    4. Validate schema compatibility to prevent breaking existing dataset consumers

    The class enforces backward compatibility by default to ensure that table
    consumer jobs written using PyArrow, Pandas, Polars, Ray Data, Daft, and other
    dataset types continue to work after schema changes.

    Example:
        Using Schema.update():
            >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
            >>> new_field = Field.of(pa.field("name", pa.string()))
            >>> updated_schema = (schema.update()
            ...     .add_field(new_field)
            ...     .apply())

        Using SchemaUpdate.of():
            >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
            >>> update = SchemaUpdate.of(schema)
            >>> new_field = Field.of(pa.field("name", pa.string()))
            >>> updated_schema = update.add_field(new_field).apply()
    """

    @staticmethod
    def of(
        base_schema: Schema, allow_incompatible_changes: bool = False
    ) -> SchemaUpdate:
        """
        Create a SchemaUpdate for the given base schema.

        Args:
            base_schema: The original schema to update
            allow_incompatible_changes: If True, allows changes that may break
                backward compatibility. If False (default), raises SchemaCompatibilityError
                for incompatible changes.

        Returns:
            A new SchemaUpdate instance
        """
        return SchemaUpdate(
            {
                "baseSchema": base_schema,
                "allowIncompatibleChanges": allow_incompatible_changes,
                "operations": SchemaUpdateOperations.of([]),
            }
        )

@property
|
2238
|
+
def base_schema(self) -> Schema:
|
2239
|
+
"""Get the base schema being updated."""
|
2240
|
+
return self["baseSchema"]
|
2241
|
+
|
2242
|
+
@base_schema.setter
|
2243
|
+
def base_schema(self, value: Schema) -> None:
|
2244
|
+
"""Set the base schema being updated."""
|
2245
|
+
self["baseSchema"] = value
|
2246
|
+
|
2247
|
+
@property
|
2248
|
+
def allow_incompatible_changes(self) -> bool:
|
2249
|
+
"""Get whether incompatible changes are allowed."""
|
2250
|
+
return self["allowIncompatibleChanges"]
|
2251
|
+
|
2252
|
+
@allow_incompatible_changes.setter
|
2253
|
+
def allow_incompatible_changes(self, value: bool) -> None:
|
2254
|
+
"""Set whether incompatible changes are allowed."""
|
2255
|
+
self["allowIncompatibleChanges"] = value
|
2256
|
+
|
2257
|
+
@property
|
2258
|
+
def operations(self) -> SchemaUpdateOperations:
|
2259
|
+
"""Get the list of pending operations."""
|
2260
|
+
return self["operations"]
|
2261
|
+
|
2262
|
+
@operations.setter
|
2263
|
+
def operations(self, value: SchemaUpdateOperations) -> None:
|
2264
|
+
"""Set the list of pending operations."""
|
2265
|
+
self["operations"] = value
|
2266
|
+
|
2267
|
+

    def add_field(
        self,
        new_field: Field,
    ) -> SchemaUpdate:
        """
        Add a new field to the schema.

        Args:
            new_field: The Field object to add

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field already exists or addition would break compatibility
        """
        self.operations.append(SchemaUpdateOperation.add_field(new_field))
        return self

    def remove_field(self, field_locator: FieldLocator) -> SchemaUpdate:
        """
        Remove an existing field from the schema.

        Args:
            field_locator: Location identifier for the field to remove

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field doesn't exist or removal would break compatibility
        """
        self.operations.append(SchemaUpdateOperation.remove_field(field_locator))
        return self
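
    # Illustrative sketch (hypothetical field names): adding a nullable field is
    # a compatible change, while removing a field is only accepted when the
    # update was created with allow_incompatible_changes=True.
    # >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
    # >>> added = (
    # ...     SchemaUpdate.of(schema)
    # ...     .add_field(Field.of(pa.field("name", pa.string())))
    # ...     .apply()
    # ... )
    # >>> trimmed = (
    # ...     SchemaUpdate.of(added, allow_incompatible_changes=True)
    # ...     .remove_field("name")
    # ...     .apply()
    # ... )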

    def rename_field(
        self,
        field_locator: FieldLocator,
        new_name: str,
    ) -> SchemaUpdate:
        """
        Rename an existing field while keeping all other properties the same.

        Args:
            field_locator: Location identifier for the field to rename
            new_name: The new name for the field

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field doesn't exist or rename would break compatibility
        """
        # Get the existing field
        existing_field = self._get_existing_field(field_locator)

        # Create a deep copy of the field
        updated_field = copy.deepcopy(existing_field)

        # Update only the arrow field name
        updated_field["arrow"] = pa.field(
            new_name,
            existing_field.arrow.type,
            nullable=existing_field.arrow.nullable,
            metadata=existing_field.arrow.metadata,
        )

        return self._update_field(field_locator, updated_field)

    def update_field_type(
        self, field_locator: FieldLocator, new_type: pa.DataType
    ) -> SchemaUpdate:
        """
        Update the PyArrow data type of an existing field while keeping all other properties the same.

        Args:
            field_locator: Location identifier for the field to update
            new_type: The new PyArrow data type for the field

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field doesn't exist or type change would break compatibility
        """
        # Get the existing field
        existing_field = self._get_existing_field(field_locator)

        # Create a deep copy of the field
        updated_field = copy.deepcopy(existing_field)

        # Update only the arrow field type
        updated_field["arrow"] = pa.field(
            existing_field.arrow.name,
            new_type,
            nullable=existing_field.arrow.nullable,
            metadata=existing_field.arrow.metadata,
        )

        return self._update_field(field_locator, updated_field)
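
    # Illustrative sketch (hypothetical fields): a rename and a float32 -> float64
    # widening are both compatible updates and can be chained on one SchemaUpdate.
    # >>> schema = Schema.of([
    # ...     Field.of(pa.field("qty", pa.int32())),
    # ...     Field.of(pa.field("price", pa.float32())),
    # ... ])
    # >>> evolved = (
    # ...     SchemaUpdate.of(schema)
    # ...     .rename_field("qty", "quantity")
    # ...     .update_field_type("price", pa.float64())
    # ...     .apply()
    # ... )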

    def update_field_doc(
        self,
        field_locator: FieldLocator,
        new_doc: Optional[str],
    ) -> SchemaUpdate:
        """
        Update the documentation of an existing field while keeping all other properties the same.

        Args:
            field_locator: Location identifier for the field to update
            new_doc: The new documentation string for the field

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field doesn't exist
        """
        # Get the existing field
        existing_field = self._get_existing_field(field_locator)

        # Create a deep copy of the field
        updated_field = copy.deepcopy(existing_field)

        # Update the arrow field metadata to set/remove doc
        new_metadata = copy.deepcopy(existing_field.arrow.metadata)
        new_metadata.pop(FIELD_DOC_KEY_NAME, None)
        if new_doc is not None:
            new_metadata[FIELD_DOC_KEY_NAME] = new_doc

        updated_field["arrow"] = pa.field(
            existing_field.arrow.name,
            existing_field.arrow.type,
            nullable=existing_field.arrow.nullable,
            metadata=new_metadata if new_metadata else None,
        )

        return self._update_field(field_locator, updated_field)

    def update_field_nullability(
        self, field_locator: FieldLocator, nullable: bool
    ) -> SchemaUpdate:
        """
        Update the nullability of an existing field while keeping all other properties the same.

        Args:
            field_locator: Location identifier for the field to update
            nullable: Whether the field should allow null values

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field doesn't exist or nullability change would break compatibility
        """
        # Get the existing field
        existing_field = self._get_existing_field(field_locator)

        # Create a deep copy of the field
        updated_field = copy.deepcopy(existing_field)

        # Update only the arrow field nullability
        updated_field["arrow"] = pa.field(
            existing_field.arrow.name,
            existing_field.arrow.type,
            nullable=nullable,
            metadata=existing_field.arrow.metadata,
        )

        return self._update_field(field_locator, updated_field)
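
    # Illustrative sketch (hypothetical field): attaching documentation is always
    # compatible, while making a nullable field non-nullable is rejected unless
    # the field carries both past_default and future_default values.
    # >>> schema = Schema.of([Field.of(pa.field("email", pa.string()))])
    # >>> documented = (
    # ...     SchemaUpdate.of(schema)
    # ...     .update_field_doc("email", "Primary contact address")
    # ...     .apply()
    # ... )
    # >>> SchemaUpdate.of(schema).update_field_nullability("email", False).apply()
    # Traceback (most recent call last):
    #     ...
    # SchemaCompatibilityError: ...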

    def update_field_consistency_type(
        self,
        field_locator: FieldLocator,
        consistency_type: Optional[SchemaConsistencyType],
    ) -> SchemaUpdate:
        """
        Update the schema consistency type of an existing field while keeping all other properties the same.

        Args:
            field_locator: Location identifier for the field to update
            consistency_type: The new schema consistency type for the field

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field doesn't exist
        """
        # Get the existing field
        existing_field = self._get_existing_field(field_locator)

        # Create a deep copy of the field
        updated_field = copy.deepcopy(existing_field)

        # Update the arrow field metadata to set/remove consistency type
        new_metadata = copy.deepcopy(existing_field.arrow.metadata)
        new_metadata.pop(FIELD_CONSISTENCY_TYPE_KEY_NAME, None)

        if consistency_type is not None:
            new_metadata[FIELD_CONSISTENCY_TYPE_KEY_NAME] = consistency_type.value

        updated_field["arrow"] = pa.field(
            existing_field.arrow.name,
            existing_field.arrow.type,
            nullable=existing_field.arrow.nullable,
            metadata=new_metadata if new_metadata else None,
        )

        return self._update_field(field_locator, updated_field)

    def update_field_future_default(
        self, field_locator: FieldLocator, future_default: Optional[Any]
    ) -> SchemaUpdate:
        """
        Update the future default value of an existing field while keeping all other properties the same.
        The future default is validated to ensure it's compatible with the field's data type.

        Args:
            field_locator: Location identifier for the field to update
            future_default: The new future default value for the field

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field doesn't exist
            ValueError: If future_default is not compatible with the field's data type
        """
        # Get the existing field
        existing_field = self._get_existing_field(field_locator)

        # Validate that the future_default is compatible with the field's type
        if future_default is not None:
            self._validate_default_value(existing_field.arrow.type, future_default)

        # Create a deep copy of the field
        updated_field = copy.deepcopy(existing_field)

        # Update the arrow field metadata to set/remove future default
        new_metadata = copy.deepcopy(existing_field.arrow.metadata)
        new_metadata.pop(FIELD_FUTURE_DEFAULT_KEY_NAME, None)

        if future_default is not None:
            new_metadata[FIELD_FUTURE_DEFAULT_KEY_NAME] = _encode_metadata_value(
                future_default
            )

        updated_field["arrow"] = pa.field(
            existing_field.arrow.name,
            existing_field.arrow.type,
            nullable=existing_field.arrow.nullable,
            metadata=new_metadata if new_metadata else None,
        )

        return self._update_field(field_locator, updated_field)
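
    # Illustrative sketch (hypothetical field): relaxing a consistency type and
    # setting a type-checked future default; an incompatible default raises
    # ValueError before any operation is recorded.
    # >>> schema = Schema.of([Field.of(pa.field("status", pa.string()))])
    # >>> update = (
    # ...     SchemaUpdate.of(schema)
    # ...     .update_field_consistency_type("status", SchemaConsistencyType.NONE)
    # ...     .update_field_future_default("status", "unknown")
    # ... )
    # >>> update.update_field_future_default("status", 123)  # not a string
    # Traceback (most recent call last):
    #     ...
    # ValueError: ...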

    def _update_field(
        self, field_locator: FieldLocator, updated_field: Field
    ) -> SchemaUpdate:
        """
        Update an existing field with compatible changes. This is the protected method
        that handles the general case of field updates.

        Args:
            field_locator: Location identifier for the field to update
            updated_field: The new Field object to replace the existing field

        Returns:
            Self for method chaining

        Raises:
            SchemaCompatibilityError: If field doesn't exist or update would break compatibility
        """
        self.operations.append(
            SchemaUpdateOperation.update_field(field_locator, updated_field)
        )
        return self

    def _get_existing_field(self, field_locator: FieldLocator) -> Field:
        """
        Helper method to retrieve an existing field, accounting for pending operations.

        Args:
            field_locator: Location identifier for the field to retrieve

        Returns:
            The existing Field object (with any pending updates applied)

        Raises:
            SchemaCompatibilityError: If field doesn't exist
        """
        field_name = self._get_field_name(field_locator)
        # Search for the field in the base schema
        base_field = None
        for field in self.base_schema.fields:
            field_field_name = field.path[0] if field.path else f"field_{field.id}"
            if field_field_name == field_name:
                base_field = field
                break

        if base_field is None:
            # Field not found
            raise SchemaCompatibilityError(
                f"Field '{field_name}' does not exist in schema", field_locator
            )

        # Apply any pending operations that affect this field to get the current state
        current_field = copy.deepcopy(base_field)

        for operation in self.operations:
            if operation.field_locator_matches(field_locator):
                # Apply this operation to get the cumulative state
                current_field = operation.field

        return current_field

    def _validate_default_value(
        self, arrow_type: pa.DataType, default_value: Any
    ) -> None:
        """
        Helper method to validate that a default value is compatible with a PyArrow data type.

        Args:
            arrow_type: The PyArrow data type to validate against
            default_value: The default value to validate

        Raises:
            ValueError: If the default value is not compatible with the data type
        """
        try:
            # Try to create a PyArrow array with the default value to validate compatibility
            pa.array([default_value], type=arrow_type)
        except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError, ValueError) as e:
            raise ValueError(
                f"Default value {default_value} is not compatible with type {arrow_type}: {e}"
            )
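
    # Illustrative sketch: default values are checked by round-tripping them
    # through a single-element PyArrow array of the target type, mirroring the
    # helper above (values shown are hypothetical).
    # >>> pa.array(["n/a"], type=pa.string())  # compatible, no error
    # <pyarrow.lib.StringArray object at ...>
    # >>> pa.array(["n/a"], type=pa.int64())   # incompatible, raises
    # Traceback (most recent call last):
    #     ...
    # pyarrow.lib.ArrowInvalid: ...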

    def apply(self) -> Schema:
        """
        Apply all pending operations and return the updated schema.

        Returns:
            New Schema object with all updates applied

        Raises:
            SchemaCompatibilityError: If any operation would break backward compatibility
                and allow_incompatible_changes is False
        """
        # Start with a copy of the base schema
        updated_fields = list(self.base_schema.fields)
        field_name_to_index = {
            field.path[0] if field.path else f"field_{field.id}": i
            for i, field in enumerate(updated_fields)
        }

        # Track next available field ID for new fields during schema evolution
        next_available_field_id = self.base_schema.max_field_id + 1
        if next_available_field_id >= MAX_FIELD_ID_EXCLUSIVE:
            # Just raise an error instead of wrapping to 0, since wrapping would
            # break our guarantee of unique field IDs across schema evolution
            # history (arbitrary IDs below the limit may already be assigned).
            raise SchemaCompatibilityError(
                f"Schema Field ID overflow: {next_available_field_id} >= {MAX_FIELD_ID_EXCLUSIVE}",
            )

        # Validate no conflicting operations before applying
        self._validate_no_conflicting_operations()

        # Apply operations in order
        for operation in self.operations:
            if operation.operation == "add":
                next_available_field_id = self._apply_add_field(
                    updated_fields,
                    field_name_to_index,
                    operation.field,
                    next_available_field_id,
                )
            elif operation.operation == "remove":
                self._apply_remove_field(
                    updated_fields,
                    field_name_to_index,
                    operation.field_locator,
                )
            elif operation.operation == "update":
                self._apply_update_field(
                    updated_fields,
                    field_name_to_index,
                    operation.field_locator,
                    operation.field,
                )

        # Create new schema from updated fields with incremented schema ID
        new_schema = Schema.of(updated_fields, schema_id=self.base_schema.id + 1)

        # Ensure max_field_id never decreases, even when fields are removed
        # This prevents field ID reuse across schema evolution history
        if new_schema.max_field_id < self.base_schema.max_field_id:
            new_schema["maxFieldId"] = self.base_schema.max_field_id

        return new_schema
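
    # Illustrative sketch (hypothetical fields): apply() assigns newly added
    # fields IDs starting at base_schema.max_field_id + 1 and increments the
    # schema ID, so field IDs from removed fields are never reused.
    # >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
    # >>> evolved = (
    # ...     SchemaUpdate.of(schema)
    # ...     .add_field(Field.of(pa.field("name", pa.string())))
    # ...     .apply()
    # ... )
    # >>> evolved.id == schema.id + 1
    # True
    # >>> evolved.max_field_id > schema.max_field_id
    # True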

    def _validate_no_conflicting_operations(self) -> None:
        """Validate that operations don't conflict with each other."""
        field_operations = {}  # field_name -> set of operations

        for operation in self.operations:
            field_name = None

            if operation.operation == "add" and operation.field:
                field_name = operation.field.arrow.name
            elif (
                operation.operation in ("remove", "update") and operation.field_locator
            ):
                # Extract field name from locator
                if isinstance(operation.field_locator, str):
                    field_name = operation.field_locator
                elif hasattr(operation.field_locator, "name"):
                    field_name = operation.field_locator.name
                elif (
                    isinstance(operation.field_locator, list)
                    and operation.field_locator
                ):
                    field_name = operation.field_locator[0]

            if field_name:
                if field_name not in field_operations:
                    field_operations[field_name] = set()
                field_operations[field_name].add(operation.operation)

        # Check for conflicting operations on same field
        for field_name, operations in field_operations.items():
            if len(operations) > 1:
                unique_ops = set(operations)
                # Allow multiple update operations on same field (they are cumulative)
                if unique_ops == {"update"}:
                    continue  # Multiple updates on same field are allowed
                # Any other combination is conflicting
                message_suffix = f"Cannot perform {', '.join(sorted(unique_ops))} operations on the same field"

                raise ValueError(
                    f"Conflicting operations detected on field '{field_name}': {sorted(operations)}. "
                    f"{message_suffix}."
                )
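
    # Illustrative sketch (hypothetical field): mixing different operation kinds
    # on the same field in a single update is rejected at apply() time.
    # >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
    # >>> update = (
    # ...     SchemaUpdate.of(schema, allow_incompatible_changes=True)
    # ...     .update_field_nullability("id", False)
    # ...     .remove_field("id")
    # ... )
    # >>> update.apply()
    # Traceback (most recent call last):
    #     ...
    # ValueError: Conflicting operations detected on field 'id': ...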

    def _apply_add_field(
        self,
        fields: List[Field],
        field_name_to_index: Dict[str, int],
        new_field: Field,
        next_available_field_id: int,
    ) -> int:
        """Apply add field operation with compatibility validation.

        Args:
            fields: List of existing fields to append to
            field_name_to_index: Mapping of field names to indices
            new_field: The field to add (user-specified field_id will be ignored)
            next_available_field_id: The next available field ID to assign

        Returns:
            The next available field ID for subsequent operations
        """
        field_name = new_field.arrow.name

        # Check if field already exists
        if field_name in field_name_to_index:
            raise SchemaCompatibilityError(
                f"Field '{field_name}' already exists in schema",
            )

        # Validate compatibility for new field
        if not self.allow_incompatible_changes:
            self._validate_add_field_compatibility(new_field)

        # For add operations, ignore user-specified field ID and auto-assign
        auto_assigned_field_id = next_available_field_id

        # Create a copy of the field with auto-assigned field ID and correct path
        field_with_auto_id = Field.of(
            new_field.arrow,
            field_id=auto_assigned_field_id,
            is_merge_key=new_field.is_merge_key,
            merge_order=new_field.merge_order,
            is_event_time=new_field.is_event_time,
            doc=new_field.doc,
            past_default=new_field.past_default,
            future_default=new_field.future_default,
            consistency_type=new_field.consistency_type,
            path=[field_name],
            native_object=new_field.native_object,
        )

        # Add the field
        fields.append(field_with_auto_id)
        field_name_to_index[field_name] = len(fields) - 1

        # Return next available field ID
        return next_available_field_id + 1

    def _apply_remove_field(
        self,
        fields: List[Field],
        field_name_to_index: Dict[str, int],
        field_locator: FieldLocator,
    ) -> None:
        """Apply remove field operation with compatibility validation."""
        field_name = self._get_field_name(field_locator)

        # Check if field exists
        if field_name not in field_name_to_index:
            raise SchemaCompatibilityError(
                f"Field '{field_name}' does not exist in schema", field_locator
            )

        # Validate compatibility for field removal
        if not self.allow_incompatible_changes:
            field_index = field_name_to_index[field_name]
            self._validate_remove_field_compatibility(
                fields[field_index], field_locator
            )

        # Remove the field
        field_index = field_name_to_index[field_name]
        fields.pop(field_index)

        # Update indices
        del field_name_to_index[field_name]
        for name, index in field_name_to_index.items():
            if index > field_index:
                field_name_to_index[name] = index - 1

    def _apply_update_field(
        self,
        fields: List[Field],
        field_name_to_index: Dict[str, int],
        field_locator: FieldLocator,
        updated_field: Field,
    ) -> None:
        """Apply update field operation with compatibility validation."""
        field_name = self._get_field_name(field_locator)

        # Check if field exists
        if field_name not in field_name_to_index:
            raise SchemaCompatibilityError(
                f"Field '{field_name}' does not exist in schema", field_locator
            )

        field_index = field_name_to_index[field_name]
        old_field = fields[field_index]

        # Validate compatibility for field update
        if not self.allow_incompatible_changes:
            self._validate_update_field_compatibility(
                old_field, updated_field, field_locator
            )

        # Get the new field name from the updated field
        new_field_name = updated_field.arrow.name

        # Create a copy of the updated field with the correct path
        field_with_path = Field.of(
            updated_field.arrow,
            field_id=updated_field.id,
            is_merge_key=updated_field.is_merge_key,
            merge_order=updated_field.merge_order,
            is_event_time=updated_field.is_event_time,
            doc=updated_field.doc,
            past_default=updated_field.past_default,
            future_default=updated_field.future_default,
            consistency_type=updated_field.consistency_type,
            path=[new_field_name],
            native_object=updated_field.native_object,
        )

        # Update the field
        fields[field_index] = field_with_path

        # If field name changed (rename), update the mapping
        if field_name != new_field_name:
            del field_name_to_index[field_name]
            field_name_to_index[new_field_name] = field_index

    def _get_field_name(self, field_locator: FieldLocator) -> str:
        """Extract field name from various field locator types."""
        if isinstance(field_locator, str):
            return field_locator
        elif isinstance(field_locator, list):
            return field_locator[0] if field_locator else ""
        elif isinstance(field_locator, int):
            # For field ID, try to find the corresponding field
            try:
                field = self.base_schema.field(field_locator)
                return field.path[0] if field.path else f"field_{field_locator}"
            except Exception:
                return f"field_{field_locator}"
        else:
            raise ValueError(f"Invalid field locator type: {type(field_locator)}")

    @staticmethod
    def _field_locators_match(locator1: FieldLocator, locator2: FieldLocator) -> bool:
        """Check if two field locators refer to the same field."""
        # For simplicity, convert both to string names and compare
        # This works because we primarily use field names in our operations
        if isinstance(locator1, str) and isinstance(locator2, str):
            return locator1 == locator2
        elif isinstance(locator1, list) and isinstance(locator2, list):
            return locator1 == locator2
        elif isinstance(locator1, int) and isinstance(locator2, int):
            return locator1 == locator2
        else:
            # Convert to strings and compare (this is a simplified approach)
            str1 = (
                locator1
                if isinstance(locator1, str)
                else (
                    locator1[0]
                    if isinstance(locator1, list) and locator1
                    else str(locator1)
                )
            )
            str2 = (
                locator2
                if isinstance(locator2, str)
                else (
                    locator2[0]
                    if isinstance(locator2, list) and locator2
                    else str(locator2)
                )
            )
            return str1 == str2
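
    # Illustrative sketch: field locators may be a plain name, a nested path
    # list, or a numeric field ID; an integer locator is resolved through the
    # base schema and falls back to a generated "field_<id>" name.
    # >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
    # >>> update = SchemaUpdate.of(schema)
    # >>> update._get_field_name("id")
    # 'id'
    # >>> update._get_field_name(["id"])
    # 'id'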

    def _validate_add_field_compatibility(self, new_field: Field) -> None:
        """Validate that adding a new field won't break compatibility."""
        field_name = new_field.arrow.name
        arrow_field = new_field.arrow

        # Check if field is nullable or has default values
        is_nullable = arrow_field.nullable
        has_past_default = new_field.past_default is not None
        has_future_default = new_field.future_default is not None

        if not (is_nullable or has_past_default or has_future_default):
            raise SchemaCompatibilityError(
                f"Adding non-nullable field '{field_name}' without "
                f"default values would break compatibility with existing data",
            )
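
    # Illustrative sketch (hypothetical field): a non-nullable addition without
    # defaults fails the check above, while the same field declared with a
    # future_default satisfies it.
    # >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
    # >>> bad = Field.of(pa.field("score", pa.int64(), nullable=False))
    # >>> SchemaUpdate.of(schema).add_field(bad).apply()
    # Traceback (most recent call last):
    #     ...
    # SchemaCompatibilityError: Adding non-nullable field 'score' ...
    # >>> ok = Field.of(pa.field("score", pa.int64(), nullable=False), future_default=0)
    # >>> update = SchemaUpdate.of(schema).add_field(ok)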

    def _validate_remove_field_compatibility(
        self, field: Field, field_locator: FieldLocator
    ) -> None:
        """Validate that removing a field won't break compatibility."""
        field_name = self._get_field_name(field_locator)

        # Check for protected field types that should never be removed
        if field.is_merge_key:
            raise SchemaCompatibilityError(
                f"Cannot remove merge key field '{field_name}'. "
                f"Merge keys are critical for data integrity and cannot be removed.",
                field_locator,
            )

        if field.merge_order is not None:
            raise SchemaCompatibilityError(
                f"Cannot remove merge order field '{field_name}'. "
                f"Fields with merge_order are critical for data ordering and cannot be removed.",
                field_locator,
            )

        if field.is_event_time:
            raise SchemaCompatibilityError(
                f"Cannot remove event time field '{field_name}'. "
                f"Event time fields are critical for temporal operations and cannot be removed.",
                field_locator,
            )

        # Removing fields generally breaks compatibility for consumers expecting them
        raise SchemaCompatibilityError(
            f"Removing field '{field_name}' would break compatibility with existing consumers. "
            f"Set allow_incompatible_changes=True to force removal.",
            field_locator,
        )

    def _validate_update_field_compatibility(
        self, old_field: Field, new_field: Field, field_locator: FieldLocator
    ) -> None:
        """Validate that updating a field won't break compatibility."""
        old_arrow = old_field.arrow
        new_arrow = new_field.arrow
        field_name = self._get_field_name(field_locator)

        # Protect critical field attributes that should never be changed
        if old_field.is_merge_key != new_field.is_merge_key:
            raise SchemaCompatibilityError(
                f"Cannot change merge key status for field '{field_name}'. "
                f"Merge key designation is critical for data integrity and cannot be modified.",
                field_locator,
            )

        if old_field.merge_order != new_field.merge_order:
            raise SchemaCompatibilityError(
                f"Cannot change merge order for field '{field_name}'. "
                f"Merge order is critical for data consistency and cannot be modified.",
                field_locator,
            )

        if old_field.is_event_time != new_field.is_event_time:
            raise SchemaCompatibilityError(
                f"Cannot change event time status for field '{field_name}'. "
                f"Event time designation is critical for temporal operations and cannot be modified.",
                field_locator,
            )

        # Validate schema consistency type evolution rules
        self._validate_consistency_type_evolution(old_field, new_field, field_locator)

        # Protect past_default immutability
        if old_field.past_default != new_field.past_default:
            raise SchemaCompatibilityError(
                f"Cannot change past_default for field '{field_name}'. "
                f"The past_default value is immutable once set to maintain data consistency.",
                field_locator,
            )

        # Check for duplicate field IDs (if field ID is being changed)
        if old_field.id != new_field.id and new_field.id is not None:
            existing_field_ids = {
                f.id
                for f in self.base_schema.fields
                if f.id is not None and f != old_field
            }
            if new_field.id in existing_field_ids:
                raise SchemaCompatibilityError(
                    f"Cannot update field '{field_name}' to use duplicate field ID {new_field.id}. "
                    f"Field IDs must be unique across all fields in the schema.",
                    field_locator,
                )

        # Check data type compatibility
        if not self._is_type_compatible(old_arrow.type, new_arrow.type):
            raise SchemaCompatibilityError(
                f"Cannot change field '{field_name}' from {old_arrow.type} to {new_arrow.type}. "
                f"This change would break compatibility with PyArrow, Pandas, Polars, Ray Data, and Daft.",
                field_locator,
            )

        # Check nullability - making a field non-nullable is incompatible
        if old_arrow.nullable and not new_arrow.nullable:
            # Only allow if we have past/future defaults to fill null values
            has_past_default = new_field.past_default is not None
            has_future_default = new_field.future_default is not None

            if not (has_past_default and has_future_default):
                raise SchemaCompatibilityError(
                    f"Cannot make nullable field '{field_name}' non-nullable without "
                    f"providing both past_default and future_default values",
                    field_locator,
                )

    def _validate_consistency_type_evolution(
        self, old_field: Field, new_field: Field, field_locator: FieldLocator
    ) -> None:
        """
        Validate schema consistency type evolution rules.

        Allowed transitions:
        - COERCE -> VALIDATE
        - VALIDATE -> COERCE
        - COERCE -> NONE
        - VALIDATE -> NONE

        Forbidden transitions:
        - NONE -> COERCE
        - NONE -> VALIDATE
        """
        old_type = old_field.consistency_type
        new_type = new_field.consistency_type
        field_name = self._get_field_name(field_locator)

        # If types are the same, no validation needed
        if old_type == new_type:
            return

        # Handle None values (treat as no consistency type set)
        if old_type is None and new_type is None:
            return

        # Allow transitions from any type to NONE (relaxing constraints)
        if new_type == SchemaConsistencyType.NONE or new_type is None:
            return

        # Allow transitions between COERCE and VALIDATE (bidirectional)
        if old_type in (
            SchemaConsistencyType.COERCE,
            SchemaConsistencyType.VALIDATE,
        ) and new_type in (
            SchemaConsistencyType.COERCE,
            SchemaConsistencyType.VALIDATE,
        ):
            return

        # Allow transitions from an unset consistency type (Python None) to
        # COERCE or VALIDATE (adding constraints)
        if old_type is None and new_type in (
            SchemaConsistencyType.COERCE,
            SchemaConsistencyType.VALIDATE,
        ):
            return

        # Forbid transitions from NONE to COERCE or VALIDATE (tightening constraints)
        if old_type == SchemaConsistencyType.NONE and new_type in (
            SchemaConsistencyType.COERCE,
            SchemaConsistencyType.VALIDATE,
        ):
            raise SchemaCompatibilityError(
                f"Cannot change consistency type for field '{field_name}' from {old_type.value} to {new_type.value}. "
                f"Transitioning from NONE to {new_type.value} would tighten validation constraints "
                f"and potentially break existing data processing.",
                field_locator,
            )

        # If we get here, it's an unexpected combination
        raise SchemaCompatibilityError(
            f"Invalid consistency type transition for field '{field_name}' from "
            f"{old_type.value if old_type else 'None'} to {new_type.value if new_type else 'None'}.",
            field_locator,
        )
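
    # Illustrative summary of the transition rules enforced above ("unset" means
    # the field has no consistency type configured, i.e. Python None):
    #   COERCE  <-> VALIDATE            allowed
    #   COERCE / VALIDATE -> NONE       allowed (relaxing constraints)
    #   unset -> COERCE / VALIDATE      allowed (adding constraints)
    #   NONE  -> COERCE / VALIDATE      rejected (tightening constraints)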

    def _is_type_compatible(self, old_type: pa.DataType, new_type: pa.DataType) -> bool:
        """
        Check if changing from old_type to new_type is backward compatible.

        Compatible changes include:
        - Same type
        - Widening numeric types (int32 -> int64, float32 -> float64)
        - Making string/binary types longer
        - Adding fields to struct types
        - Making list/map value types more permissive
        """
        # Same type is always compatible
        if old_type.equals(new_type):
            return True

        # Numeric type widening
        if pa.types.is_integer(old_type) and pa.types.is_integer(new_type):
            # Check bit width and signedness using string representation
            old_signed = "int" in str(old_type) and "uint" not in str(old_type)
            new_signed = "int" in str(new_type) and "uint" not in str(new_type)
            return new_type.bit_width >= old_type.bit_width and old_signed == new_signed

        if pa.types.is_floating(old_type) and pa.types.is_floating(new_type):
            return new_type.bit_width >= old_type.bit_width

        # Integer to float promotion
        if pa.types.is_integer(old_type) and pa.types.is_floating(new_type):
            return True

        # String/binary type compatibility
        if pa.types.is_string(old_type) and pa.types.is_string(new_type):
            return True
        if pa.types.is_binary(old_type) and pa.types.is_binary(new_type):
            return True

        # Struct type compatibility (new fields can be added)
        if pa.types.is_struct(old_type) and pa.types.is_struct(new_type):
            old_names = {field.name for field in old_type}
            new_names = {field.name for field in new_type}

            # All old fields must exist in new type
            if not old_names.issubset(new_names):
                return False

            # Check compatibility of common fields
            for old_field in old_type:
                new_field = new_type.field(old_field.name)
                if not self._is_type_compatible(old_field.type, new_field.type):
                    return False

            return True

        # List type compatibility
        if pa.types.is_list(old_type) and pa.types.is_list(new_type):
            return self._is_type_compatible(old_type.value_type, new_type.value_type)

        # Map type compatibility
        if pa.types.is_map(old_type) and pa.types.is_map(new_type):
            return self._is_type_compatible(
                old_type.key_type, new_type.key_type
            ) and self._is_type_compatible(old_type.item_type, new_type.item_type)

        # Default: types are incompatible
        return False
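
    # Illustrative sketch: the compatibility matrix above expressed as plain
    # PyArrow type checks (hypothetical standalone calls).
    # >>> schema = Schema.of([Field.of(pa.field("id", pa.int64()))])
    # >>> update = SchemaUpdate.of(schema)
    # >>> update._is_type_compatible(pa.int32(), pa.int64())    # widening
    # True
    # >>> update._is_type_compatible(pa.int32(), pa.float64())  # int -> float
    # True
    # >>> update._is_type_compatible(pa.int64(), pa.int32())    # narrowing
    # False
    # >>> update._is_type_compatible(pa.string(), pa.int64())
    # False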