deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/storage/model/test_schema_update.py (new file)

@@ -0,0 +1,1925 @@

```python
"""
Tests for SchemaUpdate functionality.

Note: These tests are in a separate file from test_schema.py to avoid test contamination issues.
Some tests in test_schema.py appear to modify global state that affects SchemaUpdate tests
when run together. Running these tests in isolation ensures they pass consistently.

To run both test suites together successfully, run SchemaUpdate tests first:
    pytest test_schema_update.py test_schema.py
"""

import pytest
import pyarrow as pa

from deltacat.storage.model.schema import (
    Schema,
    Field,
    SchemaUpdate,
    MAX_FIELD_ID_EXCLUSIVE,
)
from deltacat.storage.model.types import SchemaConsistencyType, SortOrder
from deltacat.storage.model.schema import MergeOrder
from deltacat.exceptions import SchemaCompatibilityError


@pytest.fixture(scope="function")
def base_schema():
    """Simple base schema for testing SchemaUpdate operations."""
    return Schema.of(
        [
            Field.of(
                pa.field("id", pa.int64(), nullable=False),
                field_id=1,
                is_merge_key=True,
            ),
            Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
            Field.of(pa.field("age", pa.int32(), nullable=True), field_id=3),
        ]
    )


@pytest.fixture(scope="function")
def complex_schema():
    """More complex schema for advanced testing."""
    return Schema.of(
        [
            Field.of(
                pa.field("user_id", pa.int64(), nullable=False),
                field_id=1,
                is_merge_key=True,
            ),
            Field.of(pa.field("email", pa.string(), nullable=False), field_id=2),
            Field.of(pa.field("score", pa.float32(), nullable=True), field_id=3),
            Field.of(
                pa.field(
                    "metadata",
                    pa.struct(
                        [
                            pa.field("created_at", pa.timestamp("us")),
                            pa.field("tags", pa.list_(pa.string())),
                        ]
                    ),
                    nullable=True,
                ),
                field_id=4,
            ),
        ]
    )


@pytest.fixture(scope="function")
def protected_fields_schema():
    """Schema with protected fields for testing field protection rules."""
    return Schema.of(
        [
            Field.of(
                pa.field("id", pa.int64(), nullable=False),
                field_id=1,
                is_merge_key=True,
            ),
            Field.of(
                pa.field("timestamp", pa.int64(), nullable=False),
                field_id=2,
                is_event_time=True,
            ),  # Use int64 for event time
            Field.of(
                pa.field("priority", pa.int32(), nullable=True),
                field_id=3,
                merge_order=MergeOrder.of(SortOrder.ASCENDING),
            ),
            Field.of(
                pa.field("data", pa.string(), nullable=True),
                field_id=4,
                past_default="default",
                consistency_type=SchemaConsistencyType.COERCE,
            ),
        ]
    )


class TestSchemaUpdate:
    """Comprehensive tests for SchemaUpdate class."""

    def test_init(self, base_schema):
        """Test SchemaUpdate initialization."""
        update = SchemaUpdate.of(base_schema)
        assert update.base_schema == base_schema
        assert not update.allow_incompatible_changes
        assert len(update.operations) == 0

        update_permissive = SchemaUpdate.of(
            base_schema, allow_incompatible_changes=True
        )
        assert update_permissive.allow_incompatible_changes

    def test_add_field_success(self, base_schema):
        """Test successfully adding a new nullable field."""
        new_field = Field.of(pa.field("email", pa.string(), nullable=True), field_id=4)

        update = SchemaUpdate.of(base_schema)
        result_schema = update.add_field(new_field).apply()

        assert len(result_schema.fields) == 4
        # Verify the field was added with correct properties
        added_field = result_schema.field("email")
        assert added_field.arrow.name == "email"
        assert added_field.arrow.type == pa.string()
        assert added_field.arrow.nullable is True
        assert added_field.id == 4
        assert result_schema.field("id") == base_schema.field(
            "id"
        )  # Original fields preserved

    def test_add_field_with_past_default(self, base_schema):
        """Test adding field with past_default is allowed."""
        new_field = Field.of(
            pa.field("status", pa.string(), nullable=False),
            field_id=4,
            past_default="active",
        )

        update = SchemaUpdate.of(base_schema)
        result_schema = update.add_field(new_field).apply()

        assert len(result_schema.fields) == 4
        # Verify the field was added with correct properties
        added_field = result_schema.field("status")
        assert added_field.arrow.name == "status"
        assert added_field.arrow.type == pa.string()
        assert added_field.arrow.nullable is False
        assert added_field.id == 4
        assert added_field.past_default == "active"

    def test_add_field_with_future_default(self, base_schema):
        """Test adding field with future_default is allowed."""
        new_field = Field.of(
            pa.field("priority", pa.int32(), nullable=False),
            field_id=4,
            future_default=1,
        )

        update = SchemaUpdate.of(base_schema)
        result_schema = update.add_field(new_field).apply()

        assert len(result_schema.fields) == 4
        # Verify the field was added with correct properties
        added_field = result_schema.field("priority")
        assert added_field.arrow.name == "priority"
        assert added_field.arrow.type == pa.int32()
        assert added_field.arrow.nullable is False
        assert added_field.id == 4
        assert added_field.future_default == 1

    def test_add_field_non_nullable_without_defaults_fails(self, base_schema):
        """Test that adding non-nullable field without defaults fails."""
        new_field = Field.of(
            pa.field("required_field", pa.string(), nullable=False), field_id=4
        )

        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.add_field(new_field).apply()

        assert "non-nullable field" in str(exc_info.value)
        assert "without default values" in str(exc_info.value)

    def test_add_field_non_nullable_allowed_with_flag(self, base_schema):
        """Test adding non-nullable field succeeds with allow_incompatible_changes=True."""
        new_field = Field.of(
            pa.field("required_field", pa.string(), nullable=False), field_id=4
        )

        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
        result_schema = update.add_field(new_field).apply()

        assert len(result_schema.fields) == 4
        # Verify the field was added with correct properties
        added_field = result_schema.field("required_field")
        assert added_field.arrow.name == "required_field"
        assert added_field.arrow.type == pa.string()
        assert added_field.arrow.nullable is False
        assert added_field.id == 4

    def test_add_existing_field_fails(self, base_schema):
        """Test that adding a field that already exists fails."""
        duplicate_field = Field.of(pa.field("name", pa.string()), field_id=5)

        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.add_field(duplicate_field).apply()

        assert "already exists" in str(exc_info.value)

    def test_add_field_id_overflow_raises_error(self):
        """Adding a field when max_field_id is MAX-1 should overflow and error.

        Base schema has field IDs at 0 and MAX_FIELD_ID_EXCLUSIVE - 1. Adding a
        new field should attempt to auto-assign the next ID which overflows back
        to 0, causing a duplicate ID error.
        """
        base = Schema.of(
            [
                Field.of(
                    pa.field("id_max_minus_one", pa.int64(), nullable=True),
                    field_id=MAX_FIELD_ID_EXCLUSIVE - 1,
                ),
            ]
        )

        # Add a new nullable field (compatibility-wise OK). The ID is ignored
        # and will be auto-assigned, which should overflow and raise ValueError.
        update = SchemaUpdate.of(base)
        new_field = Field.of(pa.field("overflow", pa.int64(), nullable=True))

        with pytest.raises(SchemaCompatibilityError):
            update.add_field(new_field).apply()

    def test_remove_field_fails_by_default(self, base_schema):
        """Test that removing fields fails by default for compatibility."""
        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.remove_field("age").apply()

        assert "would break compatibility" in str(exc_info.value)
        assert "allow_incompatible_changes=True" in str(exc_info.value)
        assert exc_info.value.field_locator == "age"

    def test_remove_field_succeeds_with_flag(self, base_schema):
        """Test removing field succeeds with allow_incompatible_changes=True."""
        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
        result_schema = update.remove_field("age").apply()

        assert len(result_schema.fields) == 2
        field_names = [f.path[0] for f in result_schema.fields if f.path]
        assert "age" not in field_names
        assert "id" in field_names
        assert "name" in field_names

    def test_remove_nonexistent_field_fails(self, base_schema):
        """Test removing a field that doesn't exist fails."""
        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.remove_field("nonexistent").apply()

        assert "does not exist" in str(exc_info.value)
        assert exc_info.value.field_locator == "nonexistent"

    def test_update_field_compatible_type_widening(self, base_schema):
        """Test updating field with compatible type widening (int32 -> int64)."""
        update = SchemaUpdate.of(base_schema)
        result_schema = update.update_field_type("age", pa.int64()).apply()

        updated_age_field = result_schema.field("age")
        assert updated_age_field.arrow.type == pa.int64()
        assert updated_age_field.arrow.name == "age"
        assert updated_age_field.id == 3

    def test_update_field_compatible_nullability_change(self, base_schema):
        """Test making nullable field non-nullable fails without defaults."""
        # This should fail because we're making a nullable field non-nullable without defaults
        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.update_field_nullability("name", False).apply()

        assert "non-nullable without" in str(exc_info.value)
        assert "past_default and future_default" in str(exc_info.value)

    def test_update_field_incompatible_nullability_fails(self, base_schema):
        """Test making nullable field non-nullable fails without defaults."""
        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.update_field_nullability("name", False).apply()

        assert "non-nullable without" in str(exc_info.value)
        assert "past_default and future_default" in str(exc_info.value)

    def test_update_field_incompatible_type_fails(self, base_schema):
        """Test updating field with incompatible type change fails."""
        # int32 -> string is incompatible
        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.update_field_type("age", pa.string()).apply()

        assert "would break compatibility" in str(exc_info.value)
        assert "PyArrow, Pandas, Polars, Ray Data, and Daft" in str(exc_info.value)

    def test_update_field_incompatible_allowed_with_flag(self, base_schema):
        """Test incompatible field update succeeds with allow_incompatible_changes=True."""
        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
        result_schema = update.update_field_type("age", pa.string()).apply()

        updated_age_field = result_schema.field("age")
        assert updated_age_field.arrow.type == pa.string()
        assert updated_age_field.arrow.name == "age"
        assert updated_age_field.id == 3

    def test_update_nonexistent_field_fails(self, base_schema):
        """Test updating a field that doesn't exist fails."""
        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.update_field_type("nonexistent", pa.string()).apply()

        assert "does not exist" in str(exc_info.value)

    def test_method_chaining(self, base_schema):
        """Test that SchemaUpdate methods support fluent chaining."""
        new_field1 = Field.of(pa.field("email", pa.string(), nullable=True), field_id=4)
        new_field2 = Field.of(
            pa.field("score", pa.float64(), nullable=True), field_id=5
        )

        result_schema = (
            SchemaUpdate.of(base_schema)
            .add_field(new_field1)
            .add_field(new_field2)
            .update_field_type("age", pa.int64())
            .apply()
        )

        assert len(result_schema.fields) == 5

        # Verify email field
        email_field = result_schema.field("email")
        assert email_field.arrow.name == "email"
        assert email_field.arrow.type == pa.string()
        assert email_field.id == 4

        # Verify score field
        score_field = result_schema.field("score")
        assert score_field.arrow.name == "score"
        assert score_field.arrow.type == pa.float64()
        assert score_field.id == 5

        # Verify updated age field
        age_field = result_schema.field("age")
        assert age_field.arrow.type == pa.int64()
        assert age_field.arrow.name == "age"
        assert age_field.id == 3

    def test_complex_struct_field_operations(self, complex_schema):
        """Test operations on schemas with complex struct fields."""
        # Add a new nested struct field
        new_struct_field = Field.of(
            pa.field(
                "preferences",
                pa.struct(
                    [
                        pa.field("theme", pa.string()),
                        pa.field("notifications", pa.bool_()),
                    ]
                ),
                nullable=True,
            ),
            field_id=5,
        )

        update = SchemaUpdate.of(complex_schema)
        result_schema = update.add_field(new_struct_field).apply()

        assert len(result_schema.fields) == 5
        # Verify the struct field was added correctly
        prefs_field = result_schema.field("preferences")
        assert prefs_field.arrow.name == "preferences"
        assert prefs_field.id == 5
        assert pa.types.is_struct(prefs_field.arrow.type)

    def test_field_locator_types(self, base_schema):
        """Test different types of field locators (string, list, int)."""
        new_field = Field.of(pa.field("test", pa.string(), nullable=True), field_id=4)

        # Test string locator
        update1 = SchemaUpdate.of(base_schema)
        result1 = update1.add_field(new_field).apply()
        assert len(result1.fields) == 4

        # Test list locator (nested field path)
        update2 = SchemaUpdate.of(base_schema)
        result2 = update2.add_field(new_field).apply()
        assert len(result2.fields) == 4

        # Test int locator for updates (using existing field ID)
        update3 = SchemaUpdate.of(base_schema)
        result3 = update3.update_field_type(3, pa.int64()).apply()  # Update by field ID
        assert result3.field("age").arrow.type == pa.int64()

    def test_type_compatibility_validation(self):
        """Test the _is_type_compatible method with various type combinations."""
        base_schema_simple = Schema.of(
            [Field.of(pa.field("test", pa.int32()), field_id=1)]
        )
        update = SchemaUpdate.of(base_schema_simple)

        # Test numeric widening (compatible)
        assert update._is_type_compatible(pa.int32(), pa.int64())
        assert update._is_type_compatible(pa.float32(), pa.float64())
        assert update._is_type_compatible(pa.int32(), pa.float64())

        # Test incompatible changes
        assert not update._is_type_compatible(pa.int64(), pa.int32())  # narrowing
        assert not update._is_type_compatible(
            pa.string(), pa.int32()
        )  # different types
        assert not update._is_type_compatible(
            pa.float64(), pa.string()
        )  # different types

        # Test string/binary compatibility
        assert update._is_type_compatible(pa.string(), pa.string())
        assert update._is_type_compatible(pa.binary(), pa.binary())

        # Test struct compatibility
        old_struct = pa.struct([pa.field("a", pa.int32())])
        new_struct_compatible = pa.struct(
            [pa.field("a", pa.int32()), pa.field("b", pa.string())]
        )
        new_struct_incompatible = pa.struct(
            [pa.field("b", pa.string())]
        )  # missing field "a"

        assert update._is_type_compatible(old_struct, new_struct_compatible)
        assert not update._is_type_compatible(old_struct, new_struct_incompatible)

        # Test list compatibility
        assert update._is_type_compatible(pa.list_(pa.int32()), pa.list_(pa.int64()))
        assert not update._is_type_compatible(
            pa.list_(pa.int64()), pa.list_(pa.int32())
        )

    def test_error_field_locator_attribute(self, base_schema):
        """Test that SchemaCompatibilityError includes field_locator."""
        new_field = Field.of(
            pa.field("required", pa.string(), nullable=False), field_id=4
        )

        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.add_field(new_field).apply()

        assert "Adding non-nullable field" in str(exc_info.value)

    def test_operations_applied_in_order(self, base_schema):
        """Test that operations are applied in the order they were added."""
        original_field_id = base_schema.field_id("name")
        result_schema = (
            base_schema.update()
            .update_field_doc("name", "first name")
            .update_field_doc("name", "last name")
            .update_field_doc("name", "middle name")
            .update_field_doc("name", "full name")
            .apply()
        )
        # Verify that the result reflects the last rename operation
        actual_field = result_schema.field(original_field_id)
        assert actual_field.arrow.name == "name"
        assert actual_field.doc == "full name"
        assert actual_field.arrow.type == pa.string()
        assert actual_field.id == original_field_id

    def test_duplicate_field_id_validation_add_field(self, base_schema):
        """Test that adding a field with duplicate field ID succeeds because IDs are auto-assigned.

        Note: This test behavior changed after implementing automatic field ID assignment.
        User-specified field IDs are now ignored to prevent conflicts.
        """
        # Try to add a field with ID 2, which already exists for 'name' field
        # This should succeed because the user-specified ID will be ignored
        duplicate_id_field = Field.of(
            pa.field("new_field", pa.string(), nullable=True), field_id=2
        )

        update = SchemaUpdate.of(base_schema)
        result_schema = update.add_field(duplicate_id_field).apply()

        # Field should be added with auto-assigned ID (4), not the conflicting ID (2)
        new_field = result_schema.field("new_field")
        assert new_field.id == 4  # Auto-assigned, not 2
        assert new_field.arrow.name == "new_field"

        # Original field with ID 2 should be unchanged
        assert result_schema.field("name").id == 2

    def test_duplicate_field_id_validation_update_field(self, base_schema):
        """Test that updating a field to use duplicate field ID fails."""
        # Try to update 'age' field to use ID 1, which already exists for 'id' field
        updated_field = Field.of(pa.field("age", pa.int32(), nullable=True), field_id=1)

        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update._update_field("age", updated_field).apply()

        assert "duplicate field ID 1" in str(exc_info.value)
        assert exc_info.value.field_locator == "age"

    def test_cannot_remove_all_fields(self, base_schema):
        """Test that removing all fields fails."""

        update = SchemaUpdate.of(base_schema, True)
        with pytest.raises(ValueError) as exc_info:
            update.remove_field("name").remove_field("age").remove_field("id").apply()

        assert "Schema must contain at least one field." in str(exc_info.value)

    def test_cannot_remove_merge_key_field(self, protected_fields_schema):
        """Test that removing merge key fields is forbidden."""
        update = SchemaUpdate.of(protected_fields_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.remove_field("id").apply()

        assert "Cannot remove merge key field" in str(exc_info.value)
        assert "critical for data integrity" in str(exc_info.value)

    def test_cannot_remove_event_time_field(self, protected_fields_schema):
        """Test that removing event time fields is forbidden."""
        update = SchemaUpdate.of(protected_fields_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.remove_field("timestamp").apply()

        assert "Cannot remove event time field" in str(exc_info.value)
        assert "critical for temporal operations" in str(exc_info.value)

    def test_cannot_remove_merge_order_field(self, protected_fields_schema):
        """Test that removing merge order fields is forbidden."""
        update = SchemaUpdate.of(protected_fields_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.remove_field("priority").apply()

        assert "Cannot remove merge order field" in str(exc_info.value)
        assert "critical for data ordering" in str(exc_info.value)

    def test_cannot_change_merge_key_status(self, protected_fields_schema):
        """Test that changing merge key status is forbidden."""
        # Try to make merge key field not a merge key
        updated_field = Field.of(
            pa.field("id", pa.int64(), nullable=False), field_id=1, is_merge_key=False
        )

        update = SchemaUpdate.of(protected_fields_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update._update_field("id", updated_field).apply()

        assert "Cannot change merge key status" in str(exc_info.value)
        assert "critical for data integrity" in str(exc_info.value)

    def test_cannot_change_event_time_status(self, protected_fields_schema):
        """Test that changing event time status is forbidden."""
        # Try to make event time field not an event time field
        updated_field = Field.of(
            pa.field("timestamp", pa.timestamp("us"), nullable=False),
            field_id=2,
            is_event_time=False,
        )

        update = SchemaUpdate.of(protected_fields_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update._update_field("timestamp", updated_field).apply()

        assert "Cannot change event time status" in str(exc_info.value)
        assert "critical for temporal operations" in str(exc_info.value)

    def test_cannot_change_merge_order(self, protected_fields_schema):
        """Test that changing merge order is forbidden."""
        # Try to change merge order from ASCENDING to DESCENDING
        updated_field = Field.of(
            pa.field("priority", pa.int32(), nullable=True),
            field_id=3,
            merge_order=MergeOrder.of(SortOrder.DESCENDING),
        )

        update = SchemaUpdate.of(protected_fields_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update._update_field("priority", updated_field).apply()

        assert "Cannot change merge order" in str(exc_info.value)
        assert "critical for data consistency" in str(exc_info.value)

    def test_cannot_change_past_default(self, protected_fields_schema):
        """Test that changing past_default is forbidden."""
        # Try to change past_default from "default" to "new_default"
        updated_field = Field.of(
            pa.field("data", pa.string(), nullable=True),
            field_id=4,
            past_default="new_default",
            consistency_type=SchemaConsistencyType.COERCE,
        )

        update = SchemaUpdate.of(protected_fields_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update._update_field("data", updated_field).apply()

        assert "Cannot change past_default" in str(exc_info.value)
        assert "immutable once set" in str(exc_info.value)

    def test_consistency_type_evolution_coerce_to_validate(self, base_schema):
        """Test allowed transition from COERCE to VALIDATE."""
        # First add a field with COERCE consistency type
        coerce_field = Field.of(
            pa.field("test", pa.string(), nullable=True),
            field_id=4,
            consistency_type=SchemaConsistencyType.COERCE,
        )
        schema_with_coerce = (
            SchemaUpdate.of(base_schema).add_field(coerce_field).apply()
        )

        # Now update it to VALIDATE - this should be allowed
        update = SchemaUpdate.of(schema_with_coerce)
        result_schema = update.update_field_consistency_type(
            "test", SchemaConsistencyType.VALIDATE
        ).apply()

        updated_field = result_schema.field("test")
        assert updated_field.consistency_type == SchemaConsistencyType.VALIDATE

    def test_consistency_type_evolution_validate_to_coerce(self, base_schema):
        """Test allowed transition from VALIDATE to COERCE."""
        # First add a field with VALIDATE consistency type
        validate_field = Field.of(
            pa.field("test", pa.string(), nullable=True),
            field_id=4,
            consistency_type=SchemaConsistencyType.VALIDATE,
        )
        schema_with_validate = (
            SchemaUpdate.of(base_schema).add_field(validate_field).apply()
        )

        # Now update it to COERCE - this should be allowed
        update = SchemaUpdate.of(schema_with_validate)
        result_schema = update.update_field_consistency_type(
            "test", SchemaConsistencyType.COERCE
        ).apply()

        updated_field = result_schema.field("test")
        assert updated_field.consistency_type == SchemaConsistencyType.COERCE

    def test_consistency_type_evolution_to_none_allowed(self, base_schema):
        """Test allowed transition from COERCE/VALIDATE to NONE."""
        # First add a field with COERCE consistency type
        coerce_field = Field.of(
            pa.field("test", pa.string(), nullable=True),
            field_id=4,
            consistency_type=SchemaConsistencyType.COERCE,
        )
        schema_with_coerce = (
            SchemaUpdate.of(base_schema).add_field(coerce_field).apply()
        )

        # Now update it to NONE - this should be allowed (relaxing constraints)
        update = SchemaUpdate.of(schema_with_coerce)
        result_schema = update.update_field_consistency_type(
            "test", SchemaConsistencyType.NONE
        ).apply()

        updated_field = result_schema.field("test")
        assert updated_field.consistency_type == SchemaConsistencyType.NONE

    def test_consistency_type_evolution_none_to_coerce_forbidden(self, base_schema):
        """Test forbidden transition from NONE to COERCE."""
        # First add a field with NONE consistency type
        none_field = Field.of(
            pa.field("test", pa.string(), nullable=True),
            field_id=4,
            consistency_type=SchemaConsistencyType.NONE,
        )
        schema_with_none = SchemaUpdate.of(base_schema).add_field(none_field).apply()

        # Now try to update it to COERCE - this should fail
        update = SchemaUpdate.of(schema_with_none)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.update_field_consistency_type(
                "test", SchemaConsistencyType.COERCE
            ).apply()

        assert "Cannot change consistency type" in str(exc_info.value)
        assert "from none to coerce" in str(exc_info.value)
        assert "tighten validation constraints" in str(exc_info.value)

    def test_consistency_type_evolution_none_to_validate_forbidden(self, base_schema):
        """Test forbidden transition from NONE to VALIDATE."""
        # First add a field with NONE consistency type
        none_field = Field.of(
            pa.field("test", pa.string(), nullable=True),
            field_id=4,
            consistency_type=SchemaConsistencyType.NONE,
        )
        schema_with_none = SchemaUpdate.of(base_schema).add_field(none_field).apply()

        # Now try to update it to VALIDATE - this should fail
        update = SchemaUpdate.of(schema_with_none)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.update_field_consistency_type(
                "test", SchemaConsistencyType.VALIDATE
            ).apply()

        assert "Cannot change consistency type" in str(exc_info.value)
        assert "from none to validate" in str(exc_info.value)
        assert "tighten validation constraints" in str(exc_info.value)

    def test_protected_fields_allowed_with_incompatible_flag(
        self, protected_fields_schema
    ):
        """Test that protected field changes are allowed with allow_incompatible_changes=True."""
        # Should be able to remove merge key field with the flag
        update = SchemaUpdate.of(
            protected_fields_schema, allow_incompatible_changes=True
        )
        result_schema = update.remove_field("id").apply()

        assert len(result_schema.fields) == 3
        field_names = [f.path[0] for f in result_schema.fields if f.path]
        assert "id" not in field_names

    def test_duplicate_field_id_still_forbidden_with_flag(self, base_schema):
        """Test that duplicate field IDs are prevented through auto-assignment even with allow_incompatible_changes=True.

        Note: This test behavior changed after implementing automatic field ID assignment.
        Duplicate field ID conflicts can no longer occur because IDs are auto-assigned.
        """

        test_schema = Schema.of(
            [
                Field.of(
                    pa.field("foo", pa.int64(), nullable=False),
                    field_id=1,
                    is_merge_key=True,
                ),
                Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
                Field.of(pa.field("age", pa.int32(), nullable=True), field_id=3),
            ]
        )
        # User specifies conflicting field ID, but it will be ignored and auto-assigned
        duplicate_id_field = Field.of(
            pa.field("new_field", pa.string(), nullable=True), field_id=1
        )

        update = SchemaUpdate.of(test_schema, allow_incompatible_changes=True)
        result_schema = update.add_field(duplicate_id_field).apply()

        # Field should be added with auto-assigned ID (4), not the conflicting ID (1)
        new_field = result_schema.field("new_field")
        assert new_field.id == 4  # Auto-assigned, not 1
        assert new_field.arrow.name == "new_field"

        # Original field with ID 1 should be unchanged
        assert result_schema.field("foo").id == 1

        # Verify no duplicate field IDs in final schema
        field_ids = [field.id for field in result_schema.fields]
        assert len(field_ids) == len(
            set(field_ids)
        ), f"Duplicate field IDs found: {field_ids}"

    def test_rename_field_success(self, base_schema):
        """Test successfully renaming a field."""
        update = SchemaUpdate.of(base_schema)
        result_schema = update.rename_field("name", "full_name").apply()

        # Original field should be gone
        field_names = [f.path[0] for f in result_schema.fields if f.path]
        assert "name" not in field_names
        assert "full_name" in field_names

        # Renamed field should have same properties except name
        original_field = base_schema.field("name")
        renamed_field = result_schema.field("full_name")

        assert renamed_field.arrow.name == "full_name"
        assert renamed_field.arrow.type == original_field.arrow.type
        assert renamed_field.arrow.nullable == original_field.arrow.nullable
        assert renamed_field.id == original_field.id
        assert renamed_field.doc == original_field.doc
        assert renamed_field.consistency_type == original_field.consistency_type

    def test_rename_field_nonexistent_field_fails(self, base_schema):
        """Test renaming a field that doesn't exist fails."""
        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.rename_field("nonexistent", "new_name").apply()

        assert "does not exist" in str(exc_info.value)
        assert exc_info.value.field_locator == "nonexistent"

    def test_update_field_type_success(self, base_schema):
        """Test successfully updating field type."""
        update = SchemaUpdate.of(base_schema)
        result_schema = update.update_field_type("age", pa.int64()).apply()

        # Field should have new type but same other properties
        original_field = base_schema.field("age")
        updated_field = result_schema.field("age")

        assert updated_field.arrow.type == pa.int64()
        assert updated_field.arrow.name == original_field.arrow.name
        assert updated_field.arrow.nullable == original_field.arrow.nullable
        assert updated_field.id == original_field.id
        assert updated_field.doc == original_field.doc
        assert updated_field.consistency_type == original_field.consistency_type

    def test_update_field_type_incompatible_fails(self, base_schema):
        """Test updating field type with incompatible type fails."""
        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.update_field_type("age", pa.string()).apply()

        assert "would break compatibility" in str(exc_info.value)

    def test_update_field_type_incompatible_allowed_with_flag(self, base_schema):
        """Test incompatible type update succeeds with allow_incompatible_changes=True."""
        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
        result_schema = update.update_field_type("age", pa.string()).apply()

        updated_field = result_schema.field("age")
        assert updated_field.arrow.type == pa.string()

    def test_update_field_type_nonexistent_field_fails(self, base_schema):
        """Test updating type of a field that doesn't exist fails."""
        update = SchemaUpdate.of(base_schema)
        with pytest.raises(SchemaCompatibilityError) as exc_info:
            update.update_field_type("nonexistent", pa.string()).apply()

        assert "does not exist" in str(exc_info.value)
        assert exc_info.value.field_locator == "nonexistent"

    def test_update_field_doc_success(self, base_schema):
        """Test successfully updating field documentation."""
        update = SchemaUpdate.of(base_schema)
        result_schema = update.update_field_doc(
            "name", "Full name of the person"
        ).apply()

        # Field should have new doc but same other properties
        original_field = base_schema.field("name")
        updated_field = result_schema.field("name")

        assert updated_field.doc == "Full name of the person"
        assert updated_field.arrow.name == original_field.arrow.name
        assert updated_field.arrow.type == original_field.arrow.type
        assert updated_field.arrow.nullable == original_field.arrow.nullable
        assert updated_field.id == original_field.id
        assert updated_field.consistency_type == original_field.consistency_type

    def test_update_field_doc_to_none(self, base_schema):
```
|
862
|
+
"""Test updating field documentation to None."""
|
863
|
+
# First set some doc
|
864
|
+
schema_with_doc = (
|
865
|
+
SchemaUpdate.of(base_schema)
|
866
|
+
.update_field_doc("name", "Original doc")
|
867
|
+
.apply()
|
868
|
+
)
|
869
|
+
|
870
|
+
# Then update to None
|
871
|
+
update = SchemaUpdate.of(schema_with_doc)
|
872
|
+
result_schema = update.update_field_doc("name", None).apply()
|
873
|
+
|
874
|
+
updated_field = result_schema.field("name")
|
875
|
+
assert updated_field.doc is None
|
876
|
+
|
877
|
+
def test_update_field_doc_nonexistent_field_fails(self, base_schema):
|
878
|
+
"""Test updating doc of a field that doesn't exist fails."""
|
879
|
+
update = SchemaUpdate.of(base_schema)
|
880
|
+
with pytest.raises(SchemaCompatibilityError) as exc_info:
|
881
|
+
update.update_field_doc("nonexistent", "Some doc").apply()
|
882
|
+
|
883
|
+
assert "does not exist" in str(exc_info.value)
|
884
|
+
assert exc_info.value.field_locator == "nonexistent"
|
885
|
+
|
886
|
+
def test_update_field_nullability_success(self, base_schema):
|
887
|
+
"""Test successfully updating field nullability."""
|
888
|
+
# Make nullable field non-nullable (should succeed with allow_incompatible_changes)
|
889
|
+
update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
|
890
|
+
result_schema = update.update_field_nullability("name", False).apply()
|
891
|
+
|
892
|
+
# Field should have new nullability but same other properties
|
893
|
+
original_field = base_schema.field("name")
|
894
|
+
updated_field = result_schema.field("name")
|
895
|
+
|
896
|
+
assert updated_field.arrow.nullable is False
|
897
|
+
assert updated_field.arrow.name == original_field.arrow.name
|
898
|
+
assert updated_field.arrow.type == original_field.arrow.type
|
899
|
+
assert updated_field.id == original_field.id
|
900
|
+
assert updated_field.doc == original_field.doc
|
901
|
+
assert updated_field.consistency_type == original_field.consistency_type
|
902
|
+
|
903
|
+
def test_update_field_nullability_incompatible_fails(self, base_schema):
|
904
|
+
"""Test making nullable field non-nullable fails without flag."""
|
905
|
+
update = SchemaUpdate.of(base_schema)
|
906
|
+
with pytest.raises(SchemaCompatibilityError) as exc_info:
|
907
|
+
update.update_field_nullability("name", False).apply()
|
908
|
+
|
909
|
+
assert "non-nullable without" in str(exc_info.value)
|
910
|
+
|
911
|
+
def test_update_field_nullability_nonexistent_field_fails(self, base_schema):
|
912
|
+
"""Test updating nullability of a field that doesn't exist fails."""
|
913
|
+
update = SchemaUpdate.of(base_schema)
|
914
|
+
with pytest.raises(SchemaCompatibilityError) as exc_info:
|
915
|
+
update.update_field_nullability("nonexistent", True).apply()
|
916
|
+
|
917
|
+
assert "does not exist" in str(exc_info.value)
|
918
|
+
assert exc_info.value.field_locator == "nonexistent"
|
919
|
+
|
920
|
+
def test_update_field_consistency_type_success(self, base_schema):
|
921
|
+
"""Test successfully updating field consistency type."""
|
922
|
+
update = SchemaUpdate.of(base_schema)
|
923
|
+
result_schema = update.update_field_consistency_type(
|
924
|
+
"name", SchemaConsistencyType.VALIDATE
|
925
|
+
).apply()
|
926
|
+
|
927
|
+
# Field should have new consistency type but same other properties
|
928
|
+
original_field = base_schema.field("name")
|
929
|
+
updated_field = result_schema.field("name")
|
930
|
+
|
931
|
+
assert updated_field.consistency_type == SchemaConsistencyType.VALIDATE
|
932
|
+
assert updated_field.arrow.name == original_field.arrow.name
|
933
|
+
assert updated_field.arrow.type == original_field.arrow.type
|
934
|
+
assert updated_field.arrow.nullable == original_field.arrow.nullable
|
935
|
+
assert updated_field.id == original_field.id
|
936
|
+
assert updated_field.doc == original_field.doc
|
937
|
+
|
938
|
+
def test_update_field_consistency_type_to_none(self, base_schema):
|
939
|
+
"""Test updating field consistency type to None."""
|
940
|
+
# First set some consistency type
|
941
|
+
schema_with_coerce = (
|
942
|
+
SchemaUpdate.of(base_schema)
|
943
|
+
.update_field_consistency_type("name", SchemaConsistencyType.COERCE)
|
944
|
+
.apply()
|
945
|
+
)
|
946
|
+
|
947
|
+
# Then update to None
|
948
|
+
update = SchemaUpdate.of(schema_with_coerce)
|
949
|
+
result_schema = update.update_field_consistency_type("name", None).apply()
|
950
|
+
|
951
|
+
updated_field = result_schema.field("name")
|
952
|
+
assert updated_field.consistency_type is None
|
953
|
+
|
954
|
+
def test_update_field_consistency_type_nonexistent_field_fails(self, base_schema):
|
955
|
+
"""Test updating consistency type of a field that doesn't exist fails."""
|
956
|
+
update = SchemaUpdate.of(base_schema)
|
957
|
+
with pytest.raises(SchemaCompatibilityError) as exc_info:
|
958
|
+
update.update_field_consistency_type(
|
959
|
+
"nonexistent", SchemaConsistencyType.VALIDATE
|
960
|
+
).apply()
|
961
|
+
|
962
|
+
assert "does not exist" in str(exc_info.value)
|
963
|
+
assert exc_info.value.field_locator == "nonexistent"
|
964
|
+
|
965
|
+
def test_update_field_future_default_success(self, base_schema):
|
966
|
+
"""Test successfully updating field future default."""
|
967
|
+
update = SchemaUpdate.of(base_schema)
|
968
|
+
result_schema = update.update_field_future_default("name", "Unknown").apply()
|
969
|
+
|
970
|
+
# Field should have new future default but same other properties
|
971
|
+
original_field = base_schema.field("name")
|
972
|
+
updated_field = result_schema.field("name")
|
973
|
+
|
974
|
+
assert updated_field.future_default == "Unknown"
|
975
|
+
assert updated_field.arrow.name == original_field.arrow.name
|
976
|
+
assert updated_field.arrow.type == original_field.arrow.type
|
977
|
+
assert updated_field.arrow.nullable == original_field.arrow.nullable
|
978
|
+
assert updated_field.id == original_field.id
|
979
|
+
assert updated_field.doc == original_field.doc
|
980
|
+
assert updated_field.consistency_type == original_field.consistency_type
|
981
|
+
assert updated_field.past_default == original_field.past_default
|
982
|
+
|
983
|
+
def test_update_field_future_default_to_none(self, base_schema):
|
984
|
+
"""Test updating field future default to None."""
|
985
|
+
# First set some future default
|
986
|
+
schema_with_default = (
|
987
|
+
SchemaUpdate.of(base_schema)
|
988
|
+
.update_field_future_default("name", "Default Name")
|
989
|
+
.apply()
|
990
|
+
)
|
991
|
+
|
992
|
+
# Then update to None
|
993
|
+
update = SchemaUpdate.of(schema_with_default)
|
994
|
+
result_schema = update.update_field_future_default("name", None).apply()
|
995
|
+
|
996
|
+
updated_field = result_schema.field("name")
|
997
|
+
assert updated_field.future_default is None
|
998
|
+
|
999
|
+
def test_update_field_future_default_invalid_type_fails(self, base_schema):
|
1000
|
+
"""Test updating field future default with incompatible type fails."""
|
1001
|
+
update = SchemaUpdate.of(base_schema)
|
1002
|
+
with pytest.raises(ValueError) as exc_info:
|
1003
|
+
update.update_field_future_default(
|
1004
|
+
"name", 123
|
1005
|
+
).apply() # int for string field
|
1006
|
+
|
1007
|
+
assert "not compatible with type" in str(exc_info.value)
|
1008
|
+
|
1009
|
+
def test_update_field_future_default_nonexistent_field_fails(self, base_schema):
|
1010
|
+
"""Test updating future default of a field that doesn't exist fails."""
|
1011
|
+
update = SchemaUpdate.of(base_schema)
|
1012
|
+
with pytest.raises(SchemaCompatibilityError) as exc_info:
|
1013
|
+
update.update_field_future_default("nonexistent", "value").apply()
|
1014
|
+
|
1015
|
+
assert "does not exist" in str(exc_info.value)
|
1016
|
+
assert exc_info.value.field_locator == "nonexistent"
|
1017
|
+
|
1018
|
+
def test_method_chaining_with_metadata_preservation(self, base_schema):
|
1019
|
+
"""Test that chaining operations on the same field preserves metadata correctly."""
|
1020
|
+
result_schema = (
|
1021
|
+
SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
|
1022
|
+
.update_field_type("age", pa.int64())
|
1023
|
+
.update_field_consistency_type("age", SchemaConsistencyType.VALIDATE)
|
1024
|
+
.update_field_future_default("age", 0)
|
1025
|
+
.apply()
|
1026
|
+
)
|
1027
|
+
|
1028
|
+
age_field = result_schema.field("age")
|
1029
|
+
assert age_field.arrow.type == pa.int64()
|
1030
|
+
assert age_field.consistency_type == SchemaConsistencyType.VALIDATE
|
1031
|
+
assert age_field.future_default == 0
|
1032
|
+
|
1033
|
+
def test_individual_methods_work_correctly(self, base_schema):
|
1034
|
+
"""Test that each method works correctly on its own."""
|
1035
|
+
# Test doc update
|
1036
|
+
result1 = (
|
1037
|
+
SchemaUpdate.of(base_schema).update_field_doc("name", "Full name").apply()
|
1038
|
+
)
|
1039
|
+
assert result1.field("name").doc == "Full name"
|
1040
|
+
|
1041
|
+
# Test nullability update
|
1042
|
+
result2 = (
|
1043
|
+
SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
|
1044
|
+
.update_field_nullability("name", False)
|
1045
|
+
.apply()
|
1046
|
+
)
|
1047
|
+
assert result2.field("name").arrow.nullable is False
|
1048
|
+
|
1049
|
+
# Test rename
|
1050
|
+
result3 = SchemaUpdate.of(base_schema).rename_field("name", "full_name").apply()
|
1051
|
+
assert result3.field("full_name").arrow.name == "full_name"
|
1052
|
+
|
1053
|
+
# Test multiple independent operations
|
1054
|
+
result4 = (
|
1055
|
+
SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
|
1056
|
+
.update_field_type("age", pa.int64())
|
1057
|
+
.update_field_consistency_type("name", SchemaConsistencyType.VALIDATE)
|
1058
|
+
.apply()
|
1059
|
+
)
|
1060
|
+
|
1061
|
+
assert result4.field("age").arrow.type == pa.int64()
|
1062
|
+
assert result4.field("name").consistency_type == SchemaConsistencyType.VALIDATE
|
1063
|
+
|
1064
|
+
def test_method_chaining_different_fields(self, base_schema):
|
1065
|
+
"""Test chaining operations on different fields."""
|
1066
|
+
result_schema = (
|
1067
|
+
SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
|
1068
|
+
.update_field_type("age", pa.int64())
|
1069
|
+
.update_field_doc("name", "Updated name")
|
1070
|
+
.update_field_consistency_type("id", SchemaConsistencyType.VALIDATE)
|
1071
|
+
.apply()
|
1072
|
+
)
|
1073
|
+
|
1074
|
+
age_field = result_schema.field("age")
|
1075
|
+
assert age_field.arrow.type == pa.int64()
|
1076
|
+
|
1077
|
+
name_field = result_schema.field("name")
|
1078
|
+
assert name_field.doc == "Updated name"
|
1079
|
+
|
1080
|
+
id_field = result_schema.field("id")
|
1081
|
+
assert id_field.consistency_type == SchemaConsistencyType.VALIDATE
|
1082
|
+
|
1083
|
+
def test_user_friendly_methods_vs_protected_update_field(self, base_schema):
|
1084
|
+
"""Test that user-friendly methods produce same results as protected _update_field."""
|
1085
|
+
# Using update_field_type should be equivalent to using _update_field with manually constructed field
|
1086
|
+
update1 = SchemaUpdate.of(base_schema)
|
1087
|
+
result1 = update1.update_field_type("age", pa.int64()).apply()
|
1088
|
+
|
1089
|
+
# Manually construct the updated field
|
1090
|
+
original_field = base_schema.field("age")
|
1091
|
+
new_arrow_field = pa.field(
|
1092
|
+
original_field.arrow.name,
|
1093
|
+
pa.int64(),
|
1094
|
+
nullable=original_field.arrow.nullable,
|
1095
|
+
metadata=original_field.arrow.metadata,
|
1096
|
+
)
|
1097
|
+
updated_field = Field.of(
|
1098
|
+
new_arrow_field,
|
1099
|
+
field_id=original_field.id,
|
1100
|
+
is_merge_key=original_field.is_merge_key,
|
1101
|
+
merge_order=original_field.merge_order,
|
1102
|
+
is_event_time=original_field.is_event_time,
|
1103
|
+
doc=original_field.doc,
|
1104
|
+
past_default=original_field.past_default,
|
1105
|
+
future_default=original_field.future_default,
|
1106
|
+
consistency_type=original_field.consistency_type,
|
1107
|
+
native_object=original_field.native_object,
|
1108
|
+
)
|
1109
|
+
|
1110
|
+
update2 = SchemaUpdate.of(base_schema)
|
1111
|
+
result2 = update2._update_field("age", updated_field).apply()
|
1112
|
+
|
1113
|
+
# Results should be equivalent
|
1114
|
+
field1 = result1.field("age")
|
1115
|
+
field2 = result2.field("age")
|
1116
|
+
|
1117
|
+
assert field1.arrow.type == field2.arrow.type
|
1118
|
+
assert field1.arrow.name == field2.arrow.name
|
1119
|
+
assert field1.arrow.nullable == field2.arrow.nullable
|
1120
|
+
assert field1.id == field2.id
|
1121
|
+
assert field1.doc == field2.doc
|
1122
|
+
assert field1.consistency_type == field2.consistency_type
|
1123
|
+
|
1124
|
+
def test_schema_update_convenience_method(self, base_schema):
|
1125
|
+
"""Test Schema.update() convenience method."""
|
1126
|
+
# Start with a schema that has a field we can modify
|
1127
|
+
base_schema = (
|
1128
|
+
base_schema.update()
|
1129
|
+
.add_field(Field.of(pa.field("score", pa.int32(), nullable=True)))
|
1130
|
+
.apply()
|
1131
|
+
)
|
1132
|
+
|
1133
|
+
# Test update_field_type (compatible type widening)
|
1134
|
+
result1 = base_schema.update().update_field_type("score", pa.int64()).apply()
|
1135
|
+
score_field = result1.field("score")
|
1136
|
+
assert score_field.arrow.type == pa.int64()
|
1137
|
+
assert score_field.arrow.name == "score"
+        assert score_field.id == 4  # Auto-assigned when the field was added
+
+        # Test update_field_doc
+        result2 = base_schema.update().update_field_doc("score", "User score").apply()
+        score_field = result2.field("score")
+        assert score_field.doc == "User score"
+        assert score_field.arrow.type == pa.int32()  # Type unchanged
+
+        # Test update_field_consistency_type
+        result3 = (
+            base_schema.update()
+            .update_field_consistency_type("score", SchemaConsistencyType.VALIDATE)
+            .apply()
+        )
+        score_field = result3.field("score")
+        assert score_field.consistency_type == SchemaConsistencyType.VALIDATE
+        assert score_field.arrow.type == pa.int32()  # Type unchanged
+
+        # Test update_field_future_default
+        result4 = base_schema.update().update_field_future_default("score", 100).apply()
+        score_field = result4.field("score")
+        assert score_field.future_default == 100
+        assert score_field.arrow.type == pa.int32()  # Type unchanged
+
+        # Test rename_field
+        result5 = base_schema.update().rename_field("score", "user_score").apply()
+        field_names = [f.path[0] for f in result5.fields if f.path]
+        assert "score" not in field_names
+        assert "user_score" in field_names
+        renamed_field = result5.field("user_score")
+        assert renamed_field.arrow.name == "user_score"
+        assert renamed_field.arrow.type == pa.int32()
+        assert renamed_field.id == 4  # Retains the original auto-assigned field ID
+
+        # Test method chaining
+        result6 = (
+            base_schema.update()
+            .update_field_type("score", pa.int64())
+            .update_field_doc("score", "User score in points")
+            .update_field_consistency_type("score", SchemaConsistencyType.COERCE)
+            .update_field_future_default("score", 0)
+            .apply()
+        )
+
+        final_score_field = result6.field("score")
+        assert final_score_field.arrow.type == pa.int64()
+        assert final_score_field.doc == "User score in points"
+        assert final_score_field.consistency_type == SchemaConsistencyType.COERCE
+        assert final_score_field.future_default == 0
+
+    def test_add_multiple_fields_unique_field_ids(self, base_schema):
+        """Test adding multiple fields in one SchemaUpdate gets unique, incremental field IDs."""
+        # Create multiple new fields with different types to add simultaneously
+        # Note: Field IDs specified here will be ignored and auto-assigned
+        new_field1 = Field.of(
+            pa.field("email", pa.string(), nullable=True),
+            field_id=999,  # Will be ignored, auto-assigned to 4
+        )
+        new_field2 = Field.of(
+            pa.field("score", pa.float64(), nullable=True),
+            field_id=888,  # Will be ignored, auto-assigned to 5
+        )
+        new_field3 = Field.of(
+            pa.field("active", pa.bool_(), nullable=False),
+            field_id=777,  # Will be ignored, auto-assigned to 6
+            past_default=True,
+        )
+        new_field4 = Field.of(
+            pa.field("created_at", pa.timestamp("us"), nullable=True),
+            field_id=666,  # Will be ignored, auto-assigned to 7
+        )
+
+        # Add all fields in a single SchemaUpdate operation
+        update = SchemaUpdate.of(base_schema)
+        result_schema = (
+            update.add_field(new_field1)
+            .add_field(new_field2)
+            .add_field(new_field3)
+            .add_field(new_field4)
+            .apply()
+        )
+
+        # Verify all fields were added successfully
+        assert len(result_schema.fields) == 7  # 3 original + 4 new
+
+        # Verify each field has the expected unique field ID
+        email_field = result_schema.field("email")
+        assert email_field.id == 4
+        assert email_field.arrow.name == "email"
+        assert email_field.arrow.type == pa.string()
+
+        score_field = result_schema.field("score")
+        assert score_field.id == 5
+        assert score_field.arrow.name == "score"
+        assert score_field.arrow.type == pa.float64()
+
+        active_field = result_schema.field("active")
+        assert active_field.id == 6
+        assert active_field.arrow.name == "active"
+        assert active_field.arrow.type == pa.bool_()
+        assert active_field.past_default is True
+
+        created_at_field = result_schema.field("created_at")
+        assert created_at_field.id == 7
+        assert created_at_field.arrow.name == "created_at"
+        assert pa.types.is_timestamp(created_at_field.arrow.type)
+
+        # Verify original fields are preserved
+        assert result_schema.field("id").id == 1
+        assert result_schema.field("name").id == 2
+        assert result_schema.field("age").id == 3
+
+        # Verify no duplicate field IDs exist
+        field_ids = [field.id for field in result_schema.fields]
+        assert len(field_ids) == len(
+            set(field_ids)
+        ), f"Duplicate field IDs found: {field_ids}"
+
+        # Verify field IDs are sequential starting from max_field_id + 1
+        expected_ids = [1, 2, 3, 4, 5, 6, 7]
+        assert sorted(field_ids) == expected_ids
+
+    def test_conflicting_operations_add_then_remove_same_field(self, base_schema):
+        """Test conflicting operations: adding a field then removing the same field should raise ValueError."""
+        new_field = Field.of(pa.field("temp", pa.string(), nullable=True), field_id=4)
+
+        # Add field then remove the same field - should raise ValueError for conflicting operations
+        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
+        with pytest.raises(ValueError) as exc_info:
+            (
+                update.add_field(new_field)
+                .remove_field("temp")  # Conflicts with add operation
+                .apply()
+            )
+
+        assert "Conflicting operations detected on field 'temp'" in str(exc_info.value)
+
+    def test_conflicting_operations_remove_then_add_same_field(self, base_schema):
+        """Test conflicting operations: removing a field then adding it back should raise ValueError."""
+        # Remove existing field then add it back - should raise ValueError for conflicting operations
+        replacement_field = Field.of(
+            pa.field("age", pa.int32(), nullable=True), field_id=3
+        )
+
+        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
+        with pytest.raises(ValueError) as exc_info:
+            (
+                update.remove_field("age")  # Remove existing field
+                .add_field(replacement_field)  # Conflicts with remove operation
+                .apply()
+            )
+
+        assert "Conflicting operations detected on field 'age'" in str(exc_info.value)
+
+    def test_conflicting_operations_update_then_remove_same_field(self, base_schema):
+        """Test conflicting operations: updating a field then removing it should raise ValueError."""
+        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
+        with pytest.raises(ValueError) as exc_info:
+            (
+                update.update_field_type("age", pa.int64())  # Update field type
+                .remove_field("age")  # Conflicts with update operation
+                .apply()
+            )
+
+        assert "Conflicting operations detected on field 'age'" in str(exc_info.value)
+
+    def test_conflicting_operations_remove_then_update_same_field_fails(
+        self, base_schema
+    ):
+        """Test conflicting operations: removing a field then trying to update it fails during method chaining.
+
+        Note: This fails with AttributeError during the update_field_type() call because _get_existing_field
+        returns None for the removed field. This happens before our conflict validation in apply().
+        The conflict detection catches most cases, but this specific order triggers the old behavior.
+        """
+        update = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
+
+        # Remove field first, then try to update it - fails during the update_field_type() call
+        with pytest.raises(AttributeError) as exc_info:
+            (
+                update.remove_field("age").update_field_type(  # Remove field
+                    "age", pa.int64()
+                )  # Fails here due to _get_existing_field returning None
+            )
+
+        assert "NoneType" in str(exc_info.value)
+        assert "arrow" in str(exc_info.value)
+
+    def test_multiple_updates_same_field_allowed(self, base_schema):
+        """Test multiple updates to the same field are allowed and applied cumulatively."""
+        update = SchemaUpdate.of(base_schema)
+        result_schema = (
+            update.update_field_type("age", pa.int64())  # First update
+            .update_field_doc("age", "Age in years")  # Second update - should work
+            .update_field_consistency_type(
+                "age", SchemaConsistencyType.VALIDATE
+            )  # Third update
+            .apply()
+        )
+
+        # All updates should be applied cumulatively
+        updated_field = result_schema.field("age")
+        assert updated_field.arrow.type == pa.int64()
+        assert updated_field.doc == "Age in years"
+        assert updated_field.consistency_type == SchemaConsistencyType.VALIDATE
+        assert updated_field.id == 3  # ID should remain the same
+
+    def test_multiple_updates_only_are_allowed_explicitly(self, base_schema):
+        """Test that ONLY multiple update operations on the same field are allowed - this demonstrates the refined logic."""
+        update = SchemaUpdate.of(base_schema)
+
+        # Multiple update operations should work
+        result_schema = (
+            update.update_field_type("age", pa.int64())
+            .update_field_doc("age", "Updated age field")
+            .update_field_consistency_type("age", SchemaConsistencyType.COERCE)
+            .update_field_future_default("age", 25)
+            .apply()
+        )
+
+        # All updates should be applied cumulatively
+        age_field = result_schema.field("age")
+        assert age_field.arrow.type == pa.int64()
+        assert age_field.doc == "Updated age field"
+        assert age_field.consistency_type == SchemaConsistencyType.COERCE
+        assert age_field.future_default == 25
+        assert age_field.id == 3  # Original ID preserved
+
+    def test_non_conflicting_operations_succeed(self, base_schema):
+        """Test that non-conflicting operations on different fields succeed."""
+        new_field = Field.of(
+            pa.field("email", pa.string(), nullable=True), field_id=999
+        )
+
+        update = SchemaUpdate.of(base_schema)
+        result_schema = (
+            update.add_field(new_field)  # Add new field "email"
+            .update_field_type("age", pa.int64())  # Update different field "age"
+            .update_field_doc("name", "Full name")  # Update different field "name"
+            .apply()
+        )
+
+        # All operations should succeed since they target different fields
+        assert len(result_schema.fields) == 4  # 3 original + 1 new
+
+        # Verify new field was added with auto-assigned ID
+        email_field = result_schema.field("email")
+        assert email_field.id == 4  # Auto-assigned, not 999
+        assert email_field.arrow.name == "email"
+
+        # Verify updates were applied
+        age_field = result_schema.field("age")
+        assert age_field.arrow.type == pa.int64()
+        assert age_field.id == 3  # Original ID preserved
+
+        name_field = result_schema.field("name")
+        assert name_field.doc == "Full name"
+        assert name_field.id == 2  # Original ID preserved
+
+    def test_add_duplicate_field_name_fails(self, base_schema):
+        """Test that adding a field with a name that already exists fails."""
+        # Try to add a field with same name as existing field
+        duplicate_field = Field.of(
+            pa.field("name", pa.int32(), nullable=True), field_id=4
+        )
+
+        update = SchemaUpdate.of(base_schema)
+        with pytest.raises(SchemaCompatibilityError) as exc_info:
+            update.add_field(duplicate_field).apply()
+
+        assert "already exists" in str(exc_info.value)
+
+    def test_add_field_ignores_user_specified_field_id(self, base_schema):
+        """Test that add_field operations ignore user-specified field IDs and auto-assign sequentially.
+
+        This ensures field ID uniqueness and prevents users from accidentally creating
+        conflicts by specifying existing field IDs.
+        """
+        # Try to add fields with conflicting field IDs (should be ignored)
+        new_field1 = Field.of(
+            pa.field("email", pa.string(), nullable=True),
+            field_id=1,  # Intentionally conflicts with existing "id" field
+        )
+        new_field2 = Field.of(
+            pa.field("score", pa.float64(), nullable=True),
+            field_id=2,  # Intentionally conflicts with existing "name" field
+        )
+        new_field3 = Field.of(
+            pa.field("active", pa.bool_(), nullable=True),
+            field_id=999,  # High number that should be ignored
+        )
+
+        update = SchemaUpdate.of(base_schema)
+        result_schema = (
+            update.add_field(new_field1)
+            .add_field(new_field2)
+            .add_field(new_field3)
+            .apply()
+        )
+
+        # New fields should get auto-assigned field IDs starting from max_field_id + 1
+        assert len(result_schema.fields) == 6  # 3 original + 3 new
+
+        # Verify original fields keep their IDs
+        assert result_schema.field("id").id == 1
+        assert result_schema.field("name").id == 2
+        assert result_schema.field("age").id == 3
+
+        # Verify new fields get sequential auto-assigned IDs (ignoring user input)
+        email_field = result_schema.field("email")
+        score_field = result_schema.field("score")
+        active_field = result_schema.field("active")
+
+        assert email_field.id == 4  # Not 1 (user-specified)
+        assert score_field.id == 5  # Not 2 (user-specified)
+        assert active_field.id == 6  # Not 999 (user-specified)
+
+        # Verify no duplicate field IDs
+        field_ids = [field.id for field in result_schema.fields]
+        assert len(field_ids) == len(
+            set(field_ids)
+        ), f"Duplicate field IDs found: {field_ids}"
+
+    def test_update_field_preserves_original_field_id(self, base_schema):
+        """Test that update operations preserve the original field's ID regardless of user input.
+
+        This ensures field ID stability during updates - the field ID should never change
+        when updating an existing field's properties.
+        """
+        # Create a field update with a different field ID (should be ignored)
+        update = SchemaUpdate.of(base_schema)
+
+        # Update field type - the field ID in this context should be ignored
+        result_schema = update.update_field_type("age", pa.int64()).apply()
+
+        # Field ID should remain the same as original (3), not change
+        updated_field = result_schema.field("age")
+        original_field = base_schema.field("age")
+
+        assert updated_field.id == original_field.id  # Should be 3
+        assert updated_field.arrow.type == pa.int64()  # Type should be updated
+        assert updated_field.arrow.name == "age"  # Name should stay the same
+
+        # All other fields should keep their original IDs too
+        assert result_schema.field("id").id == 1
+        assert result_schema.field("name").id == 2
+
+    def test_mixed_add_update_field_id_management(self, base_schema):
+        """Test field ID management with mixed add and update operations.
+
+        Updates should preserve existing field IDs, while adds should get new sequential IDs.
+        """
+        # Add a field with conflicting ID, then update an existing field
+        new_field = Field.of(
+            pa.field("email", pa.string(), nullable=True),
+            field_id=2,  # Same as "name" field - should be ignored
+        )
+
+        update = SchemaUpdate.of(base_schema)
+        result_schema = (
+            update.add_field(new_field)  # Should get field_id=4, not 2
+            .update_field_type("age", pa.int64())  # Should keep field_id=3
+            .update_field_doc("name", "Full name")  # Should keep field_id=2
+            .apply()
+        )
+
+        # Verify field ID assignments
+        assert result_schema.field("id").id == 1  # Original
+        assert result_schema.field("name").id == 2  # Original, updated doc
+        assert result_schema.field("age").id == 3  # Original, updated type
+        assert result_schema.field("email").id == 4  # New field, auto-assigned
+
+        # Verify updates were applied
+        assert result_schema.field("age").arrow.type == pa.int64()
+        assert result_schema.field("name").doc == "Full name"
+        assert result_schema.field("email").arrow.name == "email"
+
+        # Verify no duplicates
+        field_ids = [field.id for field in result_schema.fields]
+        assert len(field_ids) == len(set(field_ids))
+
+    def test_field_id_auto_assignment_with_gaps(self):
+        """Test that field ID auto-assignment handles gaps in existing field IDs correctly.
+
+        If the schema has field IDs [1, 3, 7], new fields should start from 8.
+        """
+        # Create a schema with gaps in field IDs
+        schema_with_gaps = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1),
+                Field.of(pa.field("name", pa.string()), field_id=3),  # Gap at 2
+                Field.of(pa.field("score", pa.float32()), field_id=7),  # Gap at 4,5,6
+            ]
+        )
+
+        new_field = Field.of(
+            pa.field("email", pa.string(), nullable=True),
+            field_id=999,  # Should be ignored, auto-assigned to 8
+        )
+
+        update = SchemaUpdate.of(schema_with_gaps)
+        result_schema = update.add_field(new_field).apply()
+
+        # New field should get max_field_id + 1 = 7 + 1 = 8
+        email_field = result_schema.field("email")
+        assert email_field.id == 8  # Not 999 or any of the existing gaps
+
+        # Original fields should be unchanged
+        assert result_schema.field("id").id == 1
+        assert result_schema.field("name").id == 3
+        assert result_schema.field("score").id == 7
+
+    def test_field_id_never_reused_after_max_field_removal(self):
+        """Test that field IDs are never reused, even when the max field ID is removed and a same-named field is added back.
+
+        This ensures field ID uniqueness over schema evolution history - a field with the same name
+        but added after removal gets a new field ID, making it clear it's a different field.
+        """
+        # Create schema with fields having IDs 1, 2, 3
+        base_schema = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1),
+                Field.of(pa.field("name", pa.string()), field_id=2),
+                Field.of(
+                    pa.field("score", pa.float32()), field_id=3
+                ),  # This has max field ID
+            ]
+        )
+
+        # Step 1: Remove the field with the max field ID (score, ID=3)
+        update1 = SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
+        schema_after_remove = update1.remove_field("score").apply()
+
+        # Verify the field is removed
+        assert len(schema_after_remove.fields) == 2
+        field_names = [f.path[0] for f in schema_after_remove.fields if f.path]
+        assert "score" not in field_names
+        assert "id" in field_names
+        assert "name" in field_names
+
+        # Max field ID should still be 3 (based on original schema)
+        assert schema_after_remove.max_field_id == 3
+
+        # Step 2: Add a field back with the same name ("score") but different type
+        new_score_field = Field.of(
+            pa.field(
+                "score", pa.int32(), nullable=True
+            ),  # Different type than original
+            field_id=999,  # Will be ignored, should get ID 4 (not reuse 3)
+        )
+
+        update2 = SchemaUpdate.of(schema_after_remove)
+        schema_after_add = update2.add_field(new_score_field).apply()
+
+        # Verify field is added back
+        assert len(schema_after_add.fields) == 3
+        restored_score_field = schema_after_add.field("score")
+
+        # New field should get ID 4 (max_field_id + 1), NOT reuse ID 3
+        assert (
+            restored_score_field.id == 4
+        )  # Should be 4, not 3 (the removed field's ID)
+        assert restored_score_field.arrow.name == "score"
+        assert (
+            restored_score_field.arrow.type == pa.int32()
+        )  # Different type than original
+
+        # Original fields should keep their IDs
+        assert schema_after_add.field("id").id == 1
+        assert schema_after_add.field("name").id == 2
+
+        # Verify no duplicate field IDs in final schema
+        field_ids = [field.id for field in schema_after_add.fields]
+        assert len(field_ids) == len(
+            set(field_ids)
+        ), f"Duplicate field IDs found: {field_ids}"
+        assert sorted(field_ids) == [1, 2, 4]  # Field ID 3 is permanently "retired"
+
+    def test_field_id_never_reused_multiple_removes_adds(self):
+        """Test field ID non-reuse with multiple remove/add cycles.
+
+        This tests that field IDs continue incrementing even through multiple
+        remove and add operations, ensuring each field gets a truly unique ID.
+        """
+        # Start with schema having IDs 1, 2, 3
+        base_schema = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1),
+                Field.of(pa.field("name", pa.string()), field_id=2),
+                Field.of(pa.field("score", pa.float32()), field_id=3),
+            ]
+        )
+
+        # Remove field with ID 3, add new field -> should get ID 4
+        step1_schema = (
+            SchemaUpdate.of(base_schema, allow_incompatible_changes=True)
+            .remove_field("score")
+            .add_field(Field.of(pa.field("email", pa.string()), field_id=999))
+            .apply()
+        )
+        assert step1_schema.field("email").id == 4
+
+        # Remove field with ID 2, add new field -> should get ID 5
+        step2_schema = (
+            SchemaUpdate.of(step1_schema, allow_incompatible_changes=True)
+            .remove_field("name")
+            .add_field(Field.of(pa.field("phone", pa.string()), field_id=888))
+            .apply()
+        )
+        assert step2_schema.field("phone").id == 5
+
+        # Add back "name" field -> should get ID 6 (not reuse 2)
+        step3_schema = (
+            SchemaUpdate.of(step2_schema)
+            .add_field(Field.of(pa.field("name", pa.string()), field_id=777))
+            .apply()
+        )
+        restored_name_field = step3_schema.field("name")
+        assert restored_name_field.id == 6  # Not 2 (the original name field's ID)
+
+        # Final schema should have fields with IDs [1, 4, 5, 6]
+        # IDs 2 and 3 are permanently "retired"
+        field_ids = [field.id for field in step3_schema.fields]
+        assert sorted(field_ids) == [1, 4, 5, 6]
+
+        # Verify field names in final schema
+        field_names = [f.path[0] for f in step3_schema.fields if f.path]
+        assert sorted(field_names) == ["email", "id", "name", "phone"]
+
+    def test_schema_update_increments_id_by_one(self, base_schema):
+        """Test that SchemaUpdate.apply() increments schema ID by exactly 1."""
+        # Create a schema with a specific schema ID
+        test_schema = Schema.of(
+            [
+                Field.of(
+                    pa.field("id", pa.int64(), nullable=False),
+                    field_id=1,
+                    is_merge_key=True,
+                ),
+                Field.of(pa.field("name", pa.string(), nullable=True), field_id=2),
+                Field.of(pa.field("age", pa.int32(), nullable=True), field_id=3),
+            ],
+            schema_id=5,  # Explicitly set schema ID to 5
+        )
+
+        # Verify base schema has the expected ID
+        assert test_schema.id == 5
+
+        # Apply a schema update (add a new field)
+        new_field = Field.of(pa.field("email", pa.string(), nullable=True), field_id=4)
+        updated_schema = SchemaUpdate.of(test_schema).add_field(new_field).apply()
+
+        # Verify the updated schema has ID = base_schema.id + 1
+        assert updated_schema.id == 6  # 5 + 1
+        assert len(updated_schema.fields) == 4
+
+    def test_schema_update_increments_id_from_zero(self):
+        """Test that schema ID increments correctly when starting from 0."""
+        # Create a schema with default schema ID (0)
+        base_schema = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1),
+                Field.of(pa.field("name", pa.string()), field_id=2),
+            ]
+        )
+
+        # Verify base schema has default ID of 0
+        assert base_schema.id == 0
+
+        # Apply a schema update
+        updated_schema = (
+            base_schema.update()
+            .update_field_type("name", pa.string())
+            .update_field_doc("name", "Full name")
+            .apply()
+        )
+
+        # Verify the updated schema has ID = 0 + 1 = 1
+        assert updated_schema.id == 1
+
+    def test_multiple_schema_updates_increment_sequentially(self):
+        """Test that multiple schema updates increment ID sequentially."""
+        # Start with schema ID 10
+        base_schema = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1),
+                Field.of(pa.field("name", pa.string()), field_id=2),
+            ],
+            schema_id=10,
+        )
+
+        assert base_schema.id == 10
+
+        # First update: should go from 10 to 11
+        schema_v11 = (
+            base_schema.update()
+            .add_field(Field.of(pa.field("age", pa.int32(), nullable=True)))
+            .apply()
+        )
+        assert schema_v11.id == 11
+
+        # Second update: should go from 11 to 12
+        schema_v12 = (
+            schema_v11.update()
+            .add_field(Field.of(pa.field("email", pa.string(), nullable=True)))
+            .apply()
+        )
+        assert schema_v12.id == 12
+
+        # Third update: should go from 12 to 13
+        schema_v13 = (
+            schema_v12.update()
+            .update_field_consistency_type("name", SchemaConsistencyType.VALIDATE)
+            .apply()
+        )
+        assert schema_v13.id == 13
+
+    def test_schema_update_different_operation_types_increment_id(self):
+        """Test that different types of schema operations all increment schema ID."""
+        base_schema = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1, is_merge_key=True),
+                Field.of(pa.field("name", pa.string()), field_id=2),
+                Field.of(pa.field("age", pa.int32()), field_id=3),
+            ],
+            schema_id=100,
+        )
+
+        # Test add field operation
+        add_result = (
+            base_schema.update()
+            .add_field(Field.of(pa.field("email", pa.string(), nullable=True)))
+            .apply()
+        )
+        assert add_result.id == 101
+
+        # Test update field operation
+        update_result = (
+            base_schema.update().update_field_type("age", pa.int64()).apply()
+        )
+        assert update_result.id == 101
+
+        # Test rename field operation
+        rename_result = base_schema.update().rename_field("name", "full_name").apply()
+        assert rename_result.id == 101
+
+        # Test remove field operation (with incompatible changes allowed)
+        remove_result = (
+            base_schema.update(allow_incompatible_changes=True)
+            .remove_field("age")
+            .apply()
+        )
+        assert remove_result.id == 101
+
+        # Test update field documentation
+        doc_result = (
+            base_schema.update().update_field_doc("name", "Person's full name").apply()
+        )
+        assert doc_result.id == 101
+
+    def test_schema_update_chained_operations_increment_once(self):
+        """Test that multiple chained operations in one update increment ID by 1, not per operation."""
+        base_schema = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1),
+                Field.of(pa.field("name", pa.string()), field_id=2),
+                Field.of(pa.field("age", pa.int32()), field_id=3),
+            ],
+            schema_id=50,
+        )
+
+        # Chain multiple operations in a single SchemaUpdate
+        chained_result = (
+            base_schema.update()
+            .add_field(Field.of(pa.field("email", pa.string(), nullable=True)))
+            .add_field(Field.of(pa.field("phone", pa.string(), nullable=True)))
+            .update_field_type("age", pa.int64())
+            .update_field_doc("name", "Full name")
+            .rename_field("id", "user_id")
+            .apply()
+        )
+
+        # Even with 5 operations, schema ID should only increment by 1
+        assert chained_result.id == 51  # 50 + 1, not 50 + 5
+
+    def test_schema_subschema_operations_increment_id(self):
+        """Test that subschema operations (add/delete/replace) also increment schema ID by 1."""
+        # Create a base schema
+        base_schema = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1),
+                Field.of(pa.field("name", pa.string()), field_id=2),
+            ],
+            schema_id=20,
+        )
+
+        # Test add_subschema operation
+        add_subschema_result = base_schema.add_subschema(
+            "user_profile",
+            [
+                Field.of(pa.field("email", pa.string()), field_id=3),
+                Field.of(pa.field("age", pa.int32()), field_id=4),
+            ],
+        )
+        assert add_subschema_result.id == 21  # 20 + 1
+
+        # Test replace_subschema operation
+        schema_with_subschema = base_schema.add_subschema(
+            "test_subschema", [Field.of(pa.field("temp", pa.string()), field_id=5)]
+        )
+        replace_result = schema_with_subschema.replace_subschema(
+            "test_subschema", [Field.of(pa.field("replaced", pa.int32()), field_id=6)]
+        )
+        assert replace_result.id == 22  # 21 + 1
+
+        # Test delete_subschema operation
+        delete_result = schema_with_subschema.delete_subschema("test_subschema")
+        assert delete_result.id == 22  # 21 + 1
+
+    def test_schema_id_increment_with_high_values(self):
+        """Test that schema ID increment works correctly with high values."""
+        # Test with a high schema ID to ensure no overflow issues
+        high_id = 999999
+        base_schema = Schema.of(
+            [Field.of(pa.field("id", pa.int64()), field_id=1)],
+            schema_id=high_id,
+        )
+
+        updated_schema = (
+            base_schema.update()
+            .add_field(Field.of(pa.field("name", pa.string(), nullable=True)))
+            .apply()
+        )
+
+        assert updated_schema.id == high_id + 1
+
+    def test_schema_id_preserved_in_failed_updates(self):
+        """Test that schema ID is not incremented when schema updates fail."""
+        base_schema = Schema.of(
+            [
+                Field.of(pa.field("id", pa.int64()), field_id=1),
+                Field.of(pa.field("name", pa.string()), field_id=2),
+            ],
+            schema_id=42,
+        )
+
+        # Try an operation that should fail (adding non-nullable field without defaults)
+        with pytest.raises(Exception):  # Could be SchemaCompatibilityError or other
+            base_schema.update().add_field(
+                Field.of(pa.field("required_field", pa.string(), nullable=False))
+            ).apply()
+
+        # Original schema should still have the same ID
+        assert base_schema.id == 42
+
+        # A successful update should still increment correctly
+        success_schema = (
+            base_schema.update()
+            .add_field(Field.of(pa.field("optional_field", pa.string(), nullable=True)))
+            .apply()
+        )
+        assert success_schema.id == 43
+
+    def test_schema_id_increment_consistency_across_update_methods(self):
+        """Test that schema ID increments consistently regardless of how SchemaUpdate is created."""
+        base_schema = Schema.of(
+            [Field.of(pa.field("id", pa.int64()), field_id=1)],
+            schema_id=77,
+        )
+
+        # Method 1: Using Schema.update()
+        result1 = (
+            base_schema.update()
+            .add_field(Field.of(pa.field("field1", pa.string(), nullable=True)))
+            .apply()
+        )
+        assert result1.id == 78
+
+        # Method 2: Using SchemaUpdate.of()
+        result2 = (
+            SchemaUpdate.of(base_schema)
+            .add_field(Field.of(pa.field("field2", pa.string(), nullable=True)))
+            .apply()
+        )
+        assert result2.id == 78
+
+        # Both methods should produce the same schema ID increment
+        assert result1.id == result2.id