deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -11,15 +11,15 @@ from deltacat.exceptions import (
|
|
11
11
|
UnclassifiedDeltaCatError,
|
12
12
|
)
|
13
13
|
from daft.exceptions import DaftTransientError
|
14
|
-
from deltacat.tests.
|
14
|
+
from deltacat.tests.utils.exceptions import (
|
15
15
|
InvalidNamespaceError,
|
16
|
-
|
16
|
+
MainStorageValidationError,
|
17
17
|
)
|
18
|
+
from deltacat.tests.utils import main_deltacat_storage_mock as ds
|
18
19
|
from botocore.exceptions import NoCredentialsError
|
19
20
|
from tenacity import retry, retry_if_exception_type, stop_after_attempt
|
20
21
|
|
21
22
|
from pyarrow.lib import ArrowCapacityError
|
22
|
-
import deltacat.tests.local_deltacat_storage as ds
|
23
23
|
|
24
24
|
|
25
25
|
class MockUnknownException(Exception):
|
@@ -41,7 +41,7 @@ def mock_remote_task(exception_to_raise):
|
|
41
41
|
mock_raise_exception(exception_to_raise)
|
42
42
|
|
43
43
|
|
44
|
-
class
|
44
|
+
class TestCategorizeErrorsMain(unittest.TestCase):
|
45
45
|
def test_pyarrow_exception_categorizer(self):
|
46
46
|
self.assertRaises(
|
47
47
|
DependencyPyarrowCapacityError,
|
@@ -50,7 +50,7 @@ class TestCategorizeErrors(unittest.TestCase):
|
|
50
50
|
|
51
51
|
def test_storage_exception_categorizer(self):
|
52
52
|
self.assertRaises(
|
53
|
-
|
53
|
+
MainStorageValidationError,
|
54
54
|
lambda: mock_raise_exception(InvalidNamespaceError, deltacat_storage=ds),
|
55
55
|
)
|
56
56
|
|
@@ -98,3 +98,7 @@ class TestCategorizeErrors(unittest.TestCase):
|
|
98
98
|
return
|
99
99
|
|
100
100
|
self.assertFalse(True)
|
101
|
+
|
102
|
+
|
103
|
+
if __name__ == "__main__":
|
104
|
+
unittest.main()
|
@@ -0,0 +1,54 @@
|
|
1
|
+
import base64
|
2
|
+
import msgpack
|
3
|
+
import json
|
4
|
+
import os
|
5
|
+
import shutil
|
6
|
+
|
7
|
+
from tempfile import mkdtemp
|
8
|
+
|
9
|
+
|
10
|
+
def _convert_bytes_to_base64_str(obj):
|
11
|
+
if isinstance(obj, dict):
|
12
|
+
for key, value in obj.items():
|
13
|
+
if isinstance(value, bytes):
|
14
|
+
obj[key] = base64.b64encode(value).decode("utf-8")
|
15
|
+
elif isinstance(value, list):
|
16
|
+
_convert_bytes_to_base64_str(value)
|
17
|
+
elif isinstance(value, dict):
|
18
|
+
_convert_bytes_to_base64_str(value)
|
19
|
+
elif isinstance(obj, list):
|
20
|
+
for i, item in enumerate(obj):
|
21
|
+
if isinstance(item, bytes):
|
22
|
+
obj[i] = base64.b64encode(item).decode("utf-8")
|
23
|
+
elif isinstance(item, (dict, list)):
|
24
|
+
_convert_bytes_to_base64_str(item)
|
25
|
+
|
26
|
+
|
27
|
+
def copy_and_convert(src_dir, dst_dir=None):
|
28
|
+
"""
|
29
|
+
Helper function for copying a metastore recursively and converting all
|
30
|
+
messagepack files to json. This can be used manually to more easily
|
31
|
+
introspect metastore metadata.
|
32
|
+
"""
|
33
|
+
if dst_dir is None:
|
34
|
+
dst_dir = mkdtemp()
|
35
|
+
print(f"destination is: {dst_dir}")
|
36
|
+
if not os.path.exists(dst_dir):
|
37
|
+
os.makedirs(dst_dir)
|
38
|
+
|
39
|
+
for item in os.listdir(src_dir):
|
40
|
+
src_path = os.path.join(src_dir, item)
|
41
|
+
dst_path = os.path.join(dst_dir, item)
|
42
|
+
|
43
|
+
if os.path.isdir(src_path):
|
44
|
+
copy_and_convert(src_path, dst_path)
|
45
|
+
else:
|
46
|
+
if item.endswith(".mpk"):
|
47
|
+
with open(src_path, "rb") as f:
|
48
|
+
data = msgpack.unpackb(f.read(), raw=False)
|
49
|
+
_convert_bytes_to_base64_str(data)
|
50
|
+
dst_path = dst_path[:-4] + ".json"
|
51
|
+
with open(dst_path, "w") as f:
|
52
|
+
json.dump(data, f)
|
53
|
+
else:
|
54
|
+
shutil.copy2(src_path, dst_path)
|
@@ -1,8 +1,9 @@
|
|
1
1
|
from typing import List, Optional, Union
|
2
2
|
import pyarrow as pa
|
3
3
|
from deltacat.storage import Delta, Partition, PartitionLocator, DeltaLocator
|
4
|
-
|
4
|
+
from deltacat.storage import metastore
|
5
5
|
from deltacat.types.media import StorageType, ContentType
|
6
|
+
from deltacat.storage.model.schema import Schema
|
6
7
|
|
7
8
|
|
8
9
|
def create_delta_from_csv_file(
|
@@ -14,60 +15,89 @@ def create_delta_from_csv_file(
|
|
14
15
|
*args,
|
15
16
|
**kwargs,
|
16
17
|
) -> Delta:
|
18
|
+
assert file_paths is not None, "file_paths cannot be empty"
|
19
|
+
pa_table = create_table_from_csv_file_paths(file_paths)
|
20
|
+
schema = Schema.of(pa_table.schema)
|
17
21
|
staged_partition = stage_partition_from_file_paths(
|
18
22
|
namespace,
|
19
23
|
file_paths,
|
24
|
+
schema,
|
20
25
|
*args,
|
21
26
|
table_name=table_name,
|
22
27
|
table_version=table_version,
|
23
28
|
**kwargs,
|
24
29
|
)
|
25
30
|
committed_delta = commit_delta_to_staged_partition(
|
26
|
-
staged_partition,
|
31
|
+
staged_partition,
|
32
|
+
pa_table,
|
33
|
+
content_type,
|
34
|
+
*args,
|
35
|
+
**kwargs,
|
27
36
|
)
|
28
37
|
return committed_delta
|
29
38
|
|
30
39
|
|
40
|
+
def create_table_from_csv_file_paths(
|
41
|
+
file_paths: List[str],
|
42
|
+
) -> pa.Table:
|
43
|
+
tables = []
|
44
|
+
for file_path in file_paths:
|
45
|
+
table = pa.csv.read_csv(file_path)
|
46
|
+
tables.append(table)
|
47
|
+
return pa.concat_tables(tables)
|
48
|
+
|
49
|
+
|
31
50
|
def stage_partition_from_file_paths(
|
32
51
|
namespace: str,
|
33
52
|
file_paths: List[str],
|
53
|
+
schema: Schema,
|
34
54
|
table_name: Optional[str] = None,
|
35
55
|
table_version: int = 1,
|
36
56
|
*args,
|
37
57
|
**kwargs,
|
38
58
|
) -> Partition:
|
39
|
-
|
59
|
+
if not metastore.namespace_exists(namespace, **kwargs):
|
60
|
+
metastore.create_namespace(namespace, **kwargs)
|
40
61
|
if table_name is None:
|
41
62
|
table_name = "-".join(file_paths).replace("/", "_")
|
42
|
-
|
43
|
-
|
44
|
-
|
63
|
+
metastore.create_table_version(
|
64
|
+
namespace,
|
65
|
+
table_name,
|
66
|
+
str(table_version),
|
67
|
+
schema=schema,
|
68
|
+
**kwargs,
|
69
|
+
)
|
70
|
+
stream = metastore.get_stream(
|
71
|
+
namespace,
|
72
|
+
table_name,
|
73
|
+
str(table_version),
|
74
|
+
**kwargs,
|
75
|
+
)
|
76
|
+
staged_partition = metastore.stage_partition(stream, **kwargs)
|
45
77
|
return staged_partition
|
46
78
|
|
47
79
|
|
48
80
|
def commit_delta_to_staged_partition(
|
49
81
|
staged_partition,
|
50
|
-
|
51
|
-
pa_table: pa.Table = None,
|
82
|
+
pa_table: pa.Table,
|
52
83
|
content_type: ContentType = ContentType.PARQUET,
|
53
84
|
*args,
|
54
85
|
**kwargs,
|
55
86
|
) -> Delta:
|
56
87
|
committed_delta = commit_delta_to_partition(
|
57
88
|
staged_partition,
|
89
|
+
pa_table,
|
90
|
+
content_type,
|
58
91
|
*args,
|
59
|
-
file_paths=file_paths,
|
60
|
-
content_type=content_type,
|
61
|
-
pa_table=pa_table,
|
62
92
|
**kwargs,
|
63
93
|
)
|
64
|
-
|
94
|
+
metastore.commit_partition(staged_partition, **kwargs)
|
65
95
|
return committed_delta
|
66
96
|
|
67
97
|
|
68
98
|
def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> Delta:
|
69
99
|
return pa.concat_tables(
|
70
|
-
|
100
|
+
metastore.download_delta(
|
71
101
|
delta_like,
|
72
102
|
storage_type=StorageType.LOCAL,
|
73
103
|
*args,
|
@@ -78,7 +108,6 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
|
|
78
108
|
|
79
109
|
def commit_delta_to_partition(
|
80
110
|
partition: Union[Partition, PartitionLocator],
|
81
|
-
file_paths: List[str] = None,
|
82
111
|
pa_table: pa.Table = None,
|
83
112
|
content_type: ContentType = ContentType.PARQUET,
|
84
113
|
*args,
|
@@ -86,20 +115,15 @@ def commit_delta_to_partition(
|
|
86
115
|
) -> Delta:
|
87
116
|
|
88
117
|
if isinstance(partition, PartitionLocator):
|
89
|
-
partition =
|
118
|
+
partition = metastore.get_partition(
|
90
119
|
partition.stream_locator, partition.partition_values, *args, **kwargs
|
91
120
|
)
|
92
|
-
if pa_table is None:
|
93
|
-
assert file_paths is not None, "One of pa_table or file_paths must be passed."
|
94
|
-
tables = []
|
95
|
-
for file_path in file_paths:
|
96
|
-
table = pa.csv.read_csv(file_path)
|
97
|
-
tables.append(table)
|
98
|
-
|
99
|
-
pa_table = pa.concat_tables(tables)
|
100
121
|
|
101
|
-
staged_delta =
|
102
|
-
pa_table,
|
122
|
+
staged_delta = metastore.stage_delta(
|
123
|
+
pa_table,
|
124
|
+
partition,
|
125
|
+
content_type=content_type,
|
126
|
+
**kwargs,
|
103
127
|
)
|
104
128
|
|
105
|
-
return
|
129
|
+
return metastore.commit_delta(staged_delta, **kwargs)
|
@@ -1,7 +1,57 @@
|
|
1
1
|
from typing import Optional, Dict
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
import pyarrow as pa
|
4
|
+
|
5
|
+
from deltacat import (
|
6
|
+
ContentEncoding,
|
7
|
+
ContentType,
|
8
|
+
)
|
9
|
+
from deltacat.storage import (
|
10
|
+
BucketTransform,
|
11
|
+
BucketTransformParameters,
|
12
|
+
BucketingStrategy,
|
13
|
+
CommitState,
|
14
|
+
Delta,
|
15
|
+
DeltaLocator,
|
16
|
+
DeltaType,
|
17
|
+
EntryParams,
|
18
|
+
EntryType,
|
19
|
+
Field,
|
20
|
+
LifecycleState,
|
21
|
+
ManifestAuthor,
|
22
|
+
ManifestEntry,
|
23
|
+
Namespace,
|
24
|
+
NamespaceLocator,
|
25
|
+
NullOrder,
|
26
|
+
Partition,
|
27
|
+
PartitionKey,
|
28
|
+
PartitionKeyList,
|
29
|
+
PartitionLocator,
|
30
|
+
PartitionScheme,
|
31
|
+
PartitionSchemeList,
|
32
|
+
Schema,
|
33
|
+
SchemaList,
|
34
|
+
SortScheme,
|
35
|
+
SortSchemeList,
|
36
|
+
SortKey,
|
37
|
+
SortKeyList,
|
38
|
+
SortOrder,
|
39
|
+
StreamLocator,
|
40
|
+
StreamFormat,
|
41
|
+
Stream,
|
42
|
+
Table,
|
43
|
+
TableLocator,
|
44
|
+
TableVersionLocator,
|
45
|
+
TableVersion,
|
46
|
+
TruncateTransform,
|
47
|
+
TruncateTransformParameters,
|
48
|
+
)
|
49
|
+
|
50
|
+
from deltacat.storage.model.manifest import (
|
51
|
+
Manifest,
|
52
|
+
ManifestMeta,
|
53
|
+
ManifestEntryList,
|
54
|
+
)
|
5
55
|
from deltacat.utils.common import current_time_ms
|
6
56
|
|
7
57
|
|
@@ -13,11 +63,14 @@ def create_empty_delta(
|
|
13
63
|
manifest_entry_id: Optional[str] = None,
|
14
64
|
) -> Delta:
|
15
65
|
stream_position = current_time_ms()
|
16
|
-
delta_locator = DeltaLocator.of(
|
66
|
+
delta_locator = DeltaLocator.of(
|
67
|
+
partition.locator,
|
68
|
+
stream_position=stream_position,
|
69
|
+
)
|
17
70
|
|
18
71
|
if manifest_entry_id:
|
19
72
|
manifest = Manifest.of(
|
20
|
-
entries=[],
|
73
|
+
entries=ManifestEntryList.of([]),
|
21
74
|
author=author,
|
22
75
|
uuid=manifest_entry_id,
|
23
76
|
)
|
@@ -32,3 +85,202 @@ def create_empty_delta(
|
|
32
85
|
manifest=manifest,
|
33
86
|
previous_stream_position=partition.stream_position,
|
34
87
|
)
|
88
|
+
|
89
|
+
|
90
|
+
def create_test_namespace():
|
91
|
+
namespace_locator = NamespaceLocator.of(namespace="test_namespace")
|
92
|
+
return Namespace.of(locator=namespace_locator)
|
93
|
+
|
94
|
+
|
95
|
+
def create_test_table():
    """Return a Table test fixture for "test_table" in "test_namespace"."""
    locator = TableLocator.at(namespace="test_namespace", table_name="test_table")
    return Table.of(locator=locator, description="test table description")
|
104
|
+
|
105
|
+
|
106
|
+
def create_test_table_version():
    """Return a TableVersion test fixture for version "v.1" of "test_table".

    The fixture carries a three-field schema, one partition key with a
    bucket transform, and one sort key with a truncate transform. The same
    schema, partition scheme, and sort scheme are also passed as the sole
    members of the ``schemas``, ``partition_schemes``, and ``sort_schemes``
    lists.
    """
    locator = TableVersionLocator.at(
        namespace="test_namespace",
        table_name="test_table",
        table_version="v.1",
    )

    # Schema: two non-nullable merge-key columns and one non-key float column.
    string_column = pa.field("some_string", pa.string(), nullable=False)
    int32_column = pa.field("some_int32", pa.int32(), nullable=False)
    float64_column = pa.field("some_float64", pa.float64())
    test_schema = Schema.of(
        [
            Field.of(field=string_column, field_id=1, is_merge_key=True),
            Field.of(field=int32_column, field_id=2, is_merge_key=True),
            Field.of(field=float64_column, field_id=3, is_merge_key=False),
        ]
    )

    # Partition scheme: a single key over both merge-key columns, bucketed
    # into 2 buckets.
    bucket_parameters = BucketTransformParameters.of(
        num_buckets=2,
        bucketing_strategy=BucketingStrategy.DEFAULT,
    )
    test_partition_key = PartitionKey.of(
        key=["some_string", "some_int32"],
        name="test_partition_key",
        field_id=1,
        transform=BucketTransform.of(bucket_parameters),
    )
    test_partition_scheme = PartitionScheme.of(
        keys=PartitionKeyList.of([test_partition_key]),
        name="test_partition_scheme",
        scheme_id="test_partition_scheme_id",
    )

    # Sort scheme: a single descending key on the int32 column with a
    # truncate transform of width 3.
    truncate_parameters = TruncateTransformParameters.of(width=3)
    test_sort_key = SortKey.of(
        key=["some_int32"],
        sort_order=SortOrder.DESCENDING,
        null_order=NullOrder.AT_START,
        transform=TruncateTransform.of(truncate_parameters),
    )
    test_sort_scheme = SortScheme.of(
        keys=SortKeyList.of([test_sort_key]),
        name="test_sort_scheme",
        scheme_id="test_sort_scheme_id",
    )

    return TableVersion.of(
        locator=locator,
        schema=test_schema,
        partition_scheme=test_partition_scheme,
        description="test table version description",
        properties={"test_property_key": "test_property_value"},
        content_types=[ContentType.PARQUET],
        sort_scheme=test_sort_scheme,
        watermark=None,
        lifecycle_state=LifecycleState.CREATED,
        schemas=SchemaList.of([test_schema]),
        partition_schemes=PartitionSchemeList.of([test_partition_scheme]),
        sort_schemes=SortSchemeList.of([test_sort_scheme]),
    )
|
179
|
+
|
180
|
+
|
181
|
+
def create_test_stream():
    """Return a staged Stream test fixture for "test_stream_id".

    The stream is located under "test_namespace" / "test_table" / "v.1" in
    the DELTACAT stream format and uses a partition scheme with a single
    bucket-transformed partition key.
    """
    locator = StreamLocator.at(
        namespace="test_namespace",
        table_name="test_table",
        table_version="v.1",
        stream_id="test_stream_id",
        stream_format=StreamFormat.DELTACAT,
    )

    # Partition scheme: one key over two columns, bucketed into 2 buckets.
    bucket_parameters = BucketTransformParameters.of(
        num_buckets=2,
        bucketing_strategy=BucketingStrategy.DEFAULT,
    )
    test_partition_key = PartitionKey.of(
        key=["some_string", "some_int32"],
        name="test_partition_key",
        field_id=1,
        transform=BucketTransform.of(bucket_parameters),
    )
    test_partition_scheme = PartitionScheme.of(
        keys=PartitionKeyList.of([test_partition_key]),
        name="test_partition_scheme",
        scheme_id="test_partition_scheme_id",
    )

    return Stream.of(
        locator=locator,
        partition_scheme=test_partition_scheme,
        state=CommitState.STAGED,
        previous_stream_id="test_previous_stream_id",
        watermark=1,
    )
|
215
|
+
|
216
|
+
|
217
|
+
def create_test_partition():
    """Return a staged Partition test fixture for "test_partition_id".

    The partition is located under "test_namespace" / "test_table" / "v.1" /
    "test_stream_id" with partition values ``["a", 1]``.
    """
    locator = PartitionLocator.at(
        namespace="test_namespace",
        table_name="test_table",
        table_version="v.1",
        stream_id="test_stream_id",
        stream_format=StreamFormat.DELTACAT,
        partition_values=["a", 1],
        partition_id="test_partition_id",
    )
    return Partition.of(
        locator=locator,
        content_types=[ContentType.PARQUET],
        state=CommitState.STAGED,
        previous_stream_position=0,
        previous_partition_id="test_previous_partition_id",
        stream_position=1,
        partition_scheme_id="test_partition_scheme_id",
    )
|
236
|
+
|
237
|
+
|
238
|
+
def create_test_delta():
    """Return an APPEND Delta test fixture at stream position 1.

    The delta's manifest holds one entry pointing at "s3://test/url". The
    same ManifestMeta object is used both as that entry's metadata and as
    the delta's own metadata. The manifest and its metadata are tagged as
    EQUALITY_DELETE with equality field locators on two columns.
    """
    locator = DeltaLocator.at(
        namespace="test_namespace",
        table_name="test_table",
        table_version="v.1",
        stream_id="test_stream_id",
        stream_format=StreamFormat.DELTACAT,
        partition_values=["a", 1],
        partition_id="test_partition_id",
        stream_position=1,
    )

    # Entry parameters shared by the manifest metadata and the manifest.
    entry_params = EntryParams.of(
        equality_field_locators=["some_string", "some_int32"],
    )
    meta = ManifestMeta.of(
        record_count=1,
        content_length=10,
        content_type=ContentType.PARQUET.value,
        content_encoding=ContentEncoding.IDENTITY.value,
        source_content_length=100,
        credentials={"foo": "bar"},
        content_type_parameters=[{"param1": "value1"}],
        entry_type=EntryType.EQUALITY_DELETE,
        entry_params=entry_params,
    )

    single_entry = ManifestEntry.of(url="s3://test/url", meta=meta)
    author = ManifestAuthor.of(name="deltacat", version="2.0")
    test_manifest = Manifest.of(
        entries=ManifestEntryList([single_entry]),
        author=author,
        entry_type=EntryType.EQUALITY_DELETE,
        entry_params=entry_params,
    )

    return Delta.of(
        locator=locator,
        delta_type=DeltaType.APPEND,
        meta=meta,
        properties={"property1": "value1"},
        manifest=test_manifest,
        previous_stream_position=0,
    )
|
File without changes
|
@@ -0,0 +1,104 @@
|
|
1
|
+
import pytest
|
2
|
+
import pandas as pd
|
3
|
+
import pyarrow as pa
|
4
|
+
|
5
|
+
from deltacat.types.tables import (
|
6
|
+
to_pandas,
|
7
|
+
to_pyarrow,
|
8
|
+
get_table_length,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
def test_convert_to_pandas_error_cases():
    """Check that to_pandas raises ValueError for unsupported inputs."""
    expected_message = "No pandas conversion function found for table type"
    # Covers None, an arbitrary string, and a plain dict, in that order.
    unsupported_inputs = (None, "invalid_string", {"not": "a_dataframe"})
    for unsupported in unsupported_inputs:
        with pytest.raises(ValueError, match=expected_message):
            to_pandas(unsupported)
|
31
|
+
|
32
|
+
|
33
|
+
def test_convert_to_arrow_error_cases():
    """Check that to_pyarrow raises ValueError for unsupported inputs."""
    expected_message = "No pyarrow conversion function found for table type"
    # Covers None, an arbitrary string, and a plain dict, in that order.
    unsupported_inputs = (None, "invalid_string", {"not": "a_table"})
    for unsupported in unsupported_inputs:
        with pytest.raises(ValueError, match=expected_message):
            to_pyarrow(unsupported)
|
52
|
+
|
53
|
+
|
54
|
+
def test_conversion_functions_with_real_data():
    """Exercise to_pandas and to_pyarrow on a real DataFrame and Arrow table."""
    frame = pd.DataFrame({"id": [1, 2], "name": ["test1", "test2"]})
    table = pa.Table.from_pandas(frame)

    # Same-type conversions must return an equal object of the same type.
    pandas_result = to_pandas(frame)
    assert isinstance(pandas_result, pd.DataFrame)
    assert pandas_result.equals(frame)

    arrow_result = to_pyarrow(table)
    assert isinstance(arrow_result, pa.Table)
    assert arrow_result.equals(table)

    # Cross-type conversions are only checked for the resulting type.
    frame_from_table = to_pandas(table)
    table_from_frame = to_pyarrow(frame)
    assert isinstance(frame_from_table, pd.DataFrame)
    assert isinstance(table_from_frame, pa.Table)
|
75
|
+
|
76
|
+
|
77
|
+
def test_conversion_roundtrip_consistency():
    """Check that a pandas -> pyarrow -> pandas roundtrip preserves the data.

    Only row count, column names, and the sorted "id" values are compared,
    so column dtype changes introduced by the roundtrip are tolerated.
    """
    source_columns = {
        "id": [1, 2, 3, 4, 5],
        "name": ["Alice", "Bob", "Charlie", "Dave", "Eve"],
        "age": [25, 30, 35, 40, 45],
        "city": ["NYC", "LA", "Chicago", "Houston", "Phoenix"],
    }
    original_df = pd.DataFrame(source_columns)

    # pandas -> arrow -> pandas
    roundtrip_df = to_pandas(to_pyarrow(original_df))

    original_length = get_table_length(original_df)
    roundtrip_length = get_table_length(roundtrip_df)
    assert original_length == roundtrip_length, "Row count should be preserved"

    original_columns = list(original_df.columns)
    roundtrip_columns = list(roundtrip_df.columns)
    assert original_columns == roundtrip_columns, "Column names should be preserved"

    # The "id" column is compared order-insensitively.
    original_ids = sorted(original_df["id"].tolist())
    roundtrip_ids = sorted(roundtrip_df["id"].tolist())
    assert original_ids == roundtrip_ids, "ID column should be preserved exactly"
|
@@ -0,0 +1,22 @@
|
|
1
|
+
"""
|
2
|
+
Exception classes for main storage testing that mirror the local storage exceptions.
|
3
|
+
These are used to test the main metastore error categorization functionality.
|
4
|
+
"""
|
5
|
+
|
6
|
+
|
7
|
+
class InvalidNamespaceError(Exception):
    """Signals that main storage was given an invalid namespace."""

    # Class-level name string, identical to the class name.
    error_name = "InvalidNamespaceError"
|
11
|
+
|
12
|
+
|
13
|
+
class MainStorageValidationError(Exception):
    """Signals that a main storage validation check failed."""

    # Class-level name string, identical to the class name.
    error_name = "MainStorageValidationError"
|
17
|
+
|
18
|
+
|
19
|
+
class MainStorageError(Exception):
    """Generic error for main storage operations."""

    # Class-level name string, identical to the class name.
    error_name = "MainStorageError"
|
@@ -0,0 +1,31 @@
|
|
1
|
+
"""
|
2
|
+
Mock module that provides storage-specific error categorization functions for main storage testing.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from deltacat.tests.utils.exceptions import (
|
6
|
+
InvalidNamespaceError,
|
7
|
+
MainStorageValidationError,
|
8
|
+
)
|
9
|
+
|
10
|
+
|
11
|
+
def can_categorize(e: BaseException, **kwargs) -> bool:
    """
    Mock can_categorize for main storage testing.

    Returns True only when the given error is an InvalidNamespaceError;
    every other error yields False. Extra keyword arguments are accepted
    and ignored.
    """
    return isinstance(e, InvalidNamespaceError)
|
20
|
+
|
21
|
+
|
22
|
+
def raise_categorized_error(e: BaseException, **kwargs):
    """
    Mock raise_categorized_error for main storage testing.

    An InvalidNamespaceError is translated into a MainStorageValidationError,
    with the original error attached as its explicit cause. Any other error
    is re-raised unchanged. Extra keyword arguments are accepted and ignored.

    Raises:
        MainStorageValidationError: if ``e`` is an InvalidNamespaceError.
        BaseException: ``e`` itself, when it cannot be categorized.
    """
    if isinstance(e, InvalidNamespaceError):
        # Chain with "from e" so the original error stays on the traceback
        # as the direct cause of the translated one.
        raise MainStorageValidationError("Namespace provided is invalid!") from e
    # If we can't categorize it, re-raise the original exception
    raise e
|