deltacat 1.1.26__tar.gz → 1.1.28__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {deltacat-1.1.26/deltacat.egg-info → deltacat-1.1.28}/PKG-INFO +1 -1
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/__init__.py +1 -1
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/private/compaction_utils.py +4 -1
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/resource_estimation/delta.py +1 -1
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/resource_estimation/test_delta.py +37 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_incremental.py +10 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +10 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_rebase.py +11 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +10 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/test_pyarrow.py +255 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/pyarrow.py +156 -27
- {deltacat-1.1.26 → deltacat-1.1.28/deltacat.egg-info}/PKG-INFO +1 -1
- {deltacat-1.1.26 → deltacat-1.1.28}/LICENSE +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/MANIFEST.in +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/README.md +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/aws/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/aws/clients.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/aws/constants.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/aws/redshift/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/aws/redshift/model/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/aws/redshift/model/manifest.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/aws/s3u.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/benchmarking/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/benchmarking/conftest.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/catalog/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/catalog/default_catalog_impl/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/catalog/delegate.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/catalog/interface.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/catalog/model/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/catalog/model/catalog.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/catalog/model/table_definition.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/compaction_session.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/compactor_version.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/materialize_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/repartition_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/model/table_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/repartition_session.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/steps/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/steps/dedupe.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/steps/materialize.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/steps/repartition.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/utils/io.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/utils/sort_key.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor/utils/system_columns.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/compaction_session.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/constants.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/deletes/utils.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/private/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/steps/merge.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/utils/dedupe.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/utils/io.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/utils/merge.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/utils/primary_key_index.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/utils/task_options.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/merge_on_read/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/merge_on_read/daft.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/merge_on_read/model/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/merge_on_read/utils/delta.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/resource_estimation/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/resource_estimation/manifest.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/resource_estimation/model.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/resource_estimation/parquet.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/stats/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/stats/models/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/stats/models/delta_stats.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/stats/models/stats_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/stats/types.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/constants.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/exceptions.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/aws/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/aws/redshift/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/dataset.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/file_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/memcached_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/ray_plasma_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/read_api.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/redis_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/io/s3_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/logs.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/interface.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/delete_parameters.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/delta.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/list_result.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/locator.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/namespace.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/partition.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/partition_spec.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/sort_key.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/stream.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/table.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/table_version.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/transform.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/storage/model/types.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/aws/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/aws/test_clients.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/aws/test_s3u.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/catalog/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/catalog/test_default_catalog_impl.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compact_partition_test_cases.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/resource_estimation/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/resource_estimation/data/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_util_common.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_util_constant.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/io/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/io/test_cloudpickle_bug_fix.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/io/test_file_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/io/test_memcached_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/io/test_redis_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/io/test_s3_object_store.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/local_deltacat_storage/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/local_deltacat_storage/exceptions.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/test_exceptions.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/test_logs.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/test_utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/test_utils/constants.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/test_utils/pyarrow.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/test_utils/storage.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/test_utils/utils.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/data/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/ray_utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/test_cloudpickle.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/test_daft.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/test_metrics.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/test_placement.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/test_resources.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/types/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/types/media.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/types/partial_download.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/types/tables.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/arguments.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/cloudpickle.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/common.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/daft.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/metrics.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/numpy.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/pandas.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/performance.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/placement.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/ray_utils/__init__.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/ray_utils/collections.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/ray_utils/concurrency.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/ray_utils/dataset.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/ray_utils/performance.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/ray_utils/runtime.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/resources.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/s3fs.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat/utils/schema.py +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat.egg-info/SOURCES.txt +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat.egg-info/dependency_links.txt +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat.egg-info/requires.txt +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/deltacat.egg-info/top_level.txt +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/setup.cfg +0 -0
- {deltacat-1.1.26 → deltacat-1.1.28}/setup.py +0 -0
{deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/compactor_v2/private/compaction_utils.py
RENAMED
@@ -584,8 +584,11 @@ def _process_merge_results(
|
|
584
584
|
f"Duplicate record count ({duplicate_hash_bucket_mat_results}) is as large "
|
585
585
|
f"as or greater than params.num_rounds, which is {params.num_rounds}"
|
586
586
|
)
|
587
|
+
# ensure start index is the first file index if task index is same
|
587
588
|
hb_id_to_entry_indices_range[str(mat_result.task_index)] = (
|
588
|
-
file_index
|
589
|
+
hb_id_to_entry_indices_range.get(str(mat_result.task_index), [file_index])[
|
590
|
+
0
|
591
|
+
],
|
589
592
|
file_index + mat_result.pyarrow_write_result.files,
|
590
593
|
)
|
591
594
|
|
{deltacat-1.1.26 → deltacat-1.1.28}/deltacat/compute/resource_estimation/delta.py
RENAMED
@@ -188,7 +188,7 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
|
|
188
188
|
sampled_on_disk_size += delta.manifest.entries[entry_index].meta.content_length
|
189
189
|
sampled_num_rows += len(tbl)
|
190
190
|
|
191
|
-
if not sampled_on_disk_size:
|
191
|
+
if not sampled_on_disk_size or not sampled_in_memory_size:
|
192
192
|
return EstimatedResources.of(
|
193
193
|
memory_bytes=0,
|
194
194
|
statistics=Statistics.of(
|
{deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/resource_estimation/test_delta.py
RENAMED
@@ -437,6 +437,43 @@ class TestEstimateResourcesRequiredToProcessDelta:
|
|
437
437
|
== parquet_delta_with_manifest.meta.content_length
|
438
438
|
)
|
439
439
|
|
440
|
+
def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
|
441
|
+
self,
|
442
|
+
local_deltacat_storage_kwargs,
|
443
|
+
parquet_delta_with_manifest: Delta,
|
444
|
+
monkeypatch,
|
445
|
+
):
|
446
|
+
params = EstimateResourcesParams.of(
|
447
|
+
resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
|
448
|
+
max_files_to_sample=2,
|
449
|
+
)
|
450
|
+
|
451
|
+
def mock_func(*args, **kwargs):
|
452
|
+
class MockedValue:
|
453
|
+
nbytes = 0
|
454
|
+
|
455
|
+
def __len__(self):
|
456
|
+
return 0
|
457
|
+
|
458
|
+
return MockedValue()
|
459
|
+
|
460
|
+
monkeypatch.setattr(ds, "download_delta_manifest_entry", mock_func)
|
461
|
+
|
462
|
+
result = estimate_resources_required_to_process_delta(
|
463
|
+
delta=parquet_delta_with_manifest,
|
464
|
+
operation_type=OperationType.PYARROW_DOWNLOAD,
|
465
|
+
deltacat_storage=ds,
|
466
|
+
deltacat_storage_kwargs=local_deltacat_storage_kwargs,
|
467
|
+
estimate_resources_params=params,
|
468
|
+
)
|
469
|
+
|
470
|
+
assert parquet_delta_with_manifest.manifest is not None
|
471
|
+
assert result.memory_bytes == 0
|
472
|
+
assert (
|
473
|
+
result.statistics.on_disk_size_bytes
|
474
|
+
== parquet_delta_with_manifest.meta.content_length
|
475
|
+
)
|
476
|
+
|
440
477
|
def test_delta_manifest_utsv_when_file_sampling(
|
441
478
|
self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
|
442
479
|
):
|
{deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_incremental.py
RENAMED
@@ -328,6 +328,16 @@ def test_compact_partition_incremental(
|
|
328
328
|
**compaction_audit_obj
|
329
329
|
)
|
330
330
|
|
331
|
+
# assert if RCF covers all files
|
332
|
+
if compactor_version != CompactorVersion.V1.value:
|
333
|
+
previous_end = None
|
334
|
+
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
335
|
+
assert (previous_end is None and start == 0) or start == previous_end
|
336
|
+
previous_end = end
|
337
|
+
assert (
|
338
|
+
previous_end == round_completion_info.compacted_pyarrow_write_result.files
|
339
|
+
)
|
340
|
+
|
331
341
|
tables = ds.download_delta(
|
332
342
|
compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
|
333
343
|
)
|
{deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py
RENAMED
@@ -309,6 +309,16 @@ def test_compact_partition_rebase_multiple_rounds_same_source_and_destination(
|
|
309
309
|
**compaction_audit_obj
|
310
310
|
)
|
311
311
|
|
312
|
+
# assert if RCF covers all files
|
313
|
+
# multiple rounds feature is only supported in V2 compactor
|
314
|
+
previous_end = None
|
315
|
+
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
316
|
+
assert (previous_end is None and start == 0) or start == previous_end
|
317
|
+
previous_end = end
|
318
|
+
assert (
|
319
|
+
previous_end == round_completion_info.compacted_pyarrow_write_result.files
|
320
|
+
)
|
321
|
+
|
312
322
|
# Assert not in-place compacted
|
313
323
|
assert (
|
314
324
|
execute_compaction_result_spy.call_args.args[-1] is False
|
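{deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_rebase.py
RENAMED
@@ -299,6 +299,17 @@ def test_compact_partition_rebase_same_source_and_destination(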
|
|
299
299
|
round_completion_info.compaction_audit_url
|
300
300
|
)
|
301
301
|
|
302
|
+
# assert if RCF covers all files
|
303
|
+
if compactor_version != CompactorVersion.V1.value:
|
304
|
+
previous_end = None
|
305
|
+
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
306
|
+
assert (previous_end is None and start == 0) or start == previous_end
|
307
|
+
previous_end = end
|
308
|
+
assert (
|
309
|
+
previous_end
|
310
|
+
== round_completion_info.compacted_pyarrow_write_result.files
|
311
|
+
)
|
312
|
+
|
302
313
|
compaction_audit_obj: Dict[str, Any] = read_s3_contents(
|
303
314
|
s3_resource, audit_bucket, audit_key
|
304
315
|
)
|
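{deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py
RENAMED
@@ -355,6 +355,16 @@ def test_compact_partition_rebase_then_incremental(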
|
|
355
355
|
compacted_delta_locator_incremental: DeltaLocator = (
|
356
356
|
round_completion_info.compacted_delta_locator
|
357
357
|
)
|
358
|
+
# assert if RCF covers all files
|
359
|
+
if compactor_version != CompactorVersion.V1.value:
|
360
|
+
previous_end = None
|
361
|
+
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
362
|
+
assert (previous_end is None and start == 0) or start == previous_end
|
363
|
+
previous_end = end
|
364
|
+
assert (
|
365
|
+
previous_end == round_completion_info.compacted_pyarrow_write_result.files
|
366
|
+
)
|
367
|
+
|
358
368
|
audit_bucket, audit_key = round_completion_info.compaction_audit_url.replace(
|
359
369
|
"s3://", ""
|
360
370
|
).split("/", 1)
|
{deltacat-1.1.26 → deltacat-1.1.28}/deltacat/tests/utils/test_pyarrow.py
RENAMED
@@ -7,7 +7,9 @@ from deltacat.utils.pyarrow import (
|
|
7
7
|
s3_file_to_table,
|
8
8
|
ReadKwargsProviderPyArrowSchemaOverride,
|
9
9
|
RAISE_ON_EMPTY_CSV_KWARG,
|
10
|
+
RAISE_ON_DECIMAL_OVERFLOW,
|
10
11
|
)
|
12
|
+
import decimal
|
11
13
|
from deltacat.types.media import ContentEncoding, ContentType
|
12
14
|
from deltacat.types.partial_download import PartialParquetParameters
|
13
15
|
from pyarrow.parquet import ParquetFile
|
@@ -16,6 +18,12 @@ import pyarrow as pa
|
|
16
18
|
PARQUET_FILE_PATH = "deltacat/tests/utils/data/test_file.parquet"
|
17
19
|
EMPTY_UTSV_PATH = "deltacat/tests/utils/data/empty.csv"
|
18
20
|
NON_EMPTY_VALID_UTSV_PATH = "deltacat/tests/utils/data/non_empty_valid.csv"
|
21
|
+
OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH = (
|
22
|
+
"deltacat/tests/utils/data/overflowing_decimal_precision.csv"
|
23
|
+
)
|
24
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH = (
|
25
|
+
"deltacat/tests/utils/data/overflowing_decimal_scale.csv"
|
26
|
+
)
|
19
27
|
GZIP_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.gz"
|
20
28
|
BZ2_COMPRESSED_FILE_UTSV_PATH = "deltacat/tests/utils/data/non_empty_compressed.bz2"
|
21
29
|
|
@@ -407,6 +415,253 @@ class TestReadCSV(TestCase):
|
|
407
415
|
),
|
408
416
|
)
|
409
417
|
|
418
|
+
def test_read_csv_when_decimal_precision_overflows_and_raise_kwarg_specified(self):
|
419
|
+
schema = pa.schema(
|
420
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
|
421
|
+
)
|
422
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
423
|
+
_add_column_kwargs(
|
424
|
+
ContentType.UNESCAPED_TSV.value,
|
425
|
+
["is_active", "decimal_value"],
|
426
|
+
["is_active", "decimal_value"],
|
427
|
+
kwargs,
|
428
|
+
)
|
429
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
430
|
+
|
431
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
432
|
+
self.assertRaises(
|
433
|
+
pa.lib.ArrowInvalid,
|
434
|
+
lambda: pyarrow_read_csv(
|
435
|
+
OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH,
|
436
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
|
437
|
+
),
|
438
|
+
)
|
439
|
+
|
440
|
+
def test_read_csv_when_decimal_precision_overflows_sanity(self):
|
441
|
+
schema = pa.schema(
|
442
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(4, 2))]
|
443
|
+
)
|
444
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
445
|
+
_add_column_kwargs(
|
446
|
+
ContentType.UNESCAPED_TSV.value,
|
447
|
+
["is_active", "decimal_value"],
|
448
|
+
["is_active", "decimal_value"],
|
449
|
+
kwargs,
|
450
|
+
)
|
451
|
+
|
452
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
453
|
+
|
454
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
455
|
+
|
456
|
+
self.assertRaises(
|
457
|
+
pa.lib.ArrowInvalid,
|
458
|
+
lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_PRECISION_UTSV_PATH, **kwargs),
|
459
|
+
)
|
460
|
+
|
461
|
+
def test_read_csv_when_decimal_scale_overflows_and_raise_kwarg_specified(self):
|
462
|
+
schema = pa.schema(
|
463
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
|
464
|
+
)
|
465
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
466
|
+
_add_column_kwargs(
|
467
|
+
ContentType.UNESCAPED_TSV.value,
|
468
|
+
["is_active", "decimal_value"],
|
469
|
+
["is_active", "decimal_value"],
|
470
|
+
kwargs,
|
471
|
+
)
|
472
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
473
|
+
|
474
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
475
|
+
|
476
|
+
self.assertRaises(
|
477
|
+
pa.lib.ArrowInvalid,
|
478
|
+
lambda: pyarrow_read_csv(
|
479
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
|
480
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
|
481
|
+
),
|
482
|
+
)
|
483
|
+
|
484
|
+
def test_read_csv_when_decimal_scale_overflows_sanity(self):
|
485
|
+
schema = pa.schema(
|
486
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, 2))]
|
487
|
+
)
|
488
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
489
|
+
_add_column_kwargs(
|
490
|
+
ContentType.UNESCAPED_TSV.value,
|
491
|
+
["is_active", "decimal_value"],
|
492
|
+
["is_active", "decimal_value"],
|
493
|
+
kwargs,
|
494
|
+
)
|
495
|
+
|
496
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
497
|
+
|
498
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
499
|
+
|
500
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
501
|
+
|
502
|
+
self.assertEqual(len(result), 3)
|
503
|
+
self.assertEqual(
|
504
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
505
|
+
) # rounding decimal
|
506
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
|
507
|
+
self.assertEqual(len(result.column_names), 2)
|
508
|
+
result_schema = result.schema
|
509
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
510
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(20, 2))
|
511
|
+
|
512
|
+
def test_read_csv_when_decimal_scale_overflows_and_negative_scale(self):
|
513
|
+
schema = pa.schema(
|
514
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(20, -2))]
|
515
|
+
)
|
516
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
517
|
+
_add_column_kwargs(
|
518
|
+
ContentType.UNESCAPED_TSV.value,
|
519
|
+
["is_active", "decimal_value"],
|
520
|
+
["is_active", "decimal_value"],
|
521
|
+
kwargs,
|
522
|
+
)
|
523
|
+
|
524
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
525
|
+
|
526
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
527
|
+
|
528
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
529
|
+
|
530
|
+
self.assertEqual(len(result), 3)
|
531
|
+
self.assertEqual(
|
532
|
+
result[1][0].as_py(),
|
533
|
+
decimal.Decimal("322200"), # consequence of negative scale
|
534
|
+
) # rounding decimal
|
535
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("00"))
|
536
|
+
self.assertEqual(len(result.column_names), 2)
|
537
|
+
result_schema = result.schema
|
538
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
539
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(20, -2))
|
540
|
+
|
541
|
+
def test_read_csv_when_decimal_scale_overflows_with_decimal256(self):
|
542
|
+
schema = pa.schema(
|
543
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
|
544
|
+
)
|
545
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
546
|
+
_add_column_kwargs(
|
547
|
+
ContentType.UNESCAPED_TSV.value,
|
548
|
+
["is_active", "decimal_value"],
|
549
|
+
["is_active", "decimal_value"],
|
550
|
+
kwargs,
|
551
|
+
)
|
552
|
+
|
553
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
554
|
+
|
555
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
556
|
+
|
557
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
558
|
+
|
559
|
+
self.assertEqual(len(result), 3)
|
560
|
+
self.assertEqual(
|
561
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
562
|
+
) # rounding decimal
|
563
|
+
self.assertEqual(result[1][1].as_py(), decimal.Decimal("32.33")) # not rounded
|
564
|
+
self.assertEqual(len(result.column_names), 2)
|
565
|
+
result_schema = result.schema
|
566
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
567
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal256(20, 2))
|
568
|
+
|
569
|
+
def test_read_csv_when_decimal_scale_overflows_with_decimal256_and_raise_on_overflow(
|
570
|
+
self,
|
571
|
+
):
|
572
|
+
schema = pa.schema(
|
573
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal256(20, 2))]
|
574
|
+
)
|
575
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
576
|
+
_add_column_kwargs(
|
577
|
+
ContentType.UNESCAPED_TSV.value,
|
578
|
+
["is_active", "decimal_value"],
|
579
|
+
["is_active", "decimal_value"],
|
580
|
+
kwargs,
|
581
|
+
)
|
582
|
+
|
583
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
584
|
+
|
585
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
586
|
+
|
587
|
+
self.assertRaises(
|
588
|
+
pa.lib.ArrowNotImplementedError,
|
589
|
+
lambda: pyarrow_read_csv(
|
590
|
+
OVERFLOWING_DECIMAL_SCALE_UTSV_PATH,
|
591
|
+
**{**kwargs, RAISE_ON_DECIMAL_OVERFLOW: True}
|
592
|
+
),
|
593
|
+
)
|
594
|
+
|
595
|
+
def test_read_csv_when_decimal_scale_overflows_without_any_schema_then_infers(self):
|
596
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
597
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=None)
|
598
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
599
|
+
|
600
|
+
result = pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs)
|
601
|
+
|
602
|
+
# The default behavior of pyarrow is to skip invalid rows
|
603
|
+
self.assertEqual(len(result), 2)
|
604
|
+
self.assertEqual(result[1][0].as_py(), 32.33) # rounding decimal
|
605
|
+
self.assertEqual(result[1][1].as_py(), 0.4) # not rounded
|
606
|
+
self.assertEqual(len(result.column_names), 2)
|
607
|
+
result_schema = result.schema
|
608
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
609
|
+
self.assertEqual(result_schema.field(1).type, pa.float64())
|
610
|
+
|
611
|
+
def test_read_csv_when_decimal_scale_and_precision_overflow_and_raise_on_overflow(
|
612
|
+
self,
|
613
|
+
):
|
614
|
+
schema = pa.schema(
|
615
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(5, 2))]
|
616
|
+
)
|
617
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
618
|
+
_add_column_kwargs(
|
619
|
+
ContentType.UNESCAPED_TSV.value,
|
620
|
+
["is_active", "decimal_value"],
|
621
|
+
["is_active", "decimal_value"],
|
622
|
+
kwargs,
|
623
|
+
)
|
624
|
+
|
625
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
626
|
+
|
627
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
628
|
+
|
629
|
+
self.assertRaises(
|
630
|
+
pa.lib.ArrowInvalid,
|
631
|
+
lambda: pyarrow_read_csv(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, **kwargs),
|
632
|
+
)
|
633
|
+
|
634
|
+
def test_read_csv_when_decimal_scale_overflow_and_file_like_obj_passed(self):
|
635
|
+
schema = pa.schema(
|
636
|
+
[("is_active", pa.string()), ("decimal_value", pa.decimal128(15, 2))]
|
637
|
+
)
|
638
|
+
kwargs = content_type_to_reader_kwargs(ContentType.UNESCAPED_TSV.value)
|
639
|
+
_add_column_kwargs(
|
640
|
+
ContentType.UNESCAPED_TSV.value,
|
641
|
+
["is_active", "decimal_value"],
|
642
|
+
["is_active", "decimal_value"],
|
643
|
+
kwargs,
|
644
|
+
)
|
645
|
+
|
646
|
+
read_kwargs_provider = ReadKwargsProviderPyArrowSchemaOverride(schema=schema)
|
647
|
+
|
648
|
+
kwargs = read_kwargs_provider(ContentType.UNESCAPED_TSV.value, kwargs)
|
649
|
+
|
650
|
+
with open(OVERFLOWING_DECIMAL_SCALE_UTSV_PATH, "rb") as file:
|
651
|
+
result = pyarrow_read_csv(file, **kwargs)
|
652
|
+
|
653
|
+
self.assertEqual(len(result), 3)
|
654
|
+
self.assertEqual(
|
655
|
+
result[1][0].as_py(), decimal.Decimal("322236.66")
|
656
|
+
) # rounding decimal
|
657
|
+
self.assertEqual(
|
658
|
+
result[1][1].as_py(), decimal.Decimal("32.33")
|
659
|
+
) # not rounded
|
660
|
+
self.assertEqual(len(result.column_names), 2)
|
661
|
+
result_schema = result.schema
|
662
|
+
self.assertEqual(result_schema.field(0).type, "string")
|
663
|
+
self.assertEqual(result_schema.field(1).type, pa.decimal128(15, 2))
|
664
|
+
|
410
665
|
|
411
666
|
class TestS3FileToTable(TestCase):
|
412
667
|
def test_s3_file_to_table_identity_sanity(self):
|
@@ -1,6 +1,7 @@
|
|
1
1
|
# Allow classes to use self-referencing Type hints in Python 3.7.
|
2
2
|
from __future__ import annotations
|
3
3
|
|
4
|
+
import copy
|
4
5
|
import bz2
|
5
6
|
import gzip
|
6
7
|
import io
|
@@ -47,6 +48,17 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
47
48
|
RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
|
48
49
|
READER_TYPE_KWARG = "reader_type"
|
49
50
|
|
51
|
+
"""
|
52
|
+
By default, round decimal values using half_to_even round mode when
|
53
|
+
rescaling a decimal to the given scale and precision in the schema would cause
|
54
|
+
data loss. Setting any truthy value for this argument will result
|
55
|
+
in an error instead.
|
56
|
+
"""
|
57
|
+
RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
|
58
|
+
# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
|
59
|
+
DECIMAL256_DEFAULT_SCALE = 38
|
60
|
+
DECIMAL256_MAX_PRECISION = 76
|
61
|
+
|
50
62
|
|
51
63
|
def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
|
52
64
|
|
@@ -64,45 +76,162 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
|
|
64
76
|
return target_schema
|
65
77
|
|
66
78
|
|
67
|
-
def
|
68
|
-
|
69
|
-
|
70
|
-
|
79
|
+
def _extract_arrow_schema_from_read_csv_kwargs(kwargs: Dict[str, Any]) -> pa.Schema:
|
80
|
+
schema = None
|
81
|
+
if (
|
82
|
+
"convert_options" in kwargs
|
83
|
+
and kwargs["convert_options"].column_types is not None
|
84
|
+
):
|
85
|
+
schema = kwargs["convert_options"].column_types
|
86
|
+
if not isinstance(schema, pa.Schema):
|
87
|
+
schema = pa.schema(schema)
|
88
|
+
if kwargs["convert_options"].include_columns:
|
89
|
+
schema = _filter_schema_for_columns(
|
90
|
+
schema, kwargs["convert_options"].include_columns
|
91
|
+
)
|
92
|
+
elif (
|
93
|
+
kwargs.get("read_options") is not None
|
94
|
+
and kwargs["read_options"].column_names
|
95
|
+
):
|
96
|
+
schema = _filter_schema_for_columns(
|
97
|
+
schema, kwargs["read_options"].column_names
|
98
|
+
)
|
99
|
+
else:
|
100
|
+
logger.debug(
|
101
|
+
"Schema not specified in the kwargs."
|
102
|
+
" Hence, schema could not be inferred from the empty CSV."
|
71
103
|
)
|
104
|
+
|
105
|
+
return schema
|
106
|
+
|
107
|
+
|
108
|
+
def _new_schema_with_replaced_fields(
|
109
|
+
schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
|
110
|
+
) -> pa.Schema:
|
111
|
+
if schema is None:
|
112
|
+
return None
|
113
|
+
|
114
|
+
new_schema_fields = []
|
115
|
+
for field in schema:
|
116
|
+
new_field = field_to_replace(field)
|
117
|
+
if new_field is not None:
|
118
|
+
new_schema_fields.append(new_field)
|
119
|
+
else:
|
120
|
+
new_schema_fields.append(field)
|
121
|
+
|
122
|
+
return pa.schema(new_schema_fields, metadata=schema.metadata)
|
123
|
+
|
124
|
+
|
125
|
+
def _read_csv_rounding_decimal_columns_to_fit_scale(
|
126
|
+
schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
|
127
|
+
) -> pa.Table:
|
128
|
+
# Note: We read decimals as strings first because CSV
|
129
|
+
# conversion to decimal256 isn't implemented as of pyarrow==12.0.1
|
130
|
+
new_schema = _new_schema_with_replaced_fields(
|
131
|
+
schema,
|
132
|
+
lambda fld: pa.field(fld.name, pa.string(), metadata=fld.metadata)
|
133
|
+
if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
|
134
|
+
else None,
|
135
|
+
)
|
136
|
+
new_kwargs = sanitize_kwargs_by_supported_kwargs(
|
137
|
+
["read_options", "parse_options", "convert_options", "memory_pool"],
|
138
|
+
reader_kwargs,
|
139
|
+
)
|
140
|
+
# Creating a shallow copy for efficiency
|
141
|
+
new_convert_options = copy.copy(new_kwargs["convert_options"])
|
142
|
+
new_convert_options.column_types = new_schema
|
143
|
+
new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
|
144
|
+
arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
|
145
|
+
|
146
|
+
for column_index, field in enumerate(schema):
|
147
|
+
if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
|
148
|
+
column_array = arrow_table[field.name]
|
149
|
+
# We always cast to decimal256 to accommodate the fixed scale of 38
|
150
|
+
cast_to_type = pa.decimal256(
|
151
|
+
DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
|
152
|
+
)
|
153
|
+
casted_decimal_array = pc.cast(column_array, cast_to_type)
|
154
|
+
# Note that scale can be negative
|
155
|
+
rounded_column_array = pc.round(
|
156
|
+
casted_decimal_array, ndigits=field.type.scale
|
157
|
+
)
|
158
|
+
final_decimal_array = pc.cast(rounded_column_array, field.type)
|
159
|
+
arrow_table = arrow_table.set_column(
|
160
|
+
column_index,
|
161
|
+
field,
|
162
|
+
final_decimal_array,
|
163
|
+
)
|
164
|
+
logger.debug(
|
165
|
+
f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
|
166
|
+
f" {field.type.precision} precision"
|
167
|
+
)
|
168
|
+
|
169
|
+
return arrow_table
|
170
|
+
|
171
|
+
|
172
|
+
def pyarrow_read_csv_default(*args, **kwargs):
|
173
|
+
new_kwargs = sanitize_kwargs_by_supported_kwargs(
|
174
|
+
["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
|
175
|
+
)
|
176
|
+
|
177
|
+
try:
|
72
178
|
return pacsv.read_csv(*args, **new_kwargs)
|
73
179
|
except pa.lib.ArrowInvalid as e:
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
)
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
180
|
+
error_str = e.__str__()
|
181
|
+
schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
|
182
|
+
|
183
|
+
if error_str == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
|
184
|
+
logger.debug(f"Read CSV empty schema being used: {schema}")
|
185
|
+
return pa.Table.from_pylist([], schema=schema)
|
186
|
+
if not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
|
187
|
+
# Note, this logic requires expensive casting. To prevent degrading performance
|
188
|
+
# for happy path reads, we are handling this case in response to an error.
|
189
|
+
logger.warning(
|
190
|
+
"Rescaling Decimal to the given scale in the schema. "
|
191
|
+
f"Original error: {error_str}"
|
192
|
+
)
|
193
|
+
|
194
|
+
if schema is not None and "convert_options" in kwargs:
|
195
|
+
if (
|
196
|
+
"Rescaling Decimal" in error_str
|
197
|
+
and "value would cause data loss" in error_str
|
90
198
|
):
|
91
|
-
|
92
|
-
|
199
|
+
logger.debug(f"Checking if the file: {args[0]}...")
|
200
|
+
# Since we are re-reading the file, we have to seek to beginning
|
201
|
+
if isinstance(args[0], io.IOBase) and args[0].seekable():
|
202
|
+
logger.debug(f"Seeking to the beginning of the file {args[0]}")
|
203
|
+
args[0].seek(0)
|
204
|
+
return _read_csv_rounding_decimal_columns_to_fit_scale(
|
205
|
+
schema=schema, reader_args=args, reader_kwargs=kwargs
|
93
206
|
)
|
94
|
-
|
95
207
|
else:
|
96
208
|
logger.debug(
|
97
|
-
"Schema
|
98
|
-
"
|
209
|
+
"Schema is None when trying to adjust decimal values. "
|
210
|
+
"Hence, bubbling up exception..."
|
99
211
|
)
|
100
212
|
|
101
|
-
logger.debug(f"Read CSV empty schema being used: {schema}")
|
102
|
-
return pa.Table.from_pylist([], schema=schema)
|
103
213
|
raise e
|
104
214
|
|
105
215
|
|
216
|
+
def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
|
217
|
+
schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
|
218
|
+
|
219
|
+
# CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
|
220
|
+
# Below ensures decimal256 is casted properly.
|
221
|
+
schema_includes_decimal256 = (
|
222
|
+
(True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
|
223
|
+
if schema is not None
|
224
|
+
else None
|
225
|
+
)
|
226
|
+
if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
|
227
|
+
# falling back to expensive method of reading CSV
|
228
|
+
return _read_csv_rounding_decimal_columns_to_fit_scale(
|
229
|
+
schema, reader_args=args, reader_kwargs=kwargs
|
230
|
+
)
|
231
|
+
else:
|
232
|
+
return pyarrow_read_csv_default(*args, **kwargs)
|
233
|
+
|
234
|
+
|
106
235
|
CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
|
107
236
|
ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
|
108
237
|
ContentType.TSV.value: pyarrow_read_csv,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|