deltacat 1.1.27__tar.gz → 1.1.29__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {deltacat-1.1.27/deltacat.egg-info → deltacat-1.1.29}/PKG-INFO +1 -1
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/__init__.py +1 -1
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/constants.py +15 -1
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/steps/merge.py +30 -5
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/primary_key_index.py +15 -3
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_test_cases.py +32 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +133 -0
- deltacat-1.1.29/deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/test_utils/pyarrow.py +15 -8
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/test_pyarrow.py +278 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/pyarrow.py +162 -31
- {deltacat-1.1.27 → deltacat-1.1.29/deltacat.egg-info}/PKG-INFO +1 -1
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat.egg-info/SOURCES.txt +1 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/LICENSE +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/MANIFEST.in +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/README.md +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/aws/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/aws/clients.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/aws/constants.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/aws/redshift/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/aws/redshift/model/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/aws/redshift/model/manifest.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/aws/s3u.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/benchmarking/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/benchmarking/conftest.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/catalog/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/catalog/default_catalog_impl/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/catalog/delegate.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/catalog/interface.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/catalog/model/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/catalog/model/catalog.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/catalog/model/table_definition.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/compaction_session.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/compactor_version.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/materialize_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/repartition_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/model/table_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/repartition_session.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/steps/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/steps/dedupe.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/steps/materialize.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/steps/repartition.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/utils/io.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/utils/sort_key.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor/utils/system_columns.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/compaction_session.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/delete_file_envelope.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/delete_strategy.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/delete_strategy_equality_delete.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/model.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/deletes/utils.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/merge_file_group.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/merge_input.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/model/merge_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/private/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/private/compaction_utils.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/steps/hash_bucket.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/delta.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/io.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/merge.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/task_options.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/merge_on_read/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/merge_on_read/daft.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/merge_on_read/model/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/merge_on_read/utils/delta.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/resource_estimation/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/resource_estimation/delta.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/resource_estimation/manifest.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/resource_estimation/model.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/resource_estimation/parquet.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/stats/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/stats/models/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/stats/models/delta_stats.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/stats/models/stats_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/stats/types.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/constants.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/exceptions.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/aws/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/aws/redshift/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/dataset.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/file_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/memcached_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/ray_plasma_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/read_api.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/redis_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/io/s3_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/logs.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/interface.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/delete_parameters.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/delta.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/list_result.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/locator.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/namespace.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/partition.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/partition_spec.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/sort_key.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/stream.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/table.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/table_version.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/transform.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/storage/model/types.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/aws/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/aws/test_clients.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/aws/test_s3u.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/catalog/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/catalog/test_default_catalog_impl.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_rebase_test_cases.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor/steps/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/resource_estimation/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/resource_estimation/data/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/resource_estimation/test_delta.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/resource_estimation/test_manifest.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_incremental.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_multiple_rounds.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_rebase.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/test_util_common.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/test_util_constant.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/io/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/io/test_cloudpickle_bug_fix.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/io/test_file_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/io/test_memcached_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/io/test_redis_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/io/test_s3_object_store.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/local_deltacat_storage/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/local_deltacat_storage/exceptions.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/test_exceptions.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/test_logs.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/test_utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/test_utils/constants.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/test_utils/storage.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/test_utils/utils.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/data/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/ray_utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/ray_utils/test_concurrency.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/ray_utils/test_dataset.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/test_cloudpickle.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/test_daft.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/test_metrics.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/test_placement.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/utils/test_resources.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/types/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/types/media.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/types/partial_download.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/types/tables.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/arguments.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/cloudpickle.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/common.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/daft.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/metrics.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/numpy.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/pandas.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/performance.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/placement.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/ray_utils/__init__.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/ray_utils/collections.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/ray_utils/concurrency.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/ray_utils/dataset.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/ray_utils/performance.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/ray_utils/runtime.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/resources.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/s3fs.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat/utils/schema.py +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat.egg-info/dependency_links.txt +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat.egg-info/requires.txt +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/deltacat.egg-info/top_level.txt +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/setup.cfg +0 -0
- {deltacat-1.1.27 → deltacat-1.1.29}/setup.py +0 -0
@@ -1,3 +1,5 @@
|
|
1
|
+
from deltacat.utils.common import env_bool, env_integer
|
2
|
+
|
1
3
|
TOTAL_BYTES_IN_SHA1_HASH = 20
|
2
4
|
|
3
5
|
PK_DELIMITER = "L6kl7u5f"
|
@@ -31,7 +33,9 @@ TOTAL_MEMORY_BUFFER_PERCENTAGE = 30
|
|
31
33
|
# The total size of records that will be hash bucketed at once
|
32
34
|
# Since sorting is O(n log n), we ensure that it is not performed
|
33
35
|
# on a very large dataset for best performance.
|
34
|
-
MAX_SIZE_OF_RECORD_BATCH_IN_GIB = 2 * 1024 * 1024 * 1024
|
36
|
+
MAX_SIZE_OF_RECORD_BATCH_IN_GIB = env_integer(
|
37
|
+
"MAX_SIZE_OF_RECORD_BATCH_IN_GIB", 2 * 1024 * 1024 * 1024
|
38
|
+
)
|
35
39
|
|
36
40
|
# Whether to drop duplicates during merge.
|
37
41
|
DROP_DUPLICATES = True
|
@@ -78,3 +82,13 @@ COMPACT_PARTITION_METRIC_PREFIX = "compact_partition"
|
|
78
82
|
# Number of rounds to run hash/merge for a single
|
79
83
|
# partition. (For large table support)
|
80
84
|
DEFAULT_NUM_ROUNDS = 1
|
85
|
+
|
86
|
+
# Whether to perform sha1 hashing when required to
|
87
|
+
# optimize memory. For example, hashing is always
|
88
|
+
# required for bucketing, whereas it's not mandatory
|
89
|
+
# when dropping duplicates. Setting this to True
|
90
|
+
# will disable sha1 hashing in cases where it isn't
|
91
|
+
# mandatory. This flag is False by default.
|
92
|
+
SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED = env_bool(
|
93
|
+
"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED", False
|
94
|
+
)
|
@@ -7,6 +7,7 @@ import ray
|
|
7
7
|
import itertools
|
8
8
|
import time
|
9
9
|
import pyarrow.compute as pc
|
10
|
+
from deltacat.utils.pyarrow import MAX_INT_BYTES
|
10
11
|
import deltacat.compute.compactor_v2.utils.merge as merge_utils
|
11
12
|
from uuid import uuid4
|
12
13
|
from deltacat import logs
|
@@ -147,10 +148,32 @@ def _merge_tables(
|
|
147
148
|
if compacted_table:
|
148
149
|
compacted_table = all_tables[0]
|
149
150
|
|
151
|
+
compacted_pk_hash_str = compacted_table[sc._PK_HASH_STRING_COLUMN_NAME]
|
152
|
+
incremental_pk_hash_str = incremental_table[sc._PK_HASH_STRING_COLUMN_NAME]
|
153
|
+
|
154
|
+
logger.info(
|
155
|
+
f"Size of compacted pk hash={compacted_pk_hash_str.nbytes} "
|
156
|
+
f"and incremental pk hash={incremental_pk_hash_str.nbytes}."
|
157
|
+
)
|
158
|
+
|
159
|
+
if (
|
160
|
+
compacted_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
|
161
|
+
or incremental_table[sc._PK_HASH_STRING_COLUMN_NAME].nbytes >= MAX_INT_BYTES
|
162
|
+
):
|
163
|
+
logger.info("Casting compacted and incremental pk hash to large_string...")
|
164
|
+
# is_in combines the chunks of the chunked array passed which can cause
|
165
|
+
# ArrowCapacityError if the total size of string array is over 2GB.
|
166
|
+
# Using a large_string would resolve this issue.
|
167
|
+
# The cast here should be zero-copy in most cases.
|
168
|
+
compacted_pk_hash_str = pc.cast(compacted_pk_hash_str, pa.large_string())
|
169
|
+
incremental_pk_hash_str = pc.cast(
|
170
|
+
incremental_pk_hash_str, pa.large_string()
|
171
|
+
)
|
172
|
+
|
150
173
|
records_to_keep = pc.invert(
|
151
174
|
pc.is_in(
|
152
|
-
|
153
|
-
|
175
|
+
compacted_pk_hash_str,
|
176
|
+
incremental_pk_hash_str,
|
154
177
|
)
|
155
178
|
)
|
156
179
|
|
@@ -492,9 +515,11 @@ def _copy_manifests_from_hash_bucketing(
|
|
492
515
|
def _timed_merge(input: MergeInput) -> MergeResult:
|
493
516
|
task_id = get_current_ray_task_id()
|
494
517
|
worker_id = get_current_ray_worker_id()
|
495
|
-
with memray.Tracker(
|
496
|
-
f"merge_{worker_id}_{task_id}.bin"
|
497
|
-
|
518
|
+
with (
|
519
|
+
memray.Tracker(f"merge_{worker_id}_{task_id}.bin")
|
520
|
+
if input.enable_profiler
|
521
|
+
else nullcontext()
|
522
|
+
):
|
498
523
|
total_input_records, total_deduped_records = 0, 0
|
499
524
|
total_dropped_records = 0
|
500
525
|
materialized_results: List[MaterializeResult] = []
|
@@ -25,7 +25,7 @@ def _create_chunked_index_array(array: pa.Array) -> pa.Array:
|
|
25
25
|
result[index] = np.arange(cl, dtype="int32")
|
26
26
|
|
27
27
|
chunk_lengths = ([0] + chunk_lengths)[:-1]
|
28
|
-
result = pa.chunked_array(result + np.cumsum(chunk_lengths))
|
28
|
+
result = pa.chunked_array(result + np.cumsum(chunk_lengths), type=pa.int32())
|
29
29
|
return result
|
30
30
|
|
31
31
|
|
{deltacat-1.1.27 → deltacat-1.1.29}/deltacat/compute/compactor_v2/utils/primary_key_index.py
RENAMED
@@ -10,6 +10,7 @@ from deltacat.compute.compactor_v2.constants import (
|
|
10
10
|
TOTAL_BYTES_IN_SHA1_HASH,
|
11
11
|
PK_DELIMITER,
|
12
12
|
MAX_SIZE_OF_RECORD_BATCH_IN_GIB,
|
13
|
+
SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED,
|
13
14
|
)
|
14
15
|
import time
|
15
16
|
from deltacat.compute.compactor.model.delta_file_envelope import DeltaFileEnvelope
|
@@ -48,6 +49,13 @@ def _is_sha1_desired(hash_columns: List[pa.Array]) -> bool:
|
|
48
49
|
f"Found total length of hash column={total_len} and total_size={total_size}"
|
49
50
|
)
|
50
51
|
|
52
|
+
if SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED:
|
53
|
+
logger.info(
|
54
|
+
f"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED is True. "
|
55
|
+
f"Returning False for is_sha1_desired"
|
56
|
+
)
|
57
|
+
return False
|
58
|
+
|
51
59
|
return total_size > TOTAL_BYTES_IN_SHA1_HASH * total_len
|
52
60
|
|
53
61
|
|
@@ -108,9 +116,10 @@ def _optimized_group_record_batches_by_hash_bucket(
|
|
108
116
|
record_batches = []
|
109
117
|
result_len = 0
|
110
118
|
for record_batch in table_batches:
|
111
|
-
|
112
|
-
|
113
|
-
|
119
|
+
if (
|
120
|
+
record_batches
|
121
|
+
and current_bytes + record_batch.nbytes >= MAX_SIZE_OF_RECORD_BATCH_IN_GIB
|
122
|
+
):
|
114
123
|
logger.info(
|
115
124
|
f"Total number of record batches without exceeding {MAX_SIZE_OF_RECORD_BATCH_IN_GIB} "
|
116
125
|
f"is {len(record_batches)} and size {current_bytes}"
|
@@ -128,6 +137,9 @@ def _optimized_group_record_batches_by_hash_bucket(
|
|
128
137
|
current_bytes = 0
|
129
138
|
record_batches.clear()
|
130
139
|
|
140
|
+
current_bytes += record_batch.nbytes
|
141
|
+
record_batches.append(record_batch)
|
142
|
+
|
131
143
|
if record_batches:
|
132
144
|
appended_len, append_latency = timed_invocation(
|
133
145
|
_append_table_by_hash_bucket,
|
@@ -601,6 +601,38 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
|
|
601
601
|
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
602
602
|
assert_compaction_audit=None,
|
603
603
|
),
|
604
|
+
"15-incremental-empty-input-with-single-hash-bucket": IncrementalCompactionTestCaseParams(
|
605
|
+
primary_keys={"pk_col_1"},
|
606
|
+
sort_keys=[SortKey.of(key_name="sk_col_1")],
|
607
|
+
partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
|
608
|
+
partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
|
609
|
+
input_deltas=pa.Table.from_arrays(
|
610
|
+
[
|
611
|
+
pa.array([]),
|
612
|
+
pa.array([]),
|
613
|
+
],
|
614
|
+
names=["pk_col_1", "sk_col_1"],
|
615
|
+
),
|
616
|
+
input_deltas_delta_type=DeltaType.UPSERT,
|
617
|
+
expected_terminal_compact_partition_result=pa.Table.from_arrays(
|
618
|
+
[
|
619
|
+
pa.array([]),
|
620
|
+
pa.array([]),
|
621
|
+
],
|
622
|
+
names=["pk_col_1", "sk_col_1"],
|
623
|
+
),
|
624
|
+
expected_terminal_exception=None,
|
625
|
+
expected_terminal_exception_message=None,
|
626
|
+
do_create_placement_group=False,
|
627
|
+
records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
|
628
|
+
hash_bucket_count=1,
|
629
|
+
read_kwargs_provider=None,
|
630
|
+
drop_duplicates=True,
|
631
|
+
is_inplace=False,
|
632
|
+
add_late_deltas=None,
|
633
|
+
skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
|
634
|
+
assert_compaction_audit=assert_compaction_audit_no_hash_bucket,
|
635
|
+
),
|
604
636
|
}
|
605
637
|
|
606
638
|
INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
|
{deltacat-1.1.27 → deltacat-1.1.29}/deltacat/tests/compute/compactor_v2/test_compaction_session.py
RENAMED
@@ -1,6 +1,7 @@
|
|
1
1
|
from typing import Dict, Any
|
2
2
|
import ray
|
3
3
|
import os
|
4
|
+
import pyarrow as pa
|
4
5
|
import pytest
|
5
6
|
import boto3
|
6
7
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
@@ -76,6 +77,17 @@ def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
|
|
76
77
|
os.remove(DATABASE_FILE_PATH_VALUE)
|
77
78
|
|
78
79
|
|
80
|
+
@pytest.fixture(scope="function")
|
81
|
+
def disable_sha1(monkeypatch):
|
82
|
+
import deltacat.compute.compactor_v2.utils.primary_key_index
|
83
|
+
|
84
|
+
monkeypatch.setattr(
|
85
|
+
deltacat.compute.compactor_v2.utils.primary_key_index,
|
86
|
+
"SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED",
|
87
|
+
True,
|
88
|
+
)
|
89
|
+
|
90
|
+
|
79
91
|
class TestCompactionSession:
|
80
92
|
"""
|
81
93
|
This class adds specific tests that aren't part of the parametrized test suite.
|
@@ -556,3 +568,124 @@ class TestCompactionSession:
|
|
556
568
|
}
|
557
569
|
)
|
558
570
|
)
|
571
|
+
|
572
|
+
def test_compact_partition_when_incremental_pk_hash_is_over_2gb(
|
573
|
+
self, s3_resource, local_deltacat_storage_kwargs, disable_sha1
|
574
|
+
):
|
575
|
+
"""
|
576
|
+
A test case which ensures the compaction succeeds even if the incremental
|
577
|
+
arrow table size is over 2GB. It is added to prevent ArrowCapacityError
|
578
|
+
when running is_in operation during merge.
|
579
|
+
|
580
|
+
Note that we set SHA1_HASHING_FOR_MEMORY_OPTIMIZATION_DISABLED to bypass sha1 hashing
|
581
|
+
which truncates the lengths of pk strings when deduping.
|
582
|
+
"""
|
583
|
+
# setup
|
584
|
+
staged_source = stage_partition_from_file_paths(
|
585
|
+
self.NAMESPACE, ["source"], **local_deltacat_storage_kwargs
|
586
|
+
)
|
587
|
+
# we create chunked array to avoid ArrowCapacityError
|
588
|
+
chunked_pk_array = pa.chunked_array([["13bytesstring"], ["12bytestring"]])
|
589
|
+
table = pa.table([chunked_pk_array], names=["pk"])
|
590
|
+
source_delta = commit_delta_to_staged_partition(
|
591
|
+
staged_source, pa_table=table, **local_deltacat_storage_kwargs
|
592
|
+
)
|
593
|
+
|
594
|
+
staged_dest = stage_partition_from_file_paths(
|
595
|
+
self.NAMESPACE, ["destination"], **local_deltacat_storage_kwargs
|
596
|
+
)
|
597
|
+
dest_partition = ds.commit_partition(
|
598
|
+
staged_dest, **local_deltacat_storage_kwargs
|
599
|
+
)
|
600
|
+
|
601
|
+
# rebase first
|
602
|
+
rebase_url = compact_partition(
|
603
|
+
CompactPartitionParams.of(
|
604
|
+
{
|
605
|
+
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
606
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
607
|
+
"dd_max_parallelism_ratio": 1.0,
|
608
|
+
"deltacat_storage": ds,
|
609
|
+
"deltacat_storage_kwargs": local_deltacat_storage_kwargs,
|
610
|
+
"destination_partition_locator": dest_partition.locator,
|
611
|
+
"drop_duplicates": True,
|
612
|
+
"hash_bucket_count": 1,
|
613
|
+
"last_stream_position_to_compact": source_delta.stream_position,
|
614
|
+
"list_deltas_kwargs": {
|
615
|
+
**local_deltacat_storage_kwargs,
|
616
|
+
**{"equivalent_table_types": []},
|
617
|
+
},
|
618
|
+
"primary_keys": ["pk"],
|
619
|
+
"rebase_source_partition_locator": source_delta.partition_locator,
|
620
|
+
"rebase_source_partition_high_watermark": source_delta.stream_position,
|
621
|
+
"records_per_compacted_file": 4000,
|
622
|
+
"s3_client_kwargs": {},
|
623
|
+
"source_partition_locator": source_delta.partition_locator,
|
624
|
+
"resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
|
625
|
+
}
|
626
|
+
)
|
627
|
+
)
|
628
|
+
|
629
|
+
rebased_rcf = get_rcf(s3_resource, rebase_url)
|
630
|
+
|
631
|
+
assert rebased_rcf.compacted_pyarrow_write_result.files == 1
|
632
|
+
assert rebased_rcf.compacted_pyarrow_write_result.records == 2
|
633
|
+
|
634
|
+
# Commit a large (over 2GB) incremental delta on source
|
635
|
+
chunked_pk_array = pa.chunked_array(
|
636
|
+
[["13bytesstring" * 95_000_000], ["12bytestring" * 95_000_000]]
|
637
|
+
) # 2.3GB
|
638
|
+
table = pa.table([chunked_pk_array], names=["pk"])
|
639
|
+
|
640
|
+
incremental_source_delta = commit_delta_to_partition(
|
641
|
+
source_delta.partition_locator,
|
642
|
+
pa_table=table,
|
643
|
+
**local_deltacat_storage_kwargs,
|
644
|
+
)
|
645
|
+
assert (
|
646
|
+
incremental_source_delta.partition_locator == source_delta.partition_locator
|
647
|
+
), "source partition locator should not change"
|
648
|
+
dest_partition = ds.get_partition(
|
649
|
+
dest_partition.stream_locator,
|
650
|
+
dest_partition.partition_values,
|
651
|
+
**local_deltacat_storage_kwargs,
|
652
|
+
)
|
653
|
+
|
654
|
+
assert (
|
655
|
+
dest_partition.locator
|
656
|
+
== rebased_rcf.compacted_delta_locator.partition_locator
|
657
|
+
), "The new destination partition should be same as compacted partition"
|
658
|
+
|
659
|
+
# Run incremental
|
660
|
+
incremental_url = compact_partition(
|
661
|
+
CompactPartitionParams.of(
|
662
|
+
{
|
663
|
+
"compaction_artifact_s3_bucket": TEST_S3_RCF_BUCKET_NAME,
|
664
|
+
"compacted_file_content_type": ContentType.PARQUET,
|
665
|
+
"dd_max_parallelism_ratio": 1.0,
|
666
|
+
"deltacat_storage": ds,
|
667
|
+
"deltacat_storage_kwargs": local_deltacat_storage_kwargs,
|
668
|
+
"destination_partition_locator": dest_partition.locator,
|
669
|
+
"drop_duplicates": True,
|
670
|
+
"hash_bucket_count": 1,
|
671
|
+
"last_stream_position_to_compact": incremental_source_delta.stream_position,
|
672
|
+
"list_deltas_kwargs": {
|
673
|
+
**local_deltacat_storage_kwargs,
|
674
|
+
**{"equivalent_table_types": []},
|
675
|
+
},
|
676
|
+
"primary_keys": ["pk"],
|
677
|
+
"records_per_compacted_file": 4000,
|
678
|
+
"s3_client_kwargs": {},
|
679
|
+
"source_partition_locator": incremental_source_delta.partition_locator,
|
680
|
+
"resource_estimation_method": ResourceEstimationMethod.PREVIOUS_INFLATION,
|
681
|
+
}
|
682
|
+
)
|
683
|
+
)
|
684
|
+
|
685
|
+
incremental_rcf = get_rcf(s3_resource, incremental_url)
|
686
|
+
|
687
|
+
assert incremental_rcf.compacted_pyarrow_write_result.files == 1
|
688
|
+
assert (
|
689
|
+
incremental_rcf.compacted_pyarrow_write_result.pyarrow_bytes >= 2300000000
|
690
|
+
)
|
691
|
+
assert incremental_rcf.compacted_pyarrow_write_result.records == 4
|
@@ -0,0 +1,45 @@
|
|
1
|
+
import pyarrow as pa
|
2
|
+
from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
3
|
+
group_by_pk_hash_bucket,
|
4
|
+
)
|
5
|
+
|
6
|
+
|
7
|
+
class TestGroupByPkHashBucket:
    """Tests for ``group_by_pk_hash_bucket`` called with 3 buckets on column ``pk``."""

    def test_sanity(self):
        """Row counts of the non-None groups add up to the input table's row count."""
        pk_values = pa.array(["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"])
        record_values = pa.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        batch = pa.RecordBatch.from_arrays(
            [record_values, pk_values], names=["record", "pk"]
        )
        table = pa.Table.from_batches([batch])

        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])

        assert len(grouped_array) == 3
        # None entries are skipped; each remaining group is measured through
        # the length of its element at index 1.
        total_records = sum(
            len(group[1]) for group in grouped_array if group is not None
        )
        assert total_records == len(table)

    def test_when_record_batches_exceed_int_max_size(self):
        """Two batches whose combined size would pass 2GB stay separate."""
        # A single 12-character string repeated 90 million times (~1.08e9 chars).
        oversized_pk = pa.array(["12bytestring" * 90_000_000])
        batch = pa.RecordBatch.from_arrays([oversized_pk], names=["pk"])
        table = pa.Table.from_batches([batch, batch])

        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])

        assert len(grouped_array) == 3
        # two record batches are preserved as combining them
        # would exceed 2GB.
        batches_in_bucket = grouped_array[2].to_batches()
        assert len(batches_in_bucket) == 2

    def test_when_record_batches_less_than_int_max_size(self):
        """Two small batches are merged into a single record batch."""
        small_pk = pa.array(["12bytestring" * 90_000])
        batch = pa.RecordBatch.from_arrays([small_pk], names=["pk"])
        table = pa.Table.from_batches([batch, batch])

        grouped_array = group_by_pk_hash_bucket(table, 3, ["pk"])

        assert len(grouped_array) == 3
        # Combined the arrays into one record batch as the size
        # would not exceed 2GB.
        batches_in_bucket = grouped_array[1].to_batches()
        assert len(batches_in_bucket) == 1
|
@@ -47,7 +47,8 @@ def stage_partition_from_file_paths(
|
|
47
47
|
|
48
48
|
def commit_delta_to_staged_partition(
|
49
49
|
staged_partition,
|
50
|
-
file_paths: List[str],
|
50
|
+
file_paths: List[str] = None,
|
51
|
+
pa_table: pa.Table = None,
|
51
52
|
content_type: ContentType = ContentType.PARQUET,
|
52
53
|
*args,
|
53
54
|
**kwargs,
|
@@ -57,6 +58,7 @@ def commit_delta_to_staged_partition(
|
|
57
58
|
*args,
|
58
59
|
file_paths=file_paths,
|
59
60
|
content_type=content_type,
|
61
|
+
pa_table=pa_table,
|
60
62
|
**kwargs,
|
61
63
|
)
|
62
64
|
ds.commit_partition(staged_partition, **kwargs)
|
@@ -76,23 +78,28 @@ def download_delta(delta_like: Union[Delta, DeltaLocator], *args, **kwargs) -> D
|
|
76
78
|
|
77
79
|
def commit_delta_to_partition(
    partition: Union[Partition, PartitionLocator],
    file_paths: List[str] = None,
    pa_table: pa.Table = None,
    content_type: ContentType = ContentType.PARQUET,
    *args,
    **kwargs,
) -> Delta:
    """Stage a delta with table data on ``partition`` and commit it.

    Args:
        partition: The target partition, or a locator that is resolved
            through ``ds.get_partition``.
        file_paths: CSV files to read and concatenate; only consulted when
            ``pa_table`` is None.
        pa_table: Table to stage directly. When given, ``file_paths`` is
            ignored.
        content_type: Content type forwarded to ``ds.stage_delta``.
        *args: Extra positional arguments, forwarded to ``ds.get_partition``
            only.
        **kwargs: Forwarded to ``ds.get_partition``, ``ds.stage_delta`` and
            ``ds.commit_delta``.

    Returns:
        The result of ``ds.commit_delta`` for the staged delta.

    Raises:
        AssertionError: If both ``pa_table`` and ``file_paths`` are None.
    """
    if isinstance(partition, PartitionLocator):
        # Swap the locator for the partition it points at.
        partition = ds.get_partition(
            partition.stream_locator, partition.partition_values, *args, **kwargs
        )

    if pa_table is None:
        assert file_paths is not None, "One of pa_table or file_paths must be passed."
        # Build the table from the CSV files, in the order given.
        pa_table = pa.concat_tables(
            [pa.csv.read_csv(file_path) for file_path in file_paths]
        )

    staged_delta = ds.stage_delta(
        pa_table, partition, content_type=content_type, **kwargs
    )

    return ds.commit_delta(staged_delta, **kwargs)
|