deltacat 0.2.10__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deltacat-0.2.10 → deltacat-1.0.0}/PKG-INFO +1 -1
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/__init__.py +1 -1
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/aws/s3u.py +250 -111
- deltacat-1.0.0/deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/compaction_session.py +175 -152
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat-1.0.0/deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat-1.0.0/deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat-1.0.0/deltacat/compute/compactor_v2/utils/merge.py +126 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/utils/task_options.py +16 -4
- deltacat-1.0.0/deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat-1.0.0/deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat-1.0.0/deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat-1.0.0/deltacat/compute/merge_on_read/utils/delta.py +42 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/interface.py +10 -2
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/types.py +3 -11
- deltacat-1.0.0/deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat-1.0.0/deltacat/tests/test_utils/pyarrow.py +68 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/utils/test_daft.py +42 -2
- deltacat-1.0.0/deltacat/types/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/types/media.py +5 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/types/tables.py +7 -1
- deltacat-1.0.0/deltacat/utils/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/daft.py +78 -13
- deltacat-1.0.0/deltacat/utils/ray_utils/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat.egg-info/PKG-INFO +1 -1
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat.egg-info/SOURCES.txt +12 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat.egg-info/requires.txt +1 -1
- {deltacat-0.2.10 → deltacat-1.0.0}/setup.py +1 -1
- deltacat-0.2.10/deltacat/tests/test_utils/pyarrow.py +0 -49
- {deltacat-0.2.10 → deltacat-1.0.0}/MANIFEST.in +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/README.md +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/aws/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/aws/clients.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/aws/constants.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/aws/redshift/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/aws/redshift/model/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/aws/redshift/model/manifest.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/benchmarking/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/benchmarking/benchmark_parquet_reads.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/benchmarking/conftest.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/catalog/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/catalog/delegate.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/catalog/interface.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/catalog/model/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/catalog/model/catalog.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/catalog/model/table_definition.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/compaction_session.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/compact_partition_params.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/compaction_session_audit_info.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/compactor_version.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/delta_annotated.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/delta_file_envelope.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/materialize_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/primary_key_index.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/repartition_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/model/round_completion_info.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/repartition_session.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/steps/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/steps/dedupe.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/steps/hash_bucket.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/steps/materialize.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/steps/repartition.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/utils/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/utils/io.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/utils/primary_key_index.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/utils/sort_key.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor/utils/system_columns.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/constants.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/model/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/model/hash_bucket_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/steps/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/utils/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/utils/content_type_params.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/utils/dedupe.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/utils/io.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/compactor_v2/utils/primary_key_index.py +0 -0
- {deltacat-0.2.10/deltacat/compute/metastats → deltacat-1.0.0/deltacat/compute/merge_on_read/model}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/compute/metastats/config → deltacat-1.0.0/deltacat/compute/merge_on_read/utils}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/compute/metastats/model → deltacat-1.0.0/deltacat/compute/metastats}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/compute/metastats/utils → deltacat-1.0.0/deltacat/compute/metastats/config}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/metastats/meta_stats.py +0 -0
- {deltacat-0.2.10/deltacat/compute/stats → deltacat-1.0.0/deltacat/compute/metastats/model}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/metastats/stats.py +0 -0
- {deltacat-0.2.10/deltacat/compute/stats/models → deltacat-1.0.0/deltacat/compute/metastats/utils}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/metastats/utils/constants.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/metastats/utils/io.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
- {deltacat-0.2.10/deltacat/compute/stats/utils → deltacat-1.0.0/deltacat/compute/stats}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/basic.py +0 -0
- {deltacat-0.2.10/deltacat/io → deltacat-1.0.0/deltacat/compute/stats/models}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/models/delta_stats.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/models/stats_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/types.py +0 -0
- {deltacat-0.2.10/deltacat/io/aws → deltacat-1.0.0/deltacat/compute/stats/utils}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/utils/intervals.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/utils/io.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/constants.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/exceptions.py +0 -0
- {deltacat-0.2.10/deltacat/io/aws/redshift → deltacat-1.0.0/deltacat/io}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/storage/model → deltacat-1.0.0/deltacat/io/aws}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/tests → deltacat-1.0.0/deltacat/io/aws/redshift}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/dataset.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/file_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/memcached_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/ray_plasma_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/read_api.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/redis_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/io/s3_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/logs.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/tests/aws → deltacat-1.0.0/deltacat/storage/model}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/delta.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/list_result.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/locator.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/namespace.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/partition.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/sort_key.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/stream.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/table.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/storage/model/table_version.py +0 -0
- {deltacat-0.2.10/deltacat/tests/compute → deltacat-1.0.0/deltacat/tests}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/tests/compute/compactor → deltacat-1.0.0/deltacat/tests/aws}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/aws/test_clients.py +0 -0
- {deltacat-0.2.10/deltacat/tests/compute/compactor/steps → deltacat-1.0.0/deltacat/tests/catalog}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/tests/compute/compactor/utils → deltacat-1.0.0/deltacat/tests/compute}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/tests/compute/compactor_v2 → deltacat-1.0.0/deltacat/tests/compute/compactor}/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/tests/compute/compactor_v2/utils → deltacat-1.0.0/deltacat/tests/compute/compactor/steps}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/compactor/steps/test_repartition.py +0 -0
- {deltacat-0.2.10/deltacat/tests/io → deltacat-1.0.0/deltacat/tests/compute/compactor/utils}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/compactor/utils/test_io.py +0 -0
- {deltacat-0.2.10/deltacat/tests/stats → deltacat-1.0.0/deltacat/tests/compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/compactor_v2/test_hashlib.py +0 -0
- {deltacat-0.2.10/deltacat/tests/test_utils → deltacat-1.0.0/deltacat/tests/compute/compactor_v2/utils}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/test_compact_partition_params.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/test_util_common.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/test_util_constant.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -0
- {deltacat-0.2.10/deltacat/tests/utils → deltacat-1.0.0/deltacat/tests/io}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/io/test_cloudpickle_bug_fix.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/io/test_file_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/io/test_memcached_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/io/test_redis_object_store.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/io/test_s3_object_store.py +0 -0
- {deltacat-0.2.10/deltacat/tests/utils/data → deltacat-1.0.0/deltacat/tests/stats}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/stats/test_intervals.py +0 -0
- {deltacat-0.2.10/deltacat/types → deltacat-1.0.0/deltacat/tests/test_utils}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/test_utils/constants.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/test_utils/storage.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/test_utils/utils.py +0 -0
- {deltacat-0.2.10/deltacat → deltacat-1.0.0/deltacat/tests}/utils/__init__.py +0 -0
- {deltacat-0.2.10/deltacat/utils/ray_utils → deltacat-1.0.0/deltacat/tests/utils/data}/__init__.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/utils/test_cloudpickle.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/utils/test_pyarrow.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/tests/utils/test_resources.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/types/partial_download.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/arguments.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/cloudpickle.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/common.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/metrics.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/numpy.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/pandas.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/performance.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/placement.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/pyarrow.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/ray_utils/collections.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/ray_utils/concurrency.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/ray_utils/dataset.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/ray_utils/performance.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/ray_utils/runtime.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/resources.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/s3fs.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat/utils/schema.py +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat.egg-info/dependency_links.txt +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/deltacat.egg-info/top_level.txt +0 -0
- {deltacat-0.2.10 → deltacat-1.0.0}/setup.cfg +0 -0
@@ -22,7 +22,7 @@ from tenacity import (
|
|
22
22
|
stop_after_delay,
|
23
23
|
wait_random_exponential,
|
24
24
|
)
|
25
|
-
|
25
|
+
from deltacat.utils.ray_utils.concurrency import invoke_parallel
|
26
26
|
import deltacat.aws.clients as aws_utils
|
27
27
|
from deltacat import logs
|
28
28
|
from deltacat.aws.constants import TIMEOUT_ERROR_CODES
|
@@ -35,10 +35,17 @@ from deltacat.storage import (
|
|
35
35
|
ManifestEntry,
|
36
36
|
ManifestEntryList,
|
37
37
|
)
|
38
|
-
from deltacat.types.media import ContentEncoding, ContentType, TableType
|
38
|
+
from deltacat.types.media import (
|
39
|
+
ContentEncoding,
|
40
|
+
ContentType,
|
41
|
+
TableType,
|
42
|
+
DistributedDatasetType,
|
43
|
+
)
|
39
44
|
from deltacat.types.tables import (
|
40
45
|
TABLE_CLASS_TO_SIZE_FUNC,
|
41
46
|
TABLE_TYPE_TO_READER_FUNC,
|
47
|
+
TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS,
|
48
|
+
DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC,
|
42
49
|
get_table_length,
|
43
50
|
)
|
44
51
|
from deltacat.types.partial_download import PartialFileDownloadParams
|
@@ -284,59 +291,6 @@ def upload_sliced_table(
|
|
284
291
|
return manifest_entries
|
285
292
|
|
286
293
|
|
287
|
-
@ray.remote
|
288
|
-
def _block_metadata(block: Block) -> BlockMetadata:
|
289
|
-
return BlockAccessor.for_block(block).get_metadata(
|
290
|
-
input_files=None,
|
291
|
-
exec_stats=None,
|
292
|
-
)
|
293
|
-
|
294
|
-
|
295
|
-
def _get_metadata(
|
296
|
-
table: Union[LocalTable, DistributedDataset],
|
297
|
-
write_paths: List[str],
|
298
|
-
block_refs: List[ObjectRef[Block]],
|
299
|
-
) -> List[BlockMetadata]:
|
300
|
-
metadata: List[BlockMetadata] = []
|
301
|
-
if not block_refs:
|
302
|
-
# this must be a local table - ensure it was written to only 1 file
|
303
|
-
assert len(write_paths) == 1, (
|
304
|
-
f"Expected table of type '{type(table)}' to be written to 1 "
|
305
|
-
f"file, but found {len(write_paths)} files."
|
306
|
-
)
|
307
|
-
table_size = None
|
308
|
-
table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
|
309
|
-
if table_size_func:
|
310
|
-
table_size = table_size_func(table)
|
311
|
-
else:
|
312
|
-
logger.warning(f"Unable to estimate '{type(table)}' table size.")
|
313
|
-
metadata.append(
|
314
|
-
BlockMetadata(
|
315
|
-
num_rows=get_table_length(table),
|
316
|
-
size_bytes=table_size,
|
317
|
-
schema=None,
|
318
|
-
input_files=None,
|
319
|
-
exec_stats=None,
|
320
|
-
)
|
321
|
-
)
|
322
|
-
else:
|
323
|
-
# TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
|
324
|
-
# ray 1.10
|
325
|
-
# metadata = dataset._blocks.get_metadata()
|
326
|
-
# ray 2.0.0dev
|
327
|
-
metadata = table._plan.execute().get_metadata()
|
328
|
-
if (
|
329
|
-
not metadata
|
330
|
-
or metadata[0].size_bytes is None
|
331
|
-
or metadata[0].num_rows is None
|
332
|
-
):
|
333
|
-
metadata_futures = [
|
334
|
-
_block_metadata.remote(block_ref) for block_ref in block_refs
|
335
|
-
]
|
336
|
-
metadata = ray.get(metadata_futures)
|
337
|
-
return metadata
|
338
|
-
|
339
|
-
|
340
294
|
def upload_table(
|
341
295
|
table: Union[LocalTable, DistributedDataset],
|
342
296
|
s3_base_url: str,
|
@@ -403,17 +357,7 @@ def download_manifest_entry(
|
|
403
357
|
content_encoding: Optional[ContentEncoding] = None,
|
404
358
|
) -> LocalTable:
|
405
359
|
|
406
|
-
|
407
|
-
s3_client_kwargs = (
|
408
|
-
{
|
409
|
-
"aws_access_key_id": token_holder["accessKeyId"],
|
410
|
-
"aws_secret_access_key": token_holder["secretAccessKey"],
|
411
|
-
"aws_session_token": token_holder["sessionToken"],
|
412
|
-
"config": conf,
|
413
|
-
}
|
414
|
-
if token_holder
|
415
|
-
else {"config": conf}
|
416
|
-
)
|
360
|
+
s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
|
417
361
|
if not content_type:
|
418
362
|
content_type = manifest_entry.meta.content_type
|
419
363
|
assert (
|
@@ -458,51 +402,9 @@ def download_manifest_entry(
|
|
458
402
|
return table
|
459
403
|
|
460
404
|
|
461
|
-
def _download_manifest_entries(
|
462
|
-
manifest: Manifest,
|
463
|
-
token_holder: Optional[Dict[str, Any]] = None,
|
464
|
-
table_type: TableType = TableType.PYARROW,
|
465
|
-
column_names: Optional[List[str]] = None,
|
466
|
-
include_columns: Optional[List[str]] = None,
|
467
|
-
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
468
|
-
) -> LocalDataset:
|
469
|
-
|
470
|
-
return [
|
471
|
-
download_manifest_entry(
|
472
|
-
e,
|
473
|
-
token_holder,
|
474
|
-
table_type,
|
475
|
-
column_names,
|
476
|
-
include_columns,
|
477
|
-
file_reader_kwargs_provider,
|
478
|
-
)
|
479
|
-
for e in manifest.entries
|
480
|
-
]
|
481
|
-
|
482
|
-
|
483
|
-
def _download_manifest_entries_parallel(
|
484
|
-
manifest: Manifest,
|
485
|
-
token_holder: Optional[Dict[str, Any]] = None,
|
486
|
-
table_type: TableType = TableType.PYARROW,
|
487
|
-
max_parallelism: Optional[int] = None,
|
488
|
-
column_names: Optional[List[str]] = None,
|
489
|
-
include_columns: Optional[List[str]] = None,
|
490
|
-
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
491
|
-
) -> LocalDataset:
|
492
|
-
|
493
|
-
tables = []
|
494
|
-
pool = multiprocessing.Pool(max_parallelism)
|
495
|
-
downloader = partial(
|
496
|
-
download_manifest_entry,
|
497
|
-
token_holder=token_holder,
|
498
|
-
table_type=table_type,
|
499
|
-
column_names=column_names,
|
500
|
-
include_columns=include_columns,
|
501
|
-
file_reader_kwargs_provider=file_reader_kwargs_provider,
|
502
|
-
)
|
503
|
-
for table in pool.map(downloader, [e for e in manifest.entries]):
|
504
|
-
tables.append(table)
|
505
|
-
return tables
|
405
|
+
@ray.remote
|
406
|
+
def download_manifest_entry_ray(*args, **kwargs) -> ObjectRef[LocalTable]:
|
407
|
+
return download_manifest_entry(*args, **kwargs)
|
506
408
|
|
507
409
|
|
508
410
|
def download_manifest_entries(
|
@@ -536,6 +438,42 @@ def download_manifest_entries(
|
|
536
438
|
)
|
537
439
|
|
538
440
|
|
441
|
+
def download_manifest_entries_distributed(
|
442
|
+
manifest: Manifest,
|
443
|
+
token_holder: Optional[Dict[str, Any]] = None,
|
444
|
+
table_type: TableType = TableType.PYARROW,
|
445
|
+
max_parallelism: Optional[int] = 1000,
|
446
|
+
column_names: Optional[List[str]] = None,
|
447
|
+
include_columns: Optional[List[str]] = None,
|
448
|
+
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
449
|
+
ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
|
450
|
+
distributed_dataset_type: Optional[
|
451
|
+
DistributedDatasetType
|
452
|
+
] = DistributedDatasetType.RAY_DATASET,
|
453
|
+
) -> DistributedDataset:
|
454
|
+
|
455
|
+
params = {
|
456
|
+
"manifest": manifest,
|
457
|
+
"token_holder": token_holder,
|
458
|
+
"table_type": table_type,
|
459
|
+
"max_parallelism": max_parallelism,
|
460
|
+
"column_names": column_names,
|
461
|
+
"include_columns": include_columns,
|
462
|
+
"file_reader_kwargs_provider": file_reader_kwargs_provider,
|
463
|
+
"ray_options_provider": ray_options_provider,
|
464
|
+
"distributed_dataset_type": distributed_dataset_type,
|
465
|
+
}
|
466
|
+
|
467
|
+
if distributed_dataset_type == DistributedDatasetType.RAY_DATASET:
|
468
|
+
return _download_manifest_entries_ray_data_distributed(**params)
|
469
|
+
elif distributed_dataset_type is not None:
|
470
|
+
return _download_manifest_entries_all_dataset_distributed(**params)
|
471
|
+
else:
|
472
|
+
raise ValueError(
|
473
|
+
f"Distributed dataset type {distributed_dataset_type} not supported."
|
474
|
+
)
|
475
|
+
|
476
|
+
|
539
477
|
def upload(s3_url: str, body, **s3_client_kwargs) -> Dict[str, Any]:
|
540
478
|
|
541
479
|
# TODO (pdames): add tenacity retrying
|
@@ -574,3 +512,204 @@ def download(
|
|
574
512
|
else:
|
575
513
|
logger.info(f"file not found: {s3_url}")
|
576
514
|
return None
|
515
|
+
|
516
|
+
|
517
|
+
def _download_manifest_entries_parallel(
|
518
|
+
manifest: Manifest,
|
519
|
+
token_holder: Optional[Dict[str, Any]] = None,
|
520
|
+
table_type: TableType = TableType.PYARROW,
|
521
|
+
max_parallelism: Optional[int] = None,
|
522
|
+
column_names: Optional[List[str]] = None,
|
523
|
+
include_columns: Optional[List[str]] = None,
|
524
|
+
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
525
|
+
) -> LocalDataset:
|
526
|
+
|
527
|
+
tables = []
|
528
|
+
pool = multiprocessing.Pool(max_parallelism)
|
529
|
+
downloader = partial(
|
530
|
+
download_manifest_entry,
|
531
|
+
token_holder=token_holder,
|
532
|
+
table_type=table_type,
|
533
|
+
column_names=column_names,
|
534
|
+
include_columns=include_columns,
|
535
|
+
file_reader_kwargs_provider=file_reader_kwargs_provider,
|
536
|
+
)
|
537
|
+
for table in pool.map(downloader, [e for e in manifest.entries]):
|
538
|
+
tables.append(table)
|
539
|
+
return tables
|
540
|
+
|
541
|
+
|
542
|
+
def _download_manifest_entries(
|
543
|
+
manifest: Manifest,
|
544
|
+
token_holder: Optional[Dict[str, Any]] = None,
|
545
|
+
table_type: TableType = TableType.PYARROW,
|
546
|
+
column_names: Optional[List[str]] = None,
|
547
|
+
include_columns: Optional[List[str]] = None,
|
548
|
+
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
549
|
+
) -> LocalDataset:
|
550
|
+
|
551
|
+
return [
|
552
|
+
download_manifest_entry(
|
553
|
+
manifest_entry=e,
|
554
|
+
token_holder=token_holder,
|
555
|
+
table_type=table_type,
|
556
|
+
column_names=column_names,
|
557
|
+
include_columns=include_columns,
|
558
|
+
file_reader_kwargs_provider=file_reader_kwargs_provider,
|
559
|
+
)
|
560
|
+
for e in manifest.entries
|
561
|
+
]
|
562
|
+
|
563
|
+
|
564
|
+
@ray.remote
|
565
|
+
def _block_metadata(block: Block) -> BlockMetadata:
|
566
|
+
return BlockAccessor.for_block(block).get_metadata(
|
567
|
+
input_files=None,
|
568
|
+
exec_stats=None,
|
569
|
+
)
|
570
|
+
|
571
|
+
|
572
|
+
def _get_s3_client_kwargs_from_token(token_holder) -> Dict[Any, Any]:
|
573
|
+
conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
|
574
|
+
return (
|
575
|
+
{
|
576
|
+
"aws_access_key_id": token_holder["accessKeyId"],
|
577
|
+
"aws_secret_access_key": token_holder["secretAccessKey"],
|
578
|
+
"aws_session_token": token_holder["sessionToken"],
|
579
|
+
"config": conf,
|
580
|
+
}
|
581
|
+
if token_holder
|
582
|
+
else {"config": conf}
|
583
|
+
)
|
584
|
+
|
585
|
+
|
586
|
+
def _get_metadata(
|
587
|
+
table: Union[LocalTable, DistributedDataset],
|
588
|
+
write_paths: List[str],
|
589
|
+
block_refs: List[ObjectRef[Block]],
|
590
|
+
) -> List[BlockMetadata]:
|
591
|
+
metadata: List[BlockMetadata] = []
|
592
|
+
if not block_refs:
|
593
|
+
# this must be a local table - ensure it was written to only 1 file
|
594
|
+
assert len(write_paths) == 1, (
|
595
|
+
f"Expected table of type '{type(table)}' to be written to 1 "
|
596
|
+
f"file, but found {len(write_paths)} files."
|
597
|
+
)
|
598
|
+
table_size = None
|
599
|
+
table_size_func = TABLE_CLASS_TO_SIZE_FUNC.get(type(table))
|
600
|
+
if table_size_func:
|
601
|
+
table_size = table_size_func(table)
|
602
|
+
else:
|
603
|
+
logger.warning(f"Unable to estimate '{type(table)}' table size.")
|
604
|
+
metadata.append(
|
605
|
+
BlockMetadata(
|
606
|
+
num_rows=get_table_length(table),
|
607
|
+
size_bytes=table_size,
|
608
|
+
schema=None,
|
609
|
+
input_files=None,
|
610
|
+
exec_stats=None,
|
611
|
+
)
|
612
|
+
)
|
613
|
+
else:
|
614
|
+
# TODO(pdames): Expose BlockList metadata getter from Ray Dataset?
|
615
|
+
# ray 1.10
|
616
|
+
# metadata = dataset._blocks.get_metadata()
|
617
|
+
# ray 2.0.0dev
|
618
|
+
metadata = table._plan.execute().get_metadata()
|
619
|
+
if (
|
620
|
+
not metadata
|
621
|
+
or metadata[0].size_bytes is None
|
622
|
+
or metadata[0].num_rows is None
|
623
|
+
):
|
624
|
+
metadata_futures = [
|
625
|
+
_block_metadata.remote(block_ref) for block_ref in block_refs
|
626
|
+
]
|
627
|
+
metadata = ray.get(metadata_futures)
|
628
|
+
return metadata
|
629
|
+
|
630
|
+
|
631
|
+
def _download_manifest_entries_ray_data_distributed(
|
632
|
+
manifest: Manifest,
|
633
|
+
token_holder: Optional[Dict[str, Any]] = None,
|
634
|
+
table_type: TableType = TableType.PYARROW,
|
635
|
+
max_parallelism: Optional[int] = 1000,
|
636
|
+
column_names: Optional[List[str]] = None,
|
637
|
+
include_columns: Optional[List[str]] = None,
|
638
|
+
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
639
|
+
ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
|
640
|
+
) -> DistributedDataset:
|
641
|
+
|
642
|
+
table_pending_ids = []
|
643
|
+
manifest_entries = manifest.entries
|
644
|
+
if manifest_entries:
|
645
|
+
table_pending_ids = invoke_parallel(
|
646
|
+
manifest_entries,
|
647
|
+
download_manifest_entry_ray,
|
648
|
+
token_holder,
|
649
|
+
table_type,
|
650
|
+
column_names,
|
651
|
+
include_columns,
|
652
|
+
file_reader_kwargs_provider,
|
653
|
+
max_parallelism=max_parallelism,
|
654
|
+
options_provider=ray_options_provider,
|
655
|
+
)
|
656
|
+
return TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS[table_type](table_pending_ids)
|
657
|
+
|
658
|
+
|
659
|
+
def _download_manifest_entries_all_dataset_distributed(
|
660
|
+
manifest: Manifest,
|
661
|
+
token_holder: Optional[Dict[str, Any]] = None,
|
662
|
+
table_type: TableType = TableType.PYARROW,
|
663
|
+
max_parallelism: Optional[int] = 1000,
|
664
|
+
column_names: Optional[List[str]] = None,
|
665
|
+
include_columns: Optional[List[str]] = None,
|
666
|
+
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
667
|
+
ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
|
668
|
+
distributed_dataset_type: Optional[
|
669
|
+
DistributedDatasetType
|
670
|
+
] = DistributedDatasetType.RAY_DATASET,
|
671
|
+
) -> DistributedDataset:
|
672
|
+
|
673
|
+
entry_content_type = None
|
674
|
+
entry_content_encoding = None
|
675
|
+
uris = []
|
676
|
+
for entry in manifest.entries or []:
|
677
|
+
if (
|
678
|
+
entry_content_type is not None
|
679
|
+
and entry_content_type != entry.meta.content_type
|
680
|
+
):
|
681
|
+
raise ValueError(
|
682
|
+
f"Mixed content types of ({entry_content_type},"
|
683
|
+
f" {entry.meta.content_type}) is not supported."
|
684
|
+
)
|
685
|
+
|
686
|
+
if (
|
687
|
+
entry_content_encoding is not None
|
688
|
+
and entry_content_encoding != entry.meta.content_encoding
|
689
|
+
):
|
690
|
+
raise ValueError(
|
691
|
+
f"Mixed content encoding of {entry_content_encoding},"
|
692
|
+
f" {entry.meta.content_encoding} is not supported."
|
693
|
+
)
|
694
|
+
|
695
|
+
entry_content_type = entry.meta.content_type
|
696
|
+
entry_content_encoding = entry.meta.content_encoding
|
697
|
+
uris.append(entry.uri)
|
698
|
+
|
699
|
+
s3_client_kwargs = _get_s3_client_kwargs_from_token(token_holder=token_holder)
|
700
|
+
|
701
|
+
if distributed_dataset_type in DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC:
|
702
|
+
return DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[distributed_dataset_type.value](
|
703
|
+
uris=uris,
|
704
|
+
content_type=entry_content_type,
|
705
|
+
content_encoding=entry_content_encoding,
|
706
|
+
column_names=column_names,
|
707
|
+
include_columns=include_columns,
|
708
|
+
read_func_kwargs_provider=file_reader_kwargs_provider,
|
709
|
+
ray_options_provider=ray_options_provider,
|
710
|
+
s3_client_kwargs=s3_client_kwargs,
|
711
|
+
)
|
712
|
+
else:
|
713
|
+
raise ValueError(
|
714
|
+
f"Unsupported distributed dataset type={distributed_dataset_type}"
|
715
|
+
)
|