deltacat 0.1.18b13__tar.gz → 0.1.18b15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/PKG-INFO +14 -2
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/README.md +13 -1
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/__init__.py +3 -2
- deltacat-0.1.18b15/deltacat/aws/clients.py +189 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/redshift/model/manifest.py +4 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/s3u.py +24 -1
- deltacat-0.1.18b15/deltacat/benchmarking/benchmark_parquet_reads.py +53 -0
- deltacat-0.1.18b15/deltacat/benchmarking/conftest.py +61 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/catalog/delegate.py +1 -1
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/catalog/interface.py +1 -1
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/__init__.py +0 -3
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/compaction_session.py +45 -20
- deltacat-0.1.18b15/deltacat/compute/compactor/model/compact_partition_params.py +382 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/compaction_session_audit_info.py +150 -9
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/delta_annotated.py +91 -9
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/delta_file_envelope.py +15 -3
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/primary_key_index.py +1 -1
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/round_completion_info.py +17 -1
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/repartition_session.py +5 -3
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/steps/dedupe.py +10 -8
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/steps/hash_bucket.py +25 -4
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/steps/materialize.py +11 -6
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/steps/repartition.py +16 -1
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/utils/io.py +40 -23
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/utils/primary_key_index.py +1 -15
- deltacat-0.1.18b15/deltacat/compute/compactor/utils/sort_key.py +57 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/utils/system_columns.py +43 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/compaction_session.py +506 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/constants.py +34 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/model/hash_bucket_input.py +78 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/model/hash_bucket_result.py +12 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/model/merge_input.py +127 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/model/merge_result.py +12 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/steps/hash_bucket.py +203 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/steps/merge.py +41 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils/content_type_params.py +37 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils/io.py +149 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils/primary_key_index.py +308 -0
- deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils/task_options.py +228 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/meta_stats.py +4 -2
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/stats.py +1 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/utils/io.py +4 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/utils/io.py +20 -5
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/exceptions.py +4 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/memcached_object_store.py +37 -14
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/logs.py +4 -3
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/__init__.py +3 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/interface.py +11 -2
- deltacat-0.1.18b15/deltacat/storage/model/sort_key.py +33 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/table_version.py +11 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/types.py +2 -1
- deltacat-0.1.18b15/deltacat/tests/aws/test_clients.py +80 -0
- deltacat-0.1.18b15/deltacat/tests/compute/common.py +96 -0
- {deltacat-0.1.18b13/deltacat/tests → deltacat-0.1.18b15/deltacat/tests/compute/compactor/steps}/test_repartition.py +22 -8
- deltacat-0.1.18b15/deltacat/tests/compute/compactor/utils/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/tests → deltacat-0.1.18b15/deltacat/tests/compute}/compactor/utils/test_io.py +47 -5
- deltacat-0.1.18b15/deltacat/tests/compute/compactor_v2/__init__.py +0 -0
- deltacat-0.1.18b15/deltacat/tests/compute/compactor_v2/steps/__init__.py +0 -0
- deltacat-0.1.18b15/deltacat/tests/compute/compactor_v2/steps/test_hash_bucket.py +199 -0
- {deltacat-0.1.18b13/deltacat/tests/compactor → deltacat-0.1.18b15/deltacat/tests/compute}/test_compact_partition_params.py +14 -30
- deltacat-0.1.18b15/deltacat/tests/compute/test_compaction_session_incremental.py +348 -0
- deltacat-0.1.18b15/deltacat/tests/compute/testcases.py +390 -0
- deltacat-0.1.18b15/deltacat/tests/io/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_memcached_object_store.py +5 -4
- deltacat-0.1.18b15/deltacat/tests/local_deltacat_storage/__init__.py +1109 -0
- deltacat-0.1.18b15/deltacat/tests/stats/__init__.py +0 -0
- deltacat-0.1.18b15/deltacat/tests/test_utils/__init__.py +0 -0
- deltacat-0.1.18b15/deltacat/tests/test_utils/pyarrow.py +32 -0
- deltacat-0.1.18b15/deltacat/tests/test_utils/utils.py +13 -0
- deltacat-0.1.18b15/deltacat/tests/utils/__init__.py +0 -0
- deltacat-0.1.18b15/deltacat/tests/utils/data/__init__.py +0 -0
- deltacat-0.1.18b15/deltacat/tests/utils/test_daft.py +76 -0
- deltacat-0.1.18b15/deltacat/tests/utils/test_pyarrow.py +133 -0
- deltacat-0.1.18b15/deltacat/tests/utils/test_resources.py +48 -0
- deltacat-0.1.18b15/deltacat/types/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/types/media.py +1 -0
- deltacat-0.1.18b15/deltacat/types/partial_download.py +82 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/types/tables.py +1 -0
- deltacat-0.1.18b15/deltacat/utils/__init__.py +0 -0
- deltacat-0.1.18b15/deltacat/utils/arguments.py +26 -0
- deltacat-0.1.18b15/deltacat/utils/daft.py +87 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/performance.py +4 -2
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/placement.py +20 -3
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/pyarrow.py +213 -1
- deltacat-0.1.18b15/deltacat/utils/ray_utils/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/concurrency.py +26 -1
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/resources.py +72 -1
- deltacat-0.1.18b15/deltacat/utils/s3fs.py +21 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/PKG-INFO +14 -2
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/SOURCES.txt +46 -6
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/requires.txt +3 -1
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/setup.py +3 -1
- deltacat-0.1.18b13/deltacat/aws/clients.py +0 -69
- deltacat-0.1.18b13/deltacat/compute/compactor/model/compact_partition_params.py +0 -153
- deltacat-0.1.18b13/deltacat/compute/compactor/model/sort_key.py +0 -98
- deltacat-0.1.18b13/deltacat/tests/utils/test_resources.py +0 -45
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/MANIFEST.in +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/constants.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/redshift/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/aws/redshift/model/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/catalog → deltacat-0.1.18b15/deltacat/benchmarking}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/catalog/model → deltacat-0.1.18b15/deltacat/catalog}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute → deltacat-0.1.18b15/deltacat/catalog/model}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/catalog/model/catalog.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/catalog/model/table_definition.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/compactor/model → deltacat-0.1.18b15/deltacat/compute}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/compactor/steps → deltacat-0.1.18b15/deltacat/compute/compactor/model}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/dedupe_result.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/delta_file_locator.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/hash_bucket_result.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/materialize_result.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/pyarrow_write_result.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/model/repartition_result.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/compactor/utils → deltacat-0.1.18b15/deltacat/compute/compactor/steps}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/metastats → deltacat-0.1.18b15/deltacat/compute/compactor/utils}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/compactor/utils/round_completion_file.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/metastats/config → deltacat-0.1.18b15/deltacat/compute/compactor_v2}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/metastats → deltacat-0.1.18b15/deltacat/compute/compactor_v2}/model/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/metastats/utils → deltacat-0.1.18b15/deltacat/compute/compactor_v2/steps}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/stats → deltacat-0.1.18b15/deltacat/compute/compactor_v2/utils}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/stats/models → deltacat-0.1.18b15/deltacat/compute/metastats}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/compute/stats/utils → deltacat-0.1.18b15/deltacat/compute/metastats/config}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/io → deltacat-0.1.18b15/deltacat/compute/metastats/model}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/model/partition_stats_dict.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/model/stats_cluster_size_estimator.py +0 -0
- {deltacat-0.1.18b13/deltacat/io/aws → deltacat-0.1.18b15/deltacat/compute/metastats/utils}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/utils/constants.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/metastats/utils/ray_utils.py +0 -0
- {deltacat-0.1.18b13/deltacat/io/aws/redshift → deltacat-0.1.18b15/deltacat/compute/stats}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/basic.py +0 -0
- {deltacat-0.1.18b13/deltacat/storage/model → deltacat-0.1.18b15/deltacat/compute/stats/models}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/delta_column_stats.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/delta_stats.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/delta_stats_cache_result.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/manifest_entry_stats.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/models/stats_result.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/types.py +0 -0
- {deltacat-0.1.18b13/deltacat/tests → deltacat-0.1.18b15/deltacat/compute/stats/utils}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/utils/intervals.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/compute/stats/utils/manifest_stats_file.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/constants.py +0 -0
- {deltacat-0.1.18b13/deltacat/tests/compactor → deltacat-0.1.18b15/deltacat/io}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/tests/compactor/utils → deltacat-0.1.18b15/deltacat/io/aws}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/tests/io → deltacat-0.1.18b15/deltacat/io/aws/redshift}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/aws/redshift/redshift_datasource.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/dataset.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/file_object_store.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/object_store.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/ray_plasma_object_store.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/read_api.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/redis_object_store.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/io/s3_object_store.py +0 -0
- {deltacat-0.1.18b13/deltacat/tests/stats → deltacat-0.1.18b15/deltacat/storage/model}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/delta.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/list_result.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/locator.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/namespace.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/partition.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/stream.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/storage/model/table.py +0 -0
- {deltacat-0.1.18b13/deltacat/tests/test_utils → deltacat-0.1.18b15/deltacat/tests}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/tests/utils → deltacat-0.1.18b15/deltacat/tests/aws}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/types → deltacat-0.1.18b15/deltacat/tests/compute}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/utils → deltacat-0.1.18b15/deltacat/tests/compute/compactor}/__init__.py +0 -0
- {deltacat-0.1.18b13/deltacat/utils/ray_utils → deltacat-0.1.18b15/deltacat/tests/compute/compactor/steps}/__init__.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_file_object_store.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_ray_plasma_object_store.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_redis_object_store.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/io/test_s3_object_store.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/stats/test_intervals.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/test_utils/constants.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/tests/utils/test_record_batch_tables.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/common.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/metrics.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/numpy.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/pandas.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/collections.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/dataset.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/performance.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat/utils/ray_utils/runtime.py +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/dependency_links.txt +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/deltacat.egg-info/top_level.txt +0 -0
- {deltacat-0.1.18b13 → deltacat-0.1.18b15}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: deltacat
|
3
|
-
Version: 0.1.18b13
|
3
|
+
Version: 0.1.18b15
|
4
4
|
Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
|
5
5
|
Home-page: https://github.com/ray-project/deltacat
|
6
6
|
Author: Ray Team
|
@@ -18,12 +18,24 @@ Description: # DeltaCAT
|
|
18
18
|
change-data-capture, data consistency checks, and table repair.
|
19
19
|
|
20
20
|
## Getting Started
|
21
|
-
|
21
|
+
|
22
22
|
### Install
|
23
|
+
|
23
24
|
```
|
24
25
|
pip install deltacat
|
25
26
|
```
|
26
27
|
|
28
|
+
### Running Tests
|
29
|
+
|
30
|
+
```
|
31
|
+
pip3 install virtualenv
|
32
|
+
virtualenv test_env
|
33
|
+
source test_env/bin/activate
|
34
|
+
pip3 install -r requirements.txt
|
35
|
+
|
36
|
+
pytest
|
37
|
+
```
|
38
|
+
|
27
39
|
Platform: UNKNOWN
|
28
40
|
Classifier: Development Status :: 4 - Beta
|
29
41
|
Classifier: Intended Audience :: Developers
|
@@ -11,8 +11,20 @@ for common table management tasks, including petabyte-scale
|
|
11
11
|
change-data-capture, data consistency checks, and table repair.
|
12
12
|
|
13
13
|
## Getting Started
|
14
|
-
|
14
|
+
|
15
15
|
### Install
|
16
|
+
|
16
17
|
```
|
17
18
|
pip install deltacat
|
18
19
|
```
|
20
|
+
|
21
|
+
### Running Tests
|
22
|
+
|
23
|
+
```
|
24
|
+
pip3 install virtualenv
|
25
|
+
virtualenv test_env
|
26
|
+
source test_env/bin/activate
|
27
|
+
pip3 install -r requirements.txt
|
28
|
+
|
29
|
+
pytest
|
30
|
+
```
|
@@ -28,7 +28,6 @@ from deltacat.catalog.model.catalog import ( # noqa: F401
|
|
28
28
|
init,
|
29
29
|
)
|
30
30
|
from deltacat.catalog.model.table_definition import TableDefinition
|
31
|
-
from deltacat.compute.compactor import SortKey, SortOrder
|
32
31
|
from deltacat.storage import (
|
33
32
|
DistributedDataset,
|
34
33
|
LifecycleState,
|
@@ -37,13 +36,15 @@ from deltacat.storage import (
|
|
37
36
|
LocalTable,
|
38
37
|
Namespace,
|
39
38
|
SchemaConsistencyType,
|
39
|
+
SortKey,
|
40
|
+
SortOrder,
|
40
41
|
)
|
41
42
|
from deltacat.types.media import ContentEncoding, ContentType, TableType
|
42
43
|
from deltacat.types.tables import TableWriteMode
|
43
44
|
|
44
45
|
deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
|
45
46
|
|
46
|
-
__version__ = "0.1.18b13"
|
47
|
+
__version__ = "0.1.18b15"
|
47
48
|
|
48
49
|
|
49
50
|
__all__ = [
|
@@ -0,0 +1,189 @@
|
|
1
|
+
import logging
|
2
|
+
from functools import lru_cache
|
3
|
+
from typing import Optional, FrozenSet
|
4
|
+
from http import HTTPStatus
|
5
|
+
|
6
|
+
import boto3
|
7
|
+
from boto3.exceptions import ResourceNotExistsError
|
8
|
+
from boto3.resources.base import ServiceResource
|
9
|
+
from botocore.client import BaseClient
|
10
|
+
from botocore.config import Config
|
11
|
+
from requests.adapters import Response
|
12
|
+
from tenacity import (
|
13
|
+
RetryError,
|
14
|
+
Retrying,
|
15
|
+
wait_fixed,
|
16
|
+
retry_if_exception,
|
17
|
+
stop_after_delay,
|
18
|
+
)
|
19
|
+
|
20
|
+
from deltacat import logs
|
21
|
+
from deltacat.aws.constants import BOTO_MAX_RETRIES
|
22
|
+
import requests
|
23
|
+
|
24
|
+
|
25
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
26
|
+
|
27
|
+
BOTO3_PROFILE_NAME_KWARG_KEY = "boto3_profile_name"
|
28
|
+
INSTANCE_METADATA_SERVICE_IPV4_URI = "http://169.254.169.254/latest/meta-data/" # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
29
|
+
RETRYABLE_HTTP_STATUS_CODES = [
|
30
|
+
# 429
|
31
|
+
HTTPStatus.TOO_MANY_REQUESTS,
|
32
|
+
# 5xx
|
33
|
+
HTTPStatus.INTERNAL_SERVER_ERROR,
|
34
|
+
HTTPStatus.NOT_IMPLEMENTED,
|
35
|
+
HTTPStatus.BAD_GATEWAY,
|
36
|
+
HTTPStatus.SERVICE_UNAVAILABLE,
|
37
|
+
HTTPStatus.GATEWAY_TIMEOUT,
|
38
|
+
]
|
39
|
+
|
40
|
+
|
41
|
+
class RetryIfRetryableHTTPStatusCode(retry_if_exception):
|
42
|
+
"""
|
43
|
+
Retry strategy that retries if the exception is an ``HTTPError`` with
|
44
|
+
a status code in the retryable errors list.
|
45
|
+
"""
|
46
|
+
|
47
|
+
def __init__(self):
|
48
|
+
def is_retryable_error(exception):
|
49
|
+
return (
|
50
|
+
isinstance(exception, requests.exceptions.HTTPError)
|
51
|
+
and exception.response.status_code in RETRYABLE_HTTP_STATUS_CODES
|
52
|
+
)
|
53
|
+
|
54
|
+
super().__init__(predicate=is_retryable_error)
|
55
|
+
|
56
|
+
|
57
|
+
def _log_attempt_number(retry_state):
|
58
|
+
"""return the result of the last call attempt"""
|
59
|
+
logger.warning(f"Retrying: {retry_state.attempt_number}...")
|
60
|
+
|
61
|
+
|
62
|
+
def _get_url(url: str, get_url_kwargs=None):
|
63
|
+
if get_url_kwargs is None:
|
64
|
+
get_url_kwargs = {}
|
65
|
+
resp = requests.get(url, **get_url_kwargs)
|
66
|
+
resp.raise_for_status()
|
67
|
+
return resp
|
68
|
+
|
69
|
+
|
70
|
+
def retrying_get(
|
71
|
+
url: str,
|
72
|
+
retry_strategy,
|
73
|
+
wait_strategy,
|
74
|
+
stop_strategy,
|
75
|
+
short_circuit_on_status: FrozenSet[int] = {HTTPStatus.OK},
|
76
|
+
) -> Optional[Response]:
|
77
|
+
"""Retries a request to the given URL until it succeeds.
|
78
|
+
|
79
|
+
Args:
|
80
|
+
retry_strategy (Callable): A function that returns a retry strategy.
|
81
|
+
wait_strategy (Callable): A function that returns a wait strategy.
|
82
|
+
stop_strategy (Callable): A function that returns a stop strategy.
|
83
|
+
url (str): The URL to retry.
|
84
|
+
|
85
|
+
Returns:
|
86
|
+
Optional[Response]: The response from the URL, or None if the request
|
87
|
+
failed after the maximum number of retries.
|
88
|
+
"""
|
89
|
+
try:
|
90
|
+
resp = _get_url(url)
|
91
|
+
if resp.status_code in short_circuit_on_status:
|
92
|
+
return resp
|
93
|
+
for attempt in Retrying(
|
94
|
+
retry=retry_strategy(),
|
95
|
+
wait=wait_strategy,
|
96
|
+
stop=stop_strategy,
|
97
|
+
after=_log_attempt_number,
|
98
|
+
):
|
99
|
+
with attempt:
|
100
|
+
resp = _get_url(url)
|
101
|
+
return resp
|
102
|
+
except RetryError as re:
|
103
|
+
logger.error(f"Failed to retry URL: {url} - {re}")
|
104
|
+
logger.info(f"Unable to get from URL: {url}")
|
105
|
+
return None
|
106
|
+
|
107
|
+
|
108
|
+
def block_until_instance_metadata_service_returns_success(
|
109
|
+
url=INSTANCE_METADATA_SERVICE_IPV4_URI,
|
110
|
+
retry_strategy=RetryIfRetryableHTTPStatusCode,
|
111
|
+
wait_strategy=wait_fixed(2), # wait 2 seconds before retrying,
|
112
|
+
stop_strategy=stop_after_delay(60 * 10), # stop trying after 10 minutes
|
113
|
+
) -> Optional[Response]:
|
114
|
+
"""Blocks until the instance metadata service returns a successful response.
|
115
|
+
|
116
|
+
Args:
|
117
|
+
retry_strategy (Callable): A function that returns a retry strategy.
|
118
|
+
wait_strategy (Callable): A function that returns a wait strategy.
|
119
|
+
stop_strategy (Callable): A function that returns a stop strategy.
|
120
|
+
url (str): The URL of the instance metadata service.
|
121
|
+
|
122
|
+
Returns:
|
123
|
+
Optional[Response]: The response from the instance metadata service,
|
124
|
+
or None if the request failed after the maximum number of retries.
|
125
|
+
|
126
|
+
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
127
|
+
"""
|
128
|
+
# We will get a 403 HTTP status code if running deltacat not in an EC2 instance. In that case we won't want to block.
|
129
|
+
return retrying_get(
|
130
|
+
url,
|
131
|
+
retry_strategy,
|
132
|
+
wait_strategy,
|
133
|
+
stop_strategy,
|
134
|
+
short_circuit_on_status={HTTPStatus.OK, HTTPStatus.FORBIDDEN},
|
135
|
+
)
|
136
|
+
|
137
|
+
|
138
|
+
def _get_session_from_kwargs(input_kwargs):
|
139
|
+
block_until_instance_metadata_service_returns_success()
|
140
|
+
if input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY) is not None:
|
141
|
+
boto3_session = boto3.Session(
|
142
|
+
profile_name=input_kwargs.get(BOTO3_PROFILE_NAME_KWARG_KEY)
|
143
|
+
)
|
144
|
+
input_kwargs.pop(BOTO3_PROFILE_NAME_KWARG_KEY)
|
145
|
+
return boto3_session
|
146
|
+
else:
|
147
|
+
return boto3.Session()
|
148
|
+
|
149
|
+
|
150
|
+
def _resource(name: str, region: Optional[str], **kwargs) -> ServiceResource:
|
151
|
+
boto3_session = _get_session_from_kwargs(kwargs)
|
152
|
+
|
153
|
+
boto_config = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
|
154
|
+
return boto3_session.resource(
|
155
|
+
name,
|
156
|
+
region,
|
157
|
+
config=boto_config,
|
158
|
+
**kwargs,
|
159
|
+
)
|
160
|
+
|
161
|
+
|
162
|
+
def _client(name: str, region: Optional[str], **kwargs) -> BaseClient:
|
163
|
+
try:
|
164
|
+
# try to re-use a client from the resource cache first
|
165
|
+
return resource_cache(name, region, **kwargs).meta.client
|
166
|
+
except ResourceNotExistsError:
|
167
|
+
# fall back for clients without an associated resource
|
168
|
+
boto3_session = _get_session_from_kwargs(kwargs)
|
169
|
+
boto_config = Config(
|
170
|
+
retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"}
|
171
|
+
)
|
172
|
+
return boto3_session.client(
|
173
|
+
name,
|
174
|
+
region,
|
175
|
+
config=boto_config,
|
176
|
+
**kwargs,
|
177
|
+
)
|
178
|
+
|
179
|
+
|
180
|
+
def resource_cache(name: str, region: Optional[str], **kwargs) -> ServiceResource:
|
181
|
+
# we don't use the @lru_cache decorator because Ray can't pickle it
|
182
|
+
cached_function = lru_cache()(_resource)
|
183
|
+
return cached_function(name, region, **kwargs)
|
184
|
+
|
185
|
+
|
186
|
+
def client_cache(name: str, region: Optional[str], **kwargs) -> BaseClient:
|
187
|
+
# we don't use the @lru_cache decorator because Ray can't pickle it
|
188
|
+
cached_function = lru_cache()(_client)
|
189
|
+
return cached_function(name, region, **kwargs)
|
@@ -170,6 +170,10 @@ class ManifestMeta(dict):
|
|
170
170
|
def content_type_parameters(self) -> Optional[List[Dict[str, str]]]:
|
171
171
|
return self.get("content_type_parameters")
|
172
172
|
|
173
|
+
@content_type_parameters.setter
|
174
|
+
def content_type_parameters(self, params: List[Dict[str, str]]) -> None:
|
175
|
+
self["content_type_parameters"] = params
|
176
|
+
|
173
177
|
@property
|
174
178
|
def credentials(self) -> Optional[Dict[str, str]]:
|
175
179
|
return self.get("credentials")
|
@@ -3,6 +3,8 @@ import multiprocessing
|
|
3
3
|
from functools import partial
|
4
4
|
from typing import Any, Callable, Dict, Generator, List, Optional, Union
|
5
5
|
from uuid import uuid4
|
6
|
+
from botocore.config import Config
|
7
|
+
from deltacat.aws.constants import BOTO_MAX_RETRIES
|
6
8
|
|
7
9
|
import pyarrow as pa
|
8
10
|
import ray
|
@@ -39,6 +41,7 @@ from deltacat.types.tables import (
|
|
39
41
|
TABLE_TYPE_TO_READER_FUNC,
|
40
42
|
get_table_length,
|
41
43
|
)
|
44
|
+
from deltacat.types.partial_download import PartialFileDownloadParams
|
42
45
|
from deltacat.utils.common import ReadKwargsProvider
|
43
46
|
|
44
47
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
@@ -197,6 +200,7 @@ def read_file(
|
|
197
200
|
column_names: Optional[List[str]] = None,
|
198
201
|
include_columns: Optional[List[str]] = None,
|
199
202
|
file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
203
|
+
partial_file_download_params: Optional[PartialFileDownloadParams] = None,
|
200
204
|
**s3_client_kwargs,
|
201
205
|
) -> LocalTable:
|
202
206
|
|
@@ -209,6 +213,7 @@ def read_file(
|
|
209
213
|
column_names,
|
210
214
|
include_columns,
|
211
215
|
file_reader_kwargs_provider,
|
216
|
+
partial_file_download_params,
|
212
217
|
**s3_client_kwargs,
|
213
218
|
)
|
214
219
|
return table
|
@@ -217,6 +222,13 @@ def read_file(
|
|
217
222
|
# Timeout error not caught by botocore
|
218
223
|
raise RetryableError(f"Retry table download from: {s3_url}") from e
|
219
224
|
raise NonRetryableError(f"Failed table download from: {s3_url}") from e
|
225
|
+
except BaseException as e:
|
226
|
+
logger.warn(
|
227
|
+
f"Read has failed for {s3_url} and content_type={content_type} "
|
228
|
+
f"and encoding={content_encoding}. Error: {e}",
|
229
|
+
exc_info=True,
|
230
|
+
)
|
231
|
+
raise e
|
220
232
|
|
221
233
|
|
222
234
|
def upload_sliced_table(
|
@@ -385,14 +397,16 @@ def download_manifest_entry(
|
|
385
397
|
content_encoding: Optional[ContentEncoding] = None,
|
386
398
|
) -> LocalTable:
|
387
399
|
|
400
|
+
conf = Config(retries={"max_attempts": BOTO_MAX_RETRIES, "mode": "adaptive"})
|
388
401
|
s3_client_kwargs = (
|
389
402
|
{
|
390
403
|
"aws_access_key_id": token_holder["accessKeyId"],
|
391
404
|
"aws_secret_access_key": token_holder["secretAccessKey"],
|
392
405
|
"aws_session_token": token_holder["sessionToken"],
|
406
|
+
"config": conf,
|
393
407
|
}
|
394
408
|
if token_holder
|
395
|
-
else {}
|
409
|
+
else {"config": conf}
|
396
410
|
)
|
397
411
|
if not content_type:
|
398
412
|
content_type = manifest_entry.meta.content_type
|
@@ -409,6 +423,14 @@ def download_manifest_entry(
|
|
409
423
|
s3_url = manifest_entry.uri
|
410
424
|
if s3_url is None:
|
411
425
|
s3_url = manifest_entry.url
|
426
|
+
|
427
|
+
partial_file_download_params = None
|
428
|
+
if manifest_entry.meta and manifest_entry.meta.content_type_parameters:
|
429
|
+
for type_params in manifest_entry.meta.content_type_parameters:
|
430
|
+
if isinstance(type_params, PartialFileDownloadParams):
|
431
|
+
partial_file_download_params = type_params
|
432
|
+
break
|
433
|
+
|
412
434
|
# @retry decorator can't be pickled by Ray, so wrap download in Retrying
|
413
435
|
retrying = Retrying(
|
414
436
|
wait=wait_random_exponential(multiplier=1, max=60),
|
@@ -424,6 +446,7 @@ def download_manifest_entry(
|
|
424
446
|
column_names,
|
425
447
|
include_columns,
|
426
448
|
file_reader_kwargs_provider,
|
449
|
+
partial_file_download_params,
|
427
450
|
**s3_client_kwargs,
|
428
451
|
)
|
429
452
|
return table
|
@@ -0,0 +1,53 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
|
6
|
+
# Benchmarks for retrieving a single column in the Parquet file
|
7
|
+
SINGLE_COLUMN_BENCHMARKS = {
|
8
|
+
"mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", ["a"]),
|
9
|
+
"TPCH-lineitems-200MB-2RG": (
|
10
|
+
"s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
|
11
|
+
["L_ORDERKEY"],
|
12
|
+
),
|
13
|
+
}
|
14
|
+
|
15
|
+
# Benchmarks for retrieving all columns in the Parquet file
|
16
|
+
ALL_COLUMN_BENCHMARKS = {
|
17
|
+
"mvp": ("s3://daft-public-data/test_fixtures/parquet-dev/mvp.parquet", None),
|
18
|
+
"TPCH-lineitems-200MB-2RG": (
|
19
|
+
"s3://daft-public-data/test_fixtures/parquet-dev/daft_200MB_lineitem_chunk.RG-2.parquet",
|
20
|
+
None,
|
21
|
+
),
|
22
|
+
}
|
23
|
+
|
24
|
+
|
25
|
+
@pytest.mark.benchmark(group="num_rowgroups_single_column")
|
26
|
+
@pytest.mark.parametrize(
|
27
|
+
["name", "path", "columns"],
|
28
|
+
[
|
29
|
+
(name, path, columns)
|
30
|
+
for name, (path, columns) in SINGLE_COLUMN_BENCHMARKS.items()
|
31
|
+
],
|
32
|
+
ids=[name for name in SINGLE_COLUMN_BENCHMARKS],
|
33
|
+
)
|
34
|
+
def test_read_parquet_num_rowgroups_single_column(
|
35
|
+
name, path, columns, read_fn, benchmark
|
36
|
+
):
|
37
|
+
data = benchmark(read_fn, path, columns=columns)
|
38
|
+
if columns is not None:
|
39
|
+
assert data.column_names == columns
|
40
|
+
|
41
|
+
|
42
|
+
@pytest.mark.benchmark(group="num_rowgroups_all_columns")
|
43
|
+
@pytest.mark.parametrize(
|
44
|
+
["name", "path", "columns"],
|
45
|
+
[(name, path, columns) for name, (path, columns) in ALL_COLUMN_BENCHMARKS.items()],
|
46
|
+
ids=[name for name in ALL_COLUMN_BENCHMARKS],
|
47
|
+
)
|
48
|
+
def test_read_parquet_num_rowgroups_all_columns(
|
49
|
+
name, path, columns, read_fn, benchmark
|
50
|
+
):
|
51
|
+
data = benchmark(read_fn, path, columns=columns)
|
52
|
+
if columns is not None:
|
53
|
+
assert data.column_names == columns
|
@@ -0,0 +1,61 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import pyarrow as pa
|
4
|
+
import pyarrow.fs as pafs
|
5
|
+
import pyarrow.parquet as papq
|
6
|
+
import pytest
|
7
|
+
|
8
|
+
from deltacat.utils.pyarrow import s3_file_to_table
|
9
|
+
from deltacat.types.media import (
|
10
|
+
ContentEncoding,
|
11
|
+
ContentType,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
def pyarrow_read(path: str, columns: list[str] | None = None) -> pa.Table:
|
16
|
+
assert path.startswith(
|
17
|
+
"s3://"
|
18
|
+
), f"Expected file path to start with 's3://', but got {path}."
|
19
|
+
fs = pafs.S3FileSystem()
|
20
|
+
path = path.replace("s3://", "")
|
21
|
+
return papq.read_table(path, columns=columns, filesystem=fs)
|
22
|
+
|
23
|
+
|
24
|
+
def deltacat_read(path: str, columns: list[str] | None = None) -> pa.Table:
|
25
|
+
assert path.startswith("s3://")
|
26
|
+
return s3_file_to_table(
|
27
|
+
path,
|
28
|
+
content_type=ContentType.PARQUET,
|
29
|
+
content_encoding=ContentEncoding.IDENTITY,
|
30
|
+
column_names=None, # Parquet files are schemaful
|
31
|
+
include_columns=columns,
|
32
|
+
)
|
33
|
+
|
34
|
+
|
35
|
+
def daft_table_read(path: str, columns: list[str] | None = None) -> pa.Table:
|
36
|
+
try:
|
37
|
+
import daft
|
38
|
+
except ImportError:
|
39
|
+
raise ImportError(
|
40
|
+
"Daft not installed. Install Daft using pip to run these benchmarks: `pip install getdaft`"
|
41
|
+
)
|
42
|
+
|
43
|
+
tbl = daft.table.Table.read_parquet(path, columns=columns)
|
44
|
+
return tbl.to_arrow()
|
45
|
+
|
46
|
+
|
47
|
+
@pytest.fixture(
|
48
|
+
params=[
|
49
|
+
daft_table_read,
|
50
|
+
pyarrow_read,
|
51
|
+
deltacat_read,
|
52
|
+
],
|
53
|
+
ids=[
|
54
|
+
"daft_table",
|
55
|
+
"pyarrow",
|
56
|
+
"deltacat",
|
57
|
+
],
|
58
|
+
)
|
59
|
+
def read_fn(request):
|
60
|
+
"""Fixture which returns the function to read a PyArrow table from a path"""
|
61
|
+
return request.param
|
@@ -5,7 +5,7 @@ import ray
|
|
5
5
|
|
6
6
|
from deltacat.catalog.model.catalog import Catalog, all_catalogs
|
7
7
|
from deltacat.catalog.model.table_definition import TableDefinition
|
8
|
-
from deltacat.compute.compactor.model.sort_key import SortKey
|
8
|
+
from deltacat.storage.model.sort_key import SortKey
|
9
9
|
from deltacat.storage.model.list_result import ListResult
|
10
10
|
from deltacat.storage.model.namespace import Namespace
|
11
11
|
from deltacat.storage.model.types import (
|
@@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Union
|
|
3
3
|
import pyarrow as pa
|
4
4
|
|
5
5
|
from deltacat.catalog.model.table_definition import TableDefinition
|
6
|
-
from deltacat.compute.compactor.model.sort_key import SortKey
|
6
|
+
from deltacat.storage.model.sort_key import SortKey
|
7
7
|
from deltacat.storage.model.list_result import ListResult
|
8
8
|
from deltacat.storage.model.namespace import Namespace
|
9
9
|
from deltacat.storage.model.types import (
|
@@ -13,7 +13,6 @@ from deltacat.compute.compactor.model.round_completion_info import (
|
|
13
13
|
RoundCompletionInfo,
|
14
14
|
HighWatermark,
|
15
15
|
)
|
16
|
-
from deltacat.compute.compactor.model.sort_key import SortKey, SortOrder
|
17
16
|
|
18
17
|
__all__ = [
|
19
18
|
"DeltaAnnotated",
|
@@ -27,6 +26,4 @@ __all__ = [
|
|
27
26
|
"PyArrowWriteResult",
|
28
27
|
"RoundCompletionInfo",
|
29
28
|
"HighWatermark",
|
30
|
-
"SortKey",
|
31
|
-
"SortOrder",
|
32
29
|
]
|