deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
|
-
|
2
|
+
import tempfile
|
3
3
|
import unittest
|
4
|
+
import uuid
|
4
5
|
|
5
6
|
|
6
7
|
class TestCompactPartitionParams(unittest.TestCase):
|
@@ -8,9 +9,14 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
8
9
|
def setUpClass(cls):
|
9
10
|
from deltacat.types.media import ContentType
|
10
11
|
from deltacat.utils.metrics import MetricsConfig, MetricsTarget
|
12
|
+
from deltacat.catalog import CatalogProperties
|
13
|
+
|
14
|
+
# Create a temporary catalog for testing
|
15
|
+
tmpdir = tempfile.mkdtemp()
|
16
|
+
cls.test_catalog = CatalogProperties(root=tmpdir)
|
11
17
|
|
12
18
|
cls.VALID_COMPACT_PARTITION_PARAMS = {
|
13
|
-
"
|
19
|
+
"catalog": cls.test_catalog,
|
14
20
|
"compacted_file_content_type": ContentType.PARQUET,
|
15
21
|
"deltacat_storage": "foobar",
|
16
22
|
"destination_partition_locator": {
|
@@ -23,15 +29,16 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
23
29
|
"tableVersion": "1",
|
24
30
|
},
|
25
31
|
"streamId": "foobar",
|
26
|
-
"
|
32
|
+
"format": "fooType",
|
27
33
|
},
|
28
34
|
"partitionValues": [],
|
29
|
-
"partitionId":
|
35
|
+
"partitionId": str(uuid.uuid4()),
|
30
36
|
},
|
31
37
|
"hash_bucket_count": 200,
|
32
38
|
"last_stream_position_to_compact": 168000000000,
|
33
39
|
"list_deltas_kwargs": {"equivalent_table_types": []},
|
34
40
|
"primary_keys": {"id"},
|
41
|
+
"all_column_names": ["id", "foo", "bar", "baz"],
|
35
42
|
"properties": {
|
36
43
|
"parent_stream_position": "1688000000000",
|
37
44
|
},
|
@@ -47,12 +54,12 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
47
54
|
"table_version": "1",
|
48
55
|
},
|
49
56
|
"streamId": "foobar",
|
50
|
-
"
|
57
|
+
"format": "fooType",
|
51
58
|
},
|
52
59
|
"partitionValues": [],
|
53
60
|
"partitionId": "79612ea39ac5493eae925abe60767d42",
|
54
61
|
},
|
55
|
-
"
|
62
|
+
"table_writer_kwargs": {
|
56
63
|
"version": "1.0",
|
57
64
|
"flavor": "foobar",
|
58
65
|
"coerce_timestamps": "ms",
|
@@ -67,7 +74,7 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
67
74
|
"tableVersion": "2",
|
68
75
|
},
|
69
76
|
"streamId": "foobar",
|
70
|
-
"
|
77
|
+
"format": "fooType",
|
71
78
|
},
|
72
79
|
"partitionValues": [],
|
73
80
|
"partitionId": "79612ea39ac5493eae925abe60767d42",
|
@@ -103,10 +110,8 @@ class TestCompactPartitionParams(unittest.TestCase):
|
|
103
110
|
json.loads(serialized_params)["compacted_file_content_type"]
|
104
111
|
== params.compacted_file_content_type
|
105
112
|
)
|
106
|
-
|
107
|
-
|
108
|
-
== params.compaction_artifact_s3_bucket
|
109
|
-
)
|
113
|
+
catalog_json = json.loads(serialized_params)["catalog"]
|
114
|
+
assert catalog_json["_root"] == params.catalog.root
|
110
115
|
assert (
|
111
116
|
json.loads(serialized_params)["hash_bucket_count"]
|
112
117
|
== params.hash_bucket_count
|
@@ -1,43 +1,40 @@
|
|
1
|
-
import
|
1
|
+
import tempfile
|
2
2
|
import os
|
3
|
-
from
|
3
|
+
from typing import Any, Callable, Dict, List, Optional, Set
|
4
4
|
import pytest
|
5
|
-
import boto3
|
6
|
-
from boto3.resources.base import ServiceResource
|
7
5
|
import pyarrow as pa
|
6
|
+
import ray
|
7
|
+
|
8
8
|
from deltacat.io.file_object_store import FileObjectStore
|
9
9
|
from pytest_benchmark.fixture import BenchmarkFixture
|
10
|
-
import tempfile
|
11
10
|
|
12
11
|
from deltacat.tests.compute.test_util_constant import (
|
13
|
-
TEST_S3_RCF_BUCKET_NAME,
|
14
12
|
DEFAULT_NUM_WORKERS,
|
15
13
|
DEFAULT_WORKER_INSTANCE_CPUS,
|
16
14
|
)
|
17
15
|
from deltacat.tests.compute.test_util_common import (
|
18
|
-
|
16
|
+
get_rci_from_partition,
|
17
|
+
read_audit_file,
|
18
|
+
PartitionKey,
|
19
|
+
get_compacted_delta_locator_from_partition,
|
19
20
|
)
|
20
|
-
from deltacat.tests.test_utils.utils import read_s3_contents
|
21
|
-
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
22
21
|
from deltacat.tests.compute.test_util_common import (
|
23
|
-
|
22
|
+
create_src_w_deltas_destination_rebase_w_deltas_strategy_main,
|
24
23
|
)
|
24
|
+
|
25
|
+
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
25
26
|
from deltacat.compute.compactor.model.compaction_session_audit_info import (
|
26
27
|
CompactionSessionAuditInfo,
|
27
28
|
)
|
28
|
-
from deltacat.tests.compute.test_util_create_table_deltas_repo import (
|
29
|
-
create_src_w_deltas_destination_rebase_w_deltas_strategy,
|
30
|
-
)
|
31
29
|
from deltacat.tests.compute.compact_partition_rebase_test_cases import (
|
32
30
|
REBASE_TEST_CASES,
|
33
31
|
)
|
34
|
-
from
|
35
|
-
from deltacat.types.media import StorageType
|
32
|
+
from deltacat.types.media import StorageType, ContentType
|
36
33
|
from deltacat.storage import (
|
37
34
|
DeltaLocator,
|
38
35
|
Partition,
|
36
|
+
metastore,
|
39
37
|
)
|
40
|
-
from deltacat.types.media import ContentType
|
41
38
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
42
39
|
CompactPartitionParams,
|
43
40
|
)
|
@@ -48,11 +45,6 @@ from deltacat.utils.placement import (
|
|
48
45
|
PlacementGroupManager,
|
49
46
|
)
|
50
47
|
|
51
|
-
DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
|
52
|
-
"db_file_path",
|
53
|
-
"deltacat/tests/local_deltacat_storage/db_test.sqlite",
|
54
|
-
)
|
55
|
-
|
56
48
|
|
57
49
|
"""
|
58
50
|
MODULE scoped fixtures
|
@@ -66,54 +58,11 @@ def setup_ray_cluster():
|
|
66
58
|
ray.shutdown()
|
67
59
|
|
68
60
|
|
69
|
-
@pytest.fixture(autouse=True, scope="module")
|
70
|
-
def mock_aws_credential():
|
71
|
-
os.environ["AWS_ACCESS_KEY_ID"] = "testing"
|
72
|
-
os.environ["AWS_SECRET_ACCESS_ID"] = "testing"
|
73
|
-
os.environ["AWS_SECURITY_TOKEN"] = "testing"
|
74
|
-
os.environ["AWS_SESSION_TOKEN"] = "testing"
|
75
|
-
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"
|
76
|
-
yield
|
77
|
-
|
78
|
-
|
79
|
-
@pytest.fixture(autouse=True, scope="module")
|
80
|
-
def cleanup_the_database_file_after_all_compaction_session_package_tests_complete():
|
81
|
-
# make sure the database file is deleted after all the compactor package tests are completed
|
82
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
83
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
84
|
-
|
85
|
-
|
86
|
-
@pytest.fixture(scope="module")
|
87
|
-
def s3_resource(mock_aws_credential):
|
88
|
-
with mock_s3():
|
89
|
-
yield boto3.resource("s3")
|
90
|
-
|
91
|
-
|
92
|
-
@pytest.fixture(autouse=True, scope="module")
|
93
|
-
def setup_compaction_artifacts_s3_bucket(s3_resource: ServiceResource):
|
94
|
-
s3_resource.create_bucket(
|
95
|
-
ACL="authenticated-read",
|
96
|
-
Bucket=TEST_S3_RCF_BUCKET_NAME,
|
97
|
-
)
|
98
|
-
yield
|
99
|
-
|
100
|
-
|
101
61
|
"""
|
102
62
|
FUNCTION scoped fixtures
|
103
63
|
"""
|
104
64
|
|
105
65
|
|
106
|
-
@pytest.fixture(scope="function")
|
107
|
-
def local_deltacat_storage_kwargs(request: pytest.FixtureRequest):
|
108
|
-
# see deltacat/tests/local_deltacat_storage/README.md for documentation
|
109
|
-
kwargs_for_local_deltacat_storage: Dict[str, Any] = {
|
110
|
-
DATABASE_FILE_PATH_KEY: DATABASE_FILE_PATH_VALUE,
|
111
|
-
}
|
112
|
-
yield kwargs_for_local_deltacat_storage
|
113
|
-
if os.path.exists(DATABASE_FILE_PATH_VALUE):
|
114
|
-
os.remove(DATABASE_FILE_PATH_VALUE)
|
115
|
-
|
116
|
-
|
117
66
|
@pytest.fixture(autouse=True, scope="function")
|
118
67
|
def enable_bucketing_spec_validation(monkeypatch):
|
119
68
|
"""
|
@@ -199,14 +148,13 @@ def enable_bucketing_spec_validation(monkeypatch):
|
|
199
148
|
],
|
200
149
|
ids=[test_name for test_name in REBASE_TEST_CASES],
|
201
150
|
)
|
202
|
-
def
|
151
|
+
def test_compact_partition_rebase_same_source_and_destination_main(
|
203
152
|
mocker,
|
204
|
-
|
205
|
-
local_deltacat_storage_kwargs: Dict[str, Any],
|
153
|
+
main_deltacat_storage_kwargs: Dict[str, Any],
|
206
154
|
test_name: str,
|
207
155
|
primary_keys: Set[str],
|
208
156
|
sort_keys: List[Optional[Any]],
|
209
|
-
partition_keys_param: Optional[List[
|
157
|
+
partition_keys_param: Optional[List[PartitionKey]],
|
210
158
|
partition_values_param: List[Optional[str]],
|
211
159
|
input_deltas_param: List[pa.Array],
|
212
160
|
input_deltas_delta_type: str,
|
@@ -225,21 +173,20 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
225
173
|
compact_partition_func: Callable,
|
226
174
|
benchmark: BenchmarkFixture,
|
227
175
|
):
|
228
|
-
|
229
|
-
|
230
|
-
ds_mock_kwargs = local_deltacat_storage_kwargs
|
176
|
+
ds_mock_kwargs = main_deltacat_storage_kwargs
|
231
177
|
"""
|
232
178
|
This test tests the scenario where source partition locator == destination partition locator,
|
233
179
|
but rebase source partition locator is different.
|
234
180
|
This scenario could occur when hash bucket count changes.
|
181
|
+
|
182
|
+
This version uses the main metastore implementation instead of local storage.
|
235
183
|
"""
|
236
184
|
partition_keys = partition_keys_param
|
237
185
|
(
|
238
186
|
source_table_stream,
|
239
187
|
_,
|
240
188
|
rebased_table_stream,
|
241
|
-
) =
|
242
|
-
primary_keys,
|
189
|
+
) = create_src_w_deltas_destination_rebase_w_deltas_strategy_main(
|
243
190
|
sort_keys,
|
244
191
|
partition_keys,
|
245
192
|
input_deltas_param,
|
@@ -247,14 +194,31 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
247
194
|
partition_values_param,
|
248
195
|
ds_mock_kwargs,
|
249
196
|
)
|
250
|
-
|
197
|
+
|
198
|
+
# Convert partition values for partition lookup (same as in the helper function)
|
199
|
+
converted_partition_values_for_lookup = partition_values_param
|
200
|
+
if partition_values_param and partition_keys:
|
201
|
+
converted_partition_values_for_lookup = []
|
202
|
+
for i, (value, pk) in enumerate(zip(partition_values_param, partition_keys)):
|
203
|
+
if pk.key_type.value == "int": # Use .value to get string representation
|
204
|
+
converted_partition_values_for_lookup.append(int(value))
|
205
|
+
else:
|
206
|
+
converted_partition_values_for_lookup.append(value)
|
207
|
+
|
208
|
+
source_partition: Partition = metastore.get_partition(
|
251
209
|
source_table_stream.locator,
|
252
|
-
|
210
|
+
converted_partition_values_for_lookup,
|
253
211
|
**ds_mock_kwargs,
|
254
212
|
)
|
255
|
-
rebased_partition: Partition =
|
213
|
+
rebased_partition: Partition = metastore.get_partition(
|
256
214
|
rebased_table_stream.locator,
|
257
|
-
|
215
|
+
converted_partition_values_for_lookup,
|
216
|
+
**ds_mock_kwargs,
|
217
|
+
)
|
218
|
+
all_column_names = metastore.get_table_version_column_names(
|
219
|
+
rebased_table_stream.locator.table_locator.namespace,
|
220
|
+
rebased_table_stream.locator.table_locator.table_name,
|
221
|
+
rebased_table_stream.locator.table_version_locator.table_version,
|
258
222
|
**ds_mock_kwargs,
|
259
223
|
)
|
260
224
|
num_workers, worker_instance_cpu = DEFAULT_NUM_WORKERS, DEFAULT_WORKER_INSTANCE_CPUS
|
@@ -269,10 +233,10 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
269
233
|
with tempfile.TemporaryDirectory() as test_dir:
|
270
234
|
compact_partition_params = CompactPartitionParams.of(
|
271
235
|
{
|
272
|
-
"
|
236
|
+
"catalog": ds_mock_kwargs.get("inner"),
|
273
237
|
"compacted_file_content_type": ContentType.PARQUET,
|
274
238
|
"dd_max_parallelism_ratio": 1.0,
|
275
|
-
"deltacat_storage":
|
239
|
+
"deltacat_storage": metastore,
|
276
240
|
"deltacat_storage_kwargs": ds_mock_kwargs,
|
277
241
|
"destination_partition_locator": rebased_partition.locator,
|
278
242
|
"hash_bucket_count": hash_bucket_count_param,
|
@@ -284,11 +248,11 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
284
248
|
"object_store": FileObjectStore(test_dir),
|
285
249
|
"pg_config": pgm,
|
286
250
|
"primary_keys": primary_keys,
|
251
|
+
"all_column_names": all_column_names,
|
287
252
|
"read_kwargs_provider": read_kwargs_provider_param,
|
288
253
|
"rebase_source_partition_locator": source_partition.locator,
|
289
254
|
"rebase_source_partition_high_watermark": rebased_partition.stream_position,
|
290
255
|
"records_per_compacted_file": records_per_compacted_file_param,
|
291
|
-
"s3_client_kwargs": {},
|
292
256
|
"source_partition_locator": rebased_partition.locator,
|
293
257
|
"sort_keys": sort_keys if sort_keys else None,
|
294
258
|
"drop_duplicates": drop_duplicates_param,
|
@@ -305,16 +269,14 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
305
269
|
object_store_put_many_spy = mocker.spy(FileObjectStore, "put_many")
|
306
270
|
|
307
271
|
# execute
|
308
|
-
|
272
|
+
benchmark(compact_partition_func, compact_partition_params)
|
309
273
|
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
audit_bucket, audit_key = RoundCompletionInfo.get_audit_bucket_name_and_key(
|
314
|
-
round_completion_info.compaction_audit_url
|
274
|
+
# Get RoundCompletionInfo from the compacted partition
|
275
|
+
round_completion_info: RoundCompletionInfo = get_rci_from_partition(
|
276
|
+
rebased_partition.locator, metastore, catalog=ds_mock_kwargs.get("inner")
|
315
277
|
)
|
316
278
|
|
317
|
-
# assert if
|
279
|
+
# assert if RCI covers all files
|
318
280
|
if compactor_version != CompactorVersion.V1.value:
|
319
281
|
previous_end = None
|
320
282
|
for start, end in round_completion_info.hb_index_to_entry_range.values():
|
@@ -325,8 +287,12 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
325
287
|
== round_completion_info.compacted_pyarrow_write_result.files
|
326
288
|
)
|
327
289
|
|
328
|
-
|
329
|
-
|
290
|
+
# Get catalog root for audit file resolution
|
291
|
+
catalog = ds_mock_kwargs.get("inner")
|
292
|
+
catalog_root = catalog.root
|
293
|
+
|
294
|
+
compaction_audit_obj: Dict[str, Any] = read_audit_file(
|
295
|
+
round_completion_info.compaction_audit_url, catalog_root
|
330
296
|
)
|
331
297
|
compaction_audit: CompactionSessionAuditInfo = CompactionSessionAuditInfo(
|
332
298
|
**compaction_audit_obj
|
@@ -336,13 +302,17 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
336
302
|
assert (
|
337
303
|
execute_compaction_result_spy.call_args.args[-1] is False
|
338
304
|
), "Table version erroneously marked as in-place compacted!"
|
339
|
-
compacted_delta_locator: DeltaLocator =
|
340
|
-
|
305
|
+
compacted_delta_locator: DeltaLocator = (
|
306
|
+
get_compacted_delta_locator_from_partition(
|
307
|
+
rebased_partition.locator,
|
308
|
+
metastore,
|
309
|
+
catalog=ds_mock_kwargs.get("inner"),
|
310
|
+
)
|
341
311
|
)
|
342
312
|
assert (
|
343
313
|
compacted_delta_locator.stream_position == last_stream_position_to_compact
|
344
314
|
), "Compacted delta locator must be equal to last stream position"
|
345
|
-
tables =
|
315
|
+
tables = metastore.download_delta(
|
346
316
|
compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
|
347
317
|
)
|
348
318
|
actual_rebase_compacted_table = pa.concat_tables(tables)
|
@@ -351,7 +321,7 @@ def test_compact_partition_rebase_same_source_and_destination(
|
|
351
321
|
if primary_keys:
|
352
322
|
sorting_cols.extend([(val, "ascending") for val in primary_keys])
|
353
323
|
if sort_keys:
|
354
|
-
sorting_cols.extend(sort_keys)
|
324
|
+
sorting_cols.extend([pa_key for key in sort_keys for pa_key in key.arrow])
|
355
325
|
|
356
326
|
rebase_expected_compact_partition_result = (
|
357
327
|
rebase_expected_compact_partition_result.combine_chunks().sort_by(
|