deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,6 @@ from deltacat.compute.compactor_v2.model.evaluate_compaction_result import (
|
|
14
14
|
ExecutionCompactionResult,
|
15
15
|
)
|
16
16
|
from deltacat.compute.compactor.model.compactor_version import CompactorVersion
|
17
|
-
from deltacat.compute.compactor.utils import round_completion_file as rcf
|
18
17
|
from deltacat.compute.compactor import DeltaAnnotated
|
19
18
|
from deltacat.compute.compactor_v2.deletes.delete_strategy import (
|
20
19
|
DeleteStrategy,
|
@@ -27,9 +26,9 @@ from deltacat.compute.compactor_v2.deletes.delete_file_envelope import (
|
|
27
26
|
from deltacat.storage import (
|
28
27
|
Delta,
|
29
28
|
DeltaLocator,
|
30
|
-
|
31
|
-
Partition,
|
29
|
+
PartitionLocator,
|
32
30
|
)
|
31
|
+
from deltacat.storage.model.manifest import Manifest
|
33
32
|
from deltacat.compute.compactor.model.compact_partition_params import (
|
34
33
|
CompactPartitionParams,
|
35
34
|
)
|
@@ -37,13 +36,14 @@ from deltacat.utils.resources import (
|
|
37
36
|
get_current_process_peak_memory_usage_in_bytes,
|
38
37
|
)
|
39
38
|
from deltacat.compute.compactor_v2.private.compaction_utils import (
|
39
|
+
_get_rci_source_partition_locator,
|
40
40
|
_fetch_compaction_metadata,
|
41
41
|
_build_uniform_deltas,
|
42
42
|
_group_uniform_deltas,
|
43
43
|
_stage_new_partition,
|
44
44
|
_run_hash_and_merge,
|
45
45
|
_process_merge_results,
|
46
|
-
|
46
|
+
_create_round_completion_info,
|
47
47
|
_commit_compaction_result,
|
48
48
|
)
|
49
49
|
from deltacat.utils.metrics import metrics
|
@@ -65,7 +65,7 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
|
65
65
|
|
66
66
|
@metrics(prefix=COMPACT_PARTITION_METRIC_PREFIX)
|
67
67
|
@categorize_errors
|
68
|
-
def compact_partition(params: CompactPartitionParams, **kwargs) ->
|
68
|
+
def compact_partition(params: CompactPartitionParams, **kwargs) -> None:
|
69
69
|
assert (
|
70
70
|
params.hash_bucket_count is not None and params.hash_bucket_count >= 1
|
71
71
|
), "hash_bucket_count is a required arg for compactor v2"
|
@@ -85,7 +85,6 @@ def compact_partition(params: CompactPartitionParams, **kwargs) -> Optional[str]
|
|
85
85
|
**kwargs,
|
86
86
|
)
|
87
87
|
_commit_compaction_result(params, execute_compaction_result)
|
88
|
-
return execute_compaction_result.round_completion_file_s3_url
|
89
88
|
|
90
89
|
|
91
90
|
def _execute_compaction(
|
@@ -100,12 +99,12 @@ def _execute_compaction(
|
|
100
99
|
previous_compacted_delta_manifest,
|
101
100
|
round_completion_info,
|
102
101
|
) = fetch_compaction_metadata_result
|
103
|
-
|
104
|
-
params
|
102
|
+
rci_source_partition_locator: PartitionLocator = _get_rci_source_partition_locator(
|
103
|
+
params
|
105
104
|
)
|
106
105
|
|
107
|
-
base_audit_url: str =
|
108
|
-
f"
|
106
|
+
base_audit_url: str = rci_source_partition_locator.path(
|
107
|
+
f"{params.compaction_artifact_path}/compaction-audit"
|
109
108
|
)
|
110
109
|
audit_url: str = f"{base_audit_url}.json"
|
111
110
|
logger.info(f"Compaction audit will be written to {audit_url}")
|
@@ -140,9 +139,9 @@ def _execute_compaction(
|
|
140
139
|
)
|
141
140
|
if not input_deltas:
|
142
141
|
logger.info("No input deltas found to compact.")
|
143
|
-
return ExecutionCompactionResult(None, None,
|
142
|
+
return ExecutionCompactionResult(None, None, False)
|
144
143
|
build_uniform_deltas_result: tuple[
|
145
|
-
List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
|
144
|
+
List[DeltaAnnotated], DeleteStrategy, List[DeleteFileEnvelope]
|
146
145
|
] = _build_uniform_deltas(
|
147
146
|
params, compaction_audit, input_deltas, delta_discovery_start
|
148
147
|
)
|
@@ -203,13 +202,13 @@ def _execute_compaction(
|
|
203
202
|
|
204
203
|
compaction_audit.save_round_completion_stats(mat_results)
|
205
204
|
|
206
|
-
compaction_result: ExecutionCompactionResult =
|
205
|
+
compaction_result: ExecutionCompactionResult = _create_round_completion_info(
|
207
206
|
params,
|
208
207
|
compaction_audit,
|
209
208
|
compacted_partition,
|
210
209
|
audit_url,
|
211
210
|
hb_id_to_entry_indices_range,
|
212
|
-
|
211
|
+
rci_source_partition_locator,
|
213
212
|
new_compacted_delta_locator,
|
214
213
|
pyarrow_write_result,
|
215
214
|
round_completion_info,
|
@@ -49,7 +49,7 @@ def _aggregate_delete_deltas(input_deltas: List[Delta]) -> Dict[int, List[Delta]
|
|
49
49
|
] = [
|
50
50
|
(is_delete, list(delete_delta_group))
|
51
51
|
for (is_delete, _), delete_delta_group in itertools.groupby(
|
52
|
-
input_deltas, lambda d: (d.type is DeltaType.DELETE, d.
|
52
|
+
input_deltas, lambda d: (d.type is DeltaType.DELETE, d.meta.entry_params)
|
53
53
|
)
|
54
54
|
]
|
55
55
|
for (
|
@@ -89,11 +89,11 @@ def _get_delete_file_envelopes(
|
|
89
89
|
consecutive_delete_tables: List[pa.Table] = []
|
90
90
|
for delete_delta in delete_delta_sequence:
|
91
91
|
assert (
|
92
|
-
delete_delta.
|
92
|
+
delete_delta.meta.entry_params is not None
|
93
93
|
), "Delete type deltas are required to have delete parameters defined"
|
94
94
|
delete_columns: Optional[
|
95
95
|
List[str]
|
96
|
-
] = delete_delta.
|
96
|
+
] = delete_delta.meta.entry_params.equality_field_locators
|
97
97
|
assert len(delete_columns) > 0, "At least 1 delete column is required"
|
98
98
|
# delete columns should exist in underlying table
|
99
99
|
delete_dataset = params.deltacat_storage.download_delta(
|
@@ -13,7 +13,6 @@ from typing import Optional
|
|
13
13
|
class ExecutionCompactionResult:
|
14
14
|
new_compacted_partition: Optional[Partition]
|
15
15
|
new_round_completion_info: Optional[RoundCompletionInfo]
|
16
|
-
round_completion_file_s3_url: Optional[str]
|
17
16
|
is_inplace_compacted: bool
|
18
17
|
|
19
18
|
def __iter__(self):
|
@@ -4,7 +4,7 @@ from typing import Dict, List, Optional, Any
|
|
4
4
|
from deltacat.utils.metrics import MetricsConfig
|
5
5
|
from deltacat.utils.common import ReadKwargsProvider
|
6
6
|
from deltacat.io.object_store import IObjectStore
|
7
|
-
from deltacat.storage import
|
7
|
+
from deltacat.storage import metastore
|
8
8
|
from deltacat.compute.compactor import DeltaAnnotated
|
9
9
|
|
10
10
|
|
@@ -15,12 +15,13 @@ class HashBucketInput(Dict):
|
|
15
15
|
primary_keys: List[str],
|
16
16
|
num_hash_buckets: int,
|
17
17
|
num_hash_groups: int,
|
18
|
+
all_column_names: List[str],
|
18
19
|
hb_task_index: Optional[int] = 0,
|
19
20
|
enable_profiler: Optional[bool] = False,
|
20
21
|
metrics_config: Optional[MetricsConfig] = None,
|
21
22
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
22
23
|
object_store: Optional[IObjectStore] = None,
|
23
|
-
deltacat_storage=
|
24
|
+
deltacat_storage=metastore,
|
24
25
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
25
26
|
memory_logs_enabled: Optional[bool] = None,
|
26
27
|
) -> HashBucketInput:
|
@@ -31,6 +32,7 @@ class HashBucketInput(Dict):
|
|
31
32
|
result["hb_task_index"] = hb_task_index
|
32
33
|
result["num_hash_buckets"] = num_hash_buckets
|
33
34
|
result["num_hash_groups"] = num_hash_groups
|
35
|
+
result["all_column_names"] = all_column_names
|
34
36
|
result["enable_profiler"] = enable_profiler
|
35
37
|
result["metrics_config"] = metrics_config
|
36
38
|
result["read_kwargs_provider"] = read_kwargs_provider
|
@@ -61,6 +63,10 @@ class HashBucketInput(Dict):
|
|
61
63
|
def num_hash_groups(self) -> int:
|
62
64
|
return self["num_hash_groups"]
|
63
65
|
|
66
|
+
@property
|
67
|
+
def all_column_names(self) -> List[str]:
|
68
|
+
return self["all_column_names"]
|
69
|
+
|
64
70
|
@property
|
65
71
|
def enable_profiler(self) -> Optional[bool]:
|
66
72
|
return self.get("enable_profiler")
|
@@ -78,7 +84,7 @@ class HashBucketInput(Dict):
|
|
78
84
|
return self.get("object_store")
|
79
85
|
|
80
86
|
@property
|
81
|
-
def deltacat_storage(self) ->
|
87
|
+
def deltacat_storage(self) -> metastore:
|
82
88
|
return self.get("deltacat_storage")
|
83
89
|
|
84
90
|
@property
|
@@ -16,7 +16,7 @@ from deltacat.compute.compactor_v2.utils.primary_key_index import (
|
|
16
16
|
hash_group_index_to_hash_bucket_indices,
|
17
17
|
)
|
18
18
|
|
19
|
-
from deltacat.storage import
|
19
|
+
from deltacat.storage import metastore
|
20
20
|
|
21
21
|
from deltacat.io.object_store import IObjectStore
|
22
22
|
|
@@ -87,11 +87,13 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
|
|
87
87
|
def __init__(
|
88
88
|
self,
|
89
89
|
uniform_deltas: List[DeltaAnnotated],
|
90
|
+
all_column_names: List[str],
|
90
91
|
read_kwargs_provider: Optional[ReadKwargsProvider],
|
91
|
-
deltacat_storage=
|
92
|
+
deltacat_storage=metastore,
|
92
93
|
deltacat_storage_kwargs: Optional[dict] = None,
|
93
94
|
):
|
94
95
|
self._deltas = uniform_deltas
|
96
|
+
self._all_column_names = all_column_names
|
95
97
|
self._read_kwargs_provider = read_kwargs_provider
|
96
98
|
self._deltacat_storage = deltacat_storage
|
97
99
|
self._deltacat_storage_kwargs = deltacat_storage_kwargs
|
@@ -110,6 +112,7 @@ class LocalMergeFileGroupsProvider(MergeFileGroupsProvider):
|
|
110
112
|
total_size_bytes,
|
111
113
|
) = read_delta_file_envelopes(
|
112
114
|
annotated_delta,
|
115
|
+
self._all_column_names,
|
113
116
|
self._read_kwargs_provider,
|
114
117
|
self._deltacat_storage,
|
115
118
|
self._deltacat_storage_kwargs,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import Dict, List, Optional, Any
|
3
|
+
from typing import Dict, List, Optional, Any, Set
|
4
4
|
|
5
5
|
from deltacat.compute.compactor_v2.model.merge_file_group import (
|
6
6
|
MergeFileGroupsProvider,
|
@@ -12,9 +12,10 @@ from deltacat.utils.metrics import MetricsConfig
|
|
12
12
|
from deltacat.utils.common import ReadKwargsProvider
|
13
13
|
from deltacat.io.object_store import IObjectStore
|
14
14
|
from deltacat.storage import (
|
15
|
+
Manifest,
|
15
16
|
Partition,
|
16
17
|
SortKey,
|
17
|
-
|
18
|
+
metastore,
|
18
19
|
)
|
19
20
|
from deltacat.compute.compactor_v2.constants import (
|
20
21
|
DROP_DUPLICATES,
|
@@ -32,23 +33,26 @@ class MergeInput(Dict):
|
|
32
33
|
write_to_partition: Partition,
|
33
34
|
compacted_file_content_type: ContentType,
|
34
35
|
primary_keys: List[str],
|
36
|
+
all_column_names: List[str],
|
35
37
|
drop_duplicates: Optional[bool] = DROP_DUPLICATES,
|
36
38
|
sort_keys: Optional[List[SortKey]] = None,
|
37
39
|
merge_task_index: Optional[int] = 0,
|
38
40
|
max_records_per_output_file: Optional[int] = MAX_RECORDS_PER_COMPACTED_FILE,
|
39
41
|
enable_profiler: Optional[bool] = False,
|
40
42
|
metrics_config: Optional[MetricsConfig] = None,
|
41
|
-
|
43
|
+
table_writer_kwargs: Optional[Dict[str, Any]] = None,
|
42
44
|
read_kwargs_provider: Optional[ReadKwargsProvider] = None,
|
43
45
|
round_completion_info: Optional[RoundCompletionInfo] = None,
|
44
46
|
object_store: Optional[IObjectStore] = None,
|
45
47
|
delete_strategy: Optional[DeleteStrategy] = None,
|
46
|
-
delete_file_envelopes: Optional[List] = None,
|
47
|
-
deltacat_storage=
|
48
|
+
delete_file_envelopes: Optional[List[DeleteFileEnvelope]] = None,
|
49
|
+
deltacat_storage=metastore,
|
48
50
|
deltacat_storage_kwargs: Optional[Dict[str, Any]] = None,
|
49
51
|
memory_logs_enabled: Optional[bool] = None,
|
50
52
|
disable_copy_by_reference: Optional[bool] = None,
|
51
53
|
hash_bucket_count: Optional[int] = None,
|
54
|
+
original_fields: Optional[Set[str]] = None,
|
55
|
+
compacted_manifest: Optional[Manifest] = None,
|
52
56
|
) -> MergeInput:
|
53
57
|
|
54
58
|
result = MergeInput()
|
@@ -56,13 +60,14 @@ class MergeInput(Dict):
|
|
56
60
|
result["write_to_partition"] = write_to_partition
|
57
61
|
result["compacted_file_content_type"] = compacted_file_content_type
|
58
62
|
result["primary_keys"] = primary_keys
|
63
|
+
result["all_column_names"] = all_column_names
|
59
64
|
result["drop_duplicates"] = drop_duplicates
|
60
65
|
result["sort_keys"] = sort_keys
|
61
66
|
result["merge_task_index"] = merge_task_index
|
62
67
|
result["max_records_per_output_file"] = max_records_per_output_file
|
63
68
|
result["enable_profiler"] = enable_profiler
|
64
69
|
result["metrics_config"] = metrics_config
|
65
|
-
result["
|
70
|
+
result["table_writer_kwargs"] = table_writer_kwargs or {}
|
66
71
|
result["read_kwargs_provider"] = read_kwargs_provider
|
67
72
|
result["round_completion_info"] = round_completion_info
|
68
73
|
result["object_store"] = object_store
|
@@ -73,6 +78,8 @@ class MergeInput(Dict):
|
|
73
78
|
result["memory_logs_enabled"] = memory_logs_enabled
|
74
79
|
result["disable_copy_by_reference"] = disable_copy_by_reference
|
75
80
|
result["hash_bucket_count"] = hash_bucket_count
|
81
|
+
result["original_fields"] = original_fields
|
82
|
+
result["compacted_manifest"] = compacted_manifest
|
76
83
|
return result
|
77
84
|
|
78
85
|
@property
|
@@ -91,6 +98,10 @@ class MergeInput(Dict):
|
|
91
98
|
def primary_keys(self) -> List[str]:
|
92
99
|
return self["primary_keys"]
|
93
100
|
|
101
|
+
@property
|
102
|
+
def all_column_names(self) -> List[str]:
|
103
|
+
return self["all_column_names"]
|
104
|
+
|
94
105
|
@property
|
95
106
|
def drop_duplicates(self) -> int:
|
96
107
|
return self["drop_duplicates"]
|
@@ -116,8 +127,8 @@ class MergeInput(Dict):
|
|
116
127
|
return self.get("metrics_config")
|
117
128
|
|
118
129
|
@property
|
119
|
-
def
|
120
|
-
return self.get("
|
130
|
+
def table_writer_kwargs(self) -> Optional[Dict[str, Any]]:
|
131
|
+
return self.get("table_writer_kwargs")
|
121
132
|
|
122
133
|
@property
|
123
134
|
def read_kwargs_provider(self) -> Optional[ReadKwargsProvider]:
|
@@ -132,7 +143,7 @@ class MergeInput(Dict):
|
|
132
143
|
return self.get("object_store")
|
133
144
|
|
134
145
|
@property
|
135
|
-
def deltacat_storage(self) ->
|
146
|
+
def deltacat_storage(self) -> metastore:
|
136
147
|
return self["deltacat_storage"]
|
137
148
|
|
138
149
|
@property
|
@@ -160,3 +171,11 @@ class MergeInput(Dict):
|
|
160
171
|
@property
|
161
172
|
def hash_bucket_count(self) -> int:
|
162
173
|
return self["hash_bucket_count"]
|
174
|
+
|
175
|
+
@property
|
176
|
+
def original_fields(self) -> Optional[Set[str]]:
|
177
|
+
return self.get("original_fields")
|
178
|
+
|
179
|
+
@property
|
180
|
+
def compacted_manifest(self) -> Optional[Manifest]:
|
181
|
+
return self.get("compacted_manifest")
|