deltacat 1.1.38__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +150 -12
- deltacat/annotations.py +36 -0
- deltacat/api.py +578 -0
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +84 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +22 -19
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +108 -0
- deltacat/catalog/__init__.py +73 -0
- deltacat/catalog/delegate.py +615 -140
- deltacat/catalog/interface.py +404 -81
- deltacat/catalog/main/impl.py +2882 -0
- deltacat/catalog/model/catalog.py +348 -46
- deltacat/catalog/model/properties.py +155 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +23 -30
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +19 -9
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +9 -22
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +6 -6
- deltacat/compute/compactor/steps/materialize.py +15 -9
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +7 -6
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +13 -14
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +28 -9
- deltacat/compute/compactor_v2/private/compaction_utils.py +171 -73
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +156 -53
- deltacat/compute/compactor_v2/utils/content_type_params.py +17 -6
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +10 -3
- deltacat/compute/compactor_v2/utils/merge.py +14 -2
- deltacat/compute/compactor_v2/utils/task_options.py +2 -10
- deltacat/compute/converter/constants.py +9 -0
- deltacat/compute/converter/converter_session.py +298 -0
- deltacat/compute/converter/model/convert_input.py +96 -0
- deltacat/compute/converter/model/convert_input_files.py +78 -0
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +144 -0
- deltacat/compute/converter/pyiceberg/catalog.py +78 -0
- deltacat/compute/converter/pyiceberg/overrides.py +263 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +299 -0
- deltacat/compute/converter/steps/convert.py +366 -0
- deltacat/compute/converter/steps/dedupe.py +94 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +132 -0
- deltacat/compute/converter/utils/converter_session_utils.py +175 -0
- deltacat/compute/converter/utils/iceberg_columns.py +87 -0
- deltacat/compute/converter/utils/io.py +203 -0
- deltacat/compute/converter/utils/s3u.py +148 -0
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/__init__.py +0 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +11 -1
- deltacat/constants.py +90 -1
- deltacat/docs/__init__.py +0 -0
- deltacat/docs/autogen/__init__.py +0 -0
- deltacat/docs/autogen/schema/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/__init__.py +0 -0
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +61 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/compactor/__init__.py +0 -0
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/__init__.py +0 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/experimental/iceberg/iceberg_bucket_writer.py +184 -0
- deltacat/examples/experimental/iceberg/iceberg_reader.py +147 -0
- deltacat/examples/hello_world.py +29 -0
- deltacat/examples/indexer/__init__.py +0 -0
- deltacat/examples/indexer/aws/__init__.py +0 -0
- deltacat/examples/indexer/gcp/__init__.py +0 -0
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +116 -12
- deltacat/experimental/__init__.py +0 -0
- deltacat/experimental/catalog/__init__.py +0 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/experimental/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/experimental/catalog/iceberg/impl.py +399 -0
- deltacat/experimental/catalog/iceberg/overrides.py +72 -0
- deltacat/experimental/compatibility/__init__.py +0 -0
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/experimental/storage/iceberg/impl.py +739 -0
- deltacat/experimental/storage/iceberg/model.py +713 -0
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/arrow/serializer.py +78 -0
- deltacat/experimental/storage/rivulet/dataset.py +745 -0
- deltacat/experimental/storage/rivulet/dataset_executor.py +79 -0
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/feather/file_reader.py +138 -0
- deltacat/experimental/storage/rivulet/feather/serializer.py +35 -0
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/experimental/storage/rivulet/fs/file_store.py +130 -0
- deltacat/experimental/storage/rivulet/fs/input_file.py +76 -0
- deltacat/experimental/storage/rivulet/fs/output_file.py +86 -0
- deltacat/experimental/storage/rivulet/logical_plan.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/metastore/delta.py +188 -0
- deltacat/experimental/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/experimental/storage/rivulet/metastore/sst.py +82 -0
- deltacat/experimental/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/experimental/storage/rivulet/mvp/Table.py +101 -0
- deltacat/experimental/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/experimental/storage/rivulet/parquet/file_reader.py +129 -0
- deltacat/experimental/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/reader/block_scanner.py +389 -0
- deltacat/experimental/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/experimental/storage/rivulet/reader/data_scan.py +65 -0
- deltacat/experimental/storage/rivulet/reader/dataset_metastore.py +179 -0
- deltacat/experimental/storage/rivulet/reader/dataset_reader.py +158 -0
- deltacat/experimental/storage/rivulet/reader/pyarrow_data_reader.py +124 -0
- deltacat/experimental/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/experimental/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/schema/datatype.py +128 -0
- deltacat/experimental/storage/rivulet/schema/schema.py +251 -0
- deltacat/experimental/storage/rivulet/serializer.py +40 -0
- deltacat/experimental/storage/rivulet/serializer_factory.py +46 -0
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/experimental/storage/rivulet/writer/memtable_dataset_writer.py +305 -0
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +138 -28
- deltacat/storage/interface.py +260 -155
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +3030 -0
- deltacat/storage/model/delta.py +142 -71
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -9
- deltacat/storage/model/manifest.py +643 -0
- deltacat/storage/model/metafile.py +1421 -0
- deltacat/storage/model/namespace.py +41 -18
- deltacat/storage/model/partition.py +443 -43
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +46 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +3160 -0
- deltacat/storage/model/shard.py +51 -0
- deltacat/storage/model/sort_key.py +210 -13
- deltacat/storage/model/stream.py +215 -80
- deltacat/storage/model/table.py +134 -29
- deltacat/storage/model/table_version.py +333 -46
- deltacat/storage/model/transaction.py +1733 -0
- deltacat/storage/model/transform.py +274 -58
- deltacat/storage/model/types.py +138 -16
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +321 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +12154 -66
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +23 -30
- deltacat/tests/compute/compactor/steps/test_repartition.py +14 -14
- deltacat/tests/compute/compactor/utils/test_io.py +125 -123
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +387 -830
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +70 -57
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +0 -3
- deltacat/tests/compute/conftest.py +39 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +826 -0
- deltacat/tests/compute/converter/utils.py +132 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +88 -104
- deltacat/tests/compute/test_compact_partition_incremental.py +91 -98
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +79 -97
- deltacat/tests/compute/test_compact_partition_params.py +16 -11
- deltacat/tests/compute/test_compact_partition_rebase.py +63 -93
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +249 -220
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +726 -46
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/conftest.py +25 -0
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/conftest.py +149 -0
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/fs/test_file_location_provider.py +94 -0
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/experimental/storage/rivulet/test_dataset.py +408 -0
- deltacat/tests/experimental/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/experimental/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/experimental/storage/rivulet/test_utils.py +124 -0
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_write_then_read.py +343 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/experimental/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +8204 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +2440 -0
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +479 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +24 -0
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +653 -0
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1064 -0
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +50 -26
- deltacat/tests/test_utils/storage.py +256 -4
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +124 -34
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1107 -258
- deltacat/types/media.py +345 -37
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +2345 -47
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +61 -0
- deltacat/utils/filesystem.py +450 -0
- deltacat/utils/metafile_locator.py +74 -0
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1212 -178
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/aws/redshift/model/manifest.py +0 -394
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -397
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1262
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-1.1.38.dist-info/METADATA +0 -64
- deltacat-1.1.38.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/model → catalog/main}/__init__.py +0 -0
- /deltacat/compute/{merge_on_read/utils → converter}/__init__.py +0 -0
- /deltacat/{io/aws → compute/converter/model}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter/pyiceberg}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/steps}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-1.1.38.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
# Module-level tuning constants for the DeltaCAT Iceberg converter.

# Upper bound on the number of convert tasks submitted to Ray at once.
DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096

# Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30

# Unique identifier delimiter to ensure different primary key don't end up with same hash when concatenated.
# e.g.: pk column a with value: 1, 12; pk column b with value: 12, 1; Without delimiter will both become "121".
IDENTIFIER_FIELD_DELIMITER = "c303282d"
@@ -0,0 +1,298 @@
|
|
1
|
+
from deltacat.constants import DEFAULT_NAMESPACE
|
2
|
+
from deltacat.utils.ray_utils.concurrency import (
|
3
|
+
invoke_parallel,
|
4
|
+
task_resource_options_provider,
|
5
|
+
)
|
6
|
+
import ray
|
7
|
+
import functools
|
8
|
+
from deltacat.compute.converter.utils.convert_task_options import (
|
9
|
+
convert_resource_options_provider,
|
10
|
+
)
|
11
|
+
import logging
|
12
|
+
from deltacat import logs
|
13
|
+
from deltacat.compute.converter.model.converter_session_params import (
|
14
|
+
ConverterSessionParams,
|
15
|
+
)
|
16
|
+
from typing import Dict, List, Any, Callable
|
17
|
+
from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
|
18
|
+
from deltacat.compute.converter.steps.convert import convert
|
19
|
+
from deltacat.compute.converter.model.convert_input import ConvertInput
|
20
|
+
from deltacat.compute.converter.pyiceberg.overrides import (
|
21
|
+
fetch_all_bucket_files,
|
22
|
+
)
|
23
|
+
from deltacat.compute.converter.utils.converter_session_utils import (
|
24
|
+
construct_iceberg_table_prefix,
|
25
|
+
)
|
26
|
+
from deltacat.compute.converter.pyiceberg.update_snapshot_overrides import (
|
27
|
+
commit_replace_snapshot,
|
28
|
+
commit_append_snapshot,
|
29
|
+
)
|
30
|
+
from deltacat.compute.converter.pyiceberg.catalog import load_table
|
31
|
+
from deltacat.compute.converter.utils.converter_session_utils import (
|
32
|
+
group_all_files_to_each_bucket,
|
33
|
+
)
|
34
|
+
from deltacat.compute.converter.model.convert_result import ConvertResult
|
35
|
+
from deltacat.compute.converter.utils.converter_session_utils import (
|
36
|
+
_get_snapshot_action_description,
|
37
|
+
_determine_snapshot_type,
|
38
|
+
SnapshotType,
|
39
|
+
)
|
40
|
+
|
41
|
+
from pyiceberg.manifest import DataFile
|
42
|
+
from pyiceberg.table.metadata import TableMetadata
|
43
|
+
|
44
|
+
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
45
|
+
|
46
|
+
|
47
|
+
def converter_session(params: ConverterSessionParams, **kwargs: Any) -> TableMetadata:
|
48
|
+
"""
|
49
|
+
Convert equality deletes to position deletes with option to enforce primary key uniqueness.
|
50
|
+
|
51
|
+
This function processes Iceberg table files to convert equality delete files to position delete files.
|
52
|
+
It can optionally enforce primary key uniqueness by keeping only the latest version of each
|
53
|
+
primary key across all data files.
|
54
|
+
|
55
|
+
**Memory Requirements:**
|
56
|
+
- Minimum 512MB of free memory is required to run the converter
|
57
|
+
|
58
|
+
**Process Overview:**
|
59
|
+
1. Fetches all bucket files (data files, equality deletes, position deletes)
|
60
|
+
2. Groups files by bucket for parallel processing
|
61
|
+
3. Converts equality deletes to position deletes using Ray parallel tasks
|
62
|
+
4. Enforces primary key uniqueness if enabled
|
63
|
+
5. Commits appropriate snapshot (append, replace, or delete) to the Iceberg table
|
64
|
+
|
65
|
+
|
66
|
+
Args:
|
67
|
+
params: ConverterSessionParams containing all configuration parameters
|
68
|
+
- catalog: Iceberg catalog instance
|
69
|
+
- iceberg_table_name: Name of the target Iceberg table
|
70
|
+
- enforce_primary_key_uniqueness: Whether to enforce PK uniqueness
|
71
|
+
- iceberg_warehouse_bucket_name: S3 bucket for Iceberg warehouse
|
72
|
+
- iceberg_namespace: Iceberg namespace
|
73
|
+
- merge_keys: Optional list of merge key fields (uses table identifier fields if not provided)
|
74
|
+
- compact_previous_position_delete_files: Whether to compact existing position delete files
|
75
|
+
- task_max_parallelism: Maximum number of parallel Ray tasks
|
76
|
+
- s3_client_kwargs: Additional S3 client configuration
|
77
|
+
- s3_file_system: S3 file system instance
|
78
|
+
- location_provider_prefix_override: Optional prefix override for file locations
|
79
|
+
- position_delete_for_multiple_data_files: Whether to generate position deletes for multiple data files
|
80
|
+
**kwargs: Additional keyword arguments (currently unused)
|
81
|
+
|
82
|
+
Raises:
|
83
|
+
Exception: If snapshot commitment fails or other critical errors occur
|
84
|
+
|
85
|
+
"""
|
86
|
+
|
87
|
+
catalog = params.catalog
|
88
|
+
table_name = params.iceberg_table_name
|
89
|
+
if "." not in table_name:
|
90
|
+
iceberg_namespace = params.iceberg_namespace or DEFAULT_NAMESPACE
|
91
|
+
table_name = params.iceberg_table_name
|
92
|
+
table_identifier = f"{iceberg_namespace}.{table_name}"
|
93
|
+
else:
|
94
|
+
table_identifier = table_name
|
95
|
+
identifier_parts = table_identifier.split(".")
|
96
|
+
iceberg_namespace = identifier_parts[0]
|
97
|
+
table_name = identifier_parts[1]
|
98
|
+
iceberg_table = load_table(catalog, table_identifier)
|
99
|
+
enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
|
100
|
+
iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
|
101
|
+
merge_keys = params.merge_keys
|
102
|
+
compact_previous_position_delete_files = (
|
103
|
+
params.compact_previous_position_delete_files
|
104
|
+
)
|
105
|
+
task_max_parallelism = params.task_max_parallelism
|
106
|
+
s3_client_kwargs = params.s3_client_kwargs
|
107
|
+
s3_file_system = params.filesystem
|
108
|
+
location_provider_prefix_override = params.location_provider_prefix_override
|
109
|
+
position_delete_for_multiple_data_files = (
|
110
|
+
params.position_delete_for_multiple_data_files
|
111
|
+
)
|
112
|
+
|
113
|
+
data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
|
114
|
+
iceberg_table
|
115
|
+
)
|
116
|
+
|
117
|
+
convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
|
118
|
+
data_file_dict=data_file_dict,
|
119
|
+
equality_delete_dict=equality_delete_dict,
|
120
|
+
pos_delete_dict=pos_delete_dict,
|
121
|
+
)
|
122
|
+
|
123
|
+
if not location_provider_prefix_override:
|
124
|
+
iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
|
125
|
+
iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
|
126
|
+
table_name=table_name,
|
127
|
+
iceberg_namespace=iceberg_namespace,
|
128
|
+
)
|
129
|
+
else:
|
130
|
+
iceberg_table_warehouse_prefix = location_provider_prefix_override
|
131
|
+
|
132
|
+
# Using table identifier fields as merge keys if merge keys not provided
|
133
|
+
if not merge_keys:
|
134
|
+
identifier_fields_set = iceberg_table.schema().identifier_field_names()
|
135
|
+
identifier_fields = list(identifier_fields_set)
|
136
|
+
else:
|
137
|
+
identifier_fields = merge_keys
|
138
|
+
|
139
|
+
convert_options_provider: Callable = functools.partial(
|
140
|
+
task_resource_options_provider,
|
141
|
+
resource_amount_provider=convert_resource_options_provider,
|
142
|
+
)
|
143
|
+
|
144
|
+
# TODO (zyiqin): max_parallel_data_file_download should be determined by memory requirement for each bucket.
|
145
|
+
# Specifically, for case when files for one bucket memory requirement exceed one worker node's memory limit, WITHOUT rebasing with larger hash bucket count,
|
146
|
+
# 1. We can control parallel files to download by adjusting max_parallel_data_file_download.
|
147
|
+
# 2. Implement two-layer converter tasks, with convert tasks to spin up child convert tasks.
|
148
|
+
# Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
|
149
|
+
max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
|
150
|
+
|
151
|
+
def convert_input_provider(index: int, item: Any) -> Dict[str, ConvertInput]:
    """Build the keyword arguments for one remote convert task.

    Passed to ``invoke_parallel`` as its ``kwargs_provider``; called once per
    bucket. Wraps the bucket's grouped files together with the session-wide
    settings (closed over from the enclosing function) into a single
    ``ConvertInput`` keyword argument.

    Args:
        index: Index of this convert task within the parallel invocation.
        item: The grouped input files for one bucket (passed through as
            ``convert_input_files``).

    Returns:
        A dict with one key, ``"convert_input"``, holding the ``ConvertInput``.
    """
    # Compute the task's resource options here so the estimated memory
    # amount can be recorded on the ConvertInput below.
    task_opts = convert_options_provider(index, item)
    return {
        "convert_input": ConvertInput.of(
            convert_input_files=item,
            convert_task_index=index,
            iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
            identifier_fields=identifier_fields,
            compact_previous_position_delete_files=compact_previous_position_delete_files,
            table_io=iceberg_table.io,
            table_metadata=iceberg_table.metadata,
            enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
            position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
            max_parallel_data_file_download=max_parallel_data_file_download,
            s3_client_kwargs=s3_client_kwargs,
            filesystem=s3_file_system,
            task_memory=task_opts["memory"],
        )
    }
|
170
|
+
|
171
|
+
logger.info(f"Getting remote convert tasks...")
|
172
|
+
# Ray remote task: convert
|
173
|
+
# TODO: Add split mechanism to split large buckets
|
174
|
+
convert_tasks_pending = invoke_parallel(
|
175
|
+
items=convert_input_files_for_all_buckets,
|
176
|
+
ray_task=convert,
|
177
|
+
max_parallelism=task_max_parallelism,
|
178
|
+
options_provider=convert_options_provider,
|
179
|
+
kwargs_provider=convert_input_provider,
|
180
|
+
)
|
181
|
+
|
182
|
+
to_be_deleted_files_list: List[List[DataFile]] = []
|
183
|
+
logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
|
184
|
+
|
185
|
+
convert_results: List[ConvertResult] = ray.get(convert_tasks_pending)
|
186
|
+
logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")
|
187
|
+
|
188
|
+
total_position_delete_record_count = sum(
|
189
|
+
convert_result.position_delete_record_count
|
190
|
+
for convert_result in convert_results
|
191
|
+
)
|
192
|
+
total_input_data_file_record_count = sum(
|
193
|
+
convert_result.input_data_files_record_count
|
194
|
+
for convert_result in convert_results
|
195
|
+
)
|
196
|
+
total_data_file_hash_columns_in_memory_sizes = sum(
|
197
|
+
convert_result.input_data_files_hash_columns_in_memory_sizes
|
198
|
+
for convert_result in convert_results
|
199
|
+
)
|
200
|
+
total_position_delete_file_in_memory_sizes = sum(
|
201
|
+
convert_result.position_delete_in_memory_sizes
|
202
|
+
for convert_result in convert_results
|
203
|
+
)
|
204
|
+
total_position_delete_on_disk_sizes = sum(
|
205
|
+
convert_result.position_delete_on_disk_sizes
|
206
|
+
for convert_result in convert_results
|
207
|
+
)
|
208
|
+
total_input_data_files_on_disk_size = sum(
|
209
|
+
convert_result.input_data_files_on_disk_size
|
210
|
+
for convert_result in convert_results
|
211
|
+
)
|
212
|
+
|
213
|
+
# Calculate memory usage statistics
|
214
|
+
max_peak_memory_usage = max(
|
215
|
+
convert_result.peak_memory_usage_bytes for convert_result in convert_results
|
216
|
+
)
|
217
|
+
avg_memory_usage_percentage = sum(
|
218
|
+
convert_result.memory_usage_percentage for convert_result in convert_results
|
219
|
+
) / len(convert_results)
|
220
|
+
max_memory_usage_percentage = max(
|
221
|
+
convert_result.memory_usage_percentage for convert_result in convert_results
|
222
|
+
)
|
223
|
+
|
224
|
+
logger.info(
|
225
|
+
f"Aggregated stats for {table_identifier}: "
|
226
|
+
f"total position delete record count: {total_position_delete_record_count}, "
|
227
|
+
f"total input data file record count: {total_input_data_file_record_count}, "
|
228
|
+
f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
|
229
|
+
f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
|
230
|
+
f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}, "
|
231
|
+
f"total input data files on disk size: {total_input_data_files_on_disk_size}, "
|
232
|
+
f"max peak memory usage: {max_peak_memory_usage} bytes, "
|
233
|
+
f"average memory usage percentage: {avg_memory_usage_percentage:.2f}%, "
|
234
|
+
f"max memory usage percentage: {max_memory_usage_percentage:.2f}%"
|
235
|
+
)
|
236
|
+
|
237
|
+
to_be_added_files_list: List[DataFile] = []
|
238
|
+
for convert_result in convert_results:
|
239
|
+
to_be_added_files = convert_result.to_be_added_files
|
240
|
+
to_be_deleted_files = convert_result.to_be_deleted_files
|
241
|
+
|
242
|
+
to_be_deleted_files_list.extend(to_be_deleted_files.values())
|
243
|
+
to_be_added_files_list.extend(to_be_added_files)
|
244
|
+
|
245
|
+
logger.info(f"To be deleted files list length: {len(to_be_deleted_files_list)}")
|
246
|
+
logger.info(f"To be added files list length: {len(to_be_added_files_list)}")
|
247
|
+
|
248
|
+
# Determine snapshot type and commit
|
249
|
+
snapshot_type = _determine_snapshot_type(
|
250
|
+
to_be_deleted_files_list, to_be_added_files_list
|
251
|
+
)
|
252
|
+
|
253
|
+
if snapshot_type == SnapshotType.NONE:
|
254
|
+
logger.info(
|
255
|
+
_get_snapshot_action_description(
|
256
|
+
snapshot_type, to_be_deleted_files_list, to_be_added_files_list
|
257
|
+
)
|
258
|
+
)
|
259
|
+
return
|
260
|
+
|
261
|
+
logger.info(
|
262
|
+
f"Snapshot action: {_get_snapshot_action_description(snapshot_type, to_be_deleted_files_list, to_be_added_files_list)}"
|
263
|
+
)
|
264
|
+
|
265
|
+
try:
|
266
|
+
if snapshot_type == SnapshotType.APPEND:
|
267
|
+
logger.info(f"Committing append snapshot for {table_identifier}.")
|
268
|
+
updated_table_metadata = commit_append_snapshot(
|
269
|
+
iceberg_table=iceberg_table,
|
270
|
+
new_position_delete_files=to_be_added_files_list,
|
271
|
+
)
|
272
|
+
elif snapshot_type == SnapshotType.REPLACE:
|
273
|
+
logger.info(f"Committing replace snapshot for {table_identifier}.")
|
274
|
+
updated_table_metadata = commit_replace_snapshot(
|
275
|
+
iceberg_table=iceberg_table,
|
276
|
+
to_be_deleted_files=to_be_deleted_files_list,
|
277
|
+
new_position_delete_files=to_be_added_files_list,
|
278
|
+
)
|
279
|
+
elif snapshot_type == SnapshotType.DELETE:
|
280
|
+
logger.info(f"Committing delete snapshot for {table_identifier}.")
|
281
|
+
updated_table_metadata = commit_replace_snapshot(
|
282
|
+
iceberg_table=iceberg_table,
|
283
|
+
to_be_deleted_files=to_be_deleted_files_list,
|
284
|
+
new_position_delete_files=[], # No new files to add
|
285
|
+
)
|
286
|
+
else:
|
287
|
+
logger.warning(f"Unexpected snapshot type: {snapshot_type}")
|
288
|
+
return
|
289
|
+
|
290
|
+
logger.info(
|
291
|
+
f"Committed new Iceberg snapshot for {table_identifier}: {updated_table_metadata.current_snapshot_id}"
|
292
|
+
)
|
293
|
+
|
294
|
+
# Return the updated table metadata with the new snapshot
|
295
|
+
return updated_table_metadata
|
296
|
+
except Exception as e:
|
297
|
+
logger.error(f"Failed to commit snapshot for {table_identifier}: {str(e)}")
|
298
|
+
raise
|
@@ -0,0 +1,96 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Dict, List, Any, Optional
|
3
|
+
from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
|
4
|
+
from fsspec import AbstractFileSystem
|
5
|
+
|
6
|
+
|
7
|
+
class ConvertInput(Dict):
    """Dictionary-backed parameter bundle for a single convert task.

    Built once per bucket via :meth:`of` and handed to the remote ``convert``
    task. All values live in the underlying dict and are exposed through
    read-only properties, so the object stays trivially serializable.
    """

    @staticmethod
    def of(
        convert_input_files: ConvertInputFiles,
        convert_task_index: int,
        iceberg_table_warehouse_prefix: str,
        identifier_fields: List[str],
        table_io: Any,
        table_metadata: Any,
        compact_previous_position_delete_files: bool,
        enforce_primary_key_uniqueness: bool,
        position_delete_for_multiple_data_files: bool,
        max_parallel_data_file_download: int,
        filesystem: Optional[AbstractFileSystem],
        s3_client_kwargs: Optional[Dict[str, Any]],
        task_memory: float,
    ) -> ConvertInput:
        """Create a ConvertInput populated with every task parameter."""
        convert_input = ConvertInput()
        convert_input.update(
            {
                "convert_input_files": convert_input_files,
                "convert_task_index": convert_task_index,
                "identifier_fields": identifier_fields,
                "iceberg_table_warehouse_prefix": iceberg_table_warehouse_prefix,
                "table_io": table_io,
                "table_metadata": table_metadata,
                "compact_previous_position_delete_files": compact_previous_position_delete_files,
                "enforce_primary_key_uniqueness": enforce_primary_key_uniqueness,
                "position_delete_for_multiple_data_files": position_delete_for_multiple_data_files,
                "max_parallel_data_file_download": max_parallel_data_file_download,
                "filesystem": filesystem,
                "s3_client_kwargs": s3_client_kwargs,
                "task_memory": task_memory,
            }
        )
        return convert_input

    @property
    def convert_input_files(self) -> ConvertInputFiles:
        # Grouped input files (data/equality-delete/position-delete) for one bucket.
        return self["convert_input_files"]

    @property
    def identifier_fields(self) -> List[str]:
        # Field names used as the record identity (merge keys).
        return self["identifier_fields"]

    @property
    def convert_task_index(self) -> int:
        # Index of this task within the parallel convert invocation.
        return self["convert_task_index"]

    @property
    def iceberg_table_warehouse_prefix(self) -> str:
        # Warehouse path prefix under which new files are written.
        return self["iceberg_table_warehouse_prefix"]

    @property
    def table_io(self) -> Any:
        # Iceberg table IO handle (opaque here).
        return self["table_io"]

    @property
    def table_metadata(self) -> Any:
        # Iceberg table metadata (opaque here).
        return self["table_metadata"]

    @property
    def compact_previous_position_delete_files(self) -> bool:
        return self["compact_previous_position_delete_files"]

    @property
    def enforce_primary_key_uniqueness(self) -> bool:
        return self["enforce_primary_key_uniqueness"]

    @property
    def position_delete_for_multiple_data_files(self) -> bool:
        return self["position_delete_for_multiple_data_files"]

    @property
    def max_parallel_data_file_download(self) -> int:
        # Cap on concurrent data-file downloads within the task.
        return self["max_parallel_data_file_download"]

    @property
    def filesystem(self) -> Optional[AbstractFileSystem]:
        return self["filesystem"]

    @property
    def s3_client_kwargs(self) -> Optional[Dict[str, Any]]:
        return self["s3_client_kwargs"]

    @property
    def task_memory(self) -> float:
        # Memory (bytes) estimated/allotted for this task by the options provider.
        return self["task_memory"]
|
@@ -0,0 +1,78 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Dict, List, Any, Optional, Tuple
|
3
|
+
from pyiceberg.manifest import DataFile
|
4
|
+
|
5
|
+
# Type aliases to simplify the nested file-grouping types used below.
DataFileWithSequence = Tuple[int, DataFile]  # (sequence_number, data_file)
DataFileList = List[DataFileWithSequence]  # List of data files with sequence numbers
DataFileListGroup = List[DataFileList]  # Group of data file lists
|
9
|
+
|
10
|
+
|
11
|
+
class ConvertInputFiles(Dict):
    """Dictionary-backed grouping of Iceberg files for one partition.

    Created via :meth:`of`; every field is stored as a dict entry and exposed
    through a read/write property. All file-list fields default to ``None``.
    """

    @staticmethod
    def of(
        partition_value: Any,
        all_data_files_for_dedupe: Optional[DataFileList] = None,
        applicable_data_files: Optional[DataFileListGroup] = None,
        applicable_equality_delete_files: Optional[DataFileListGroup] = None,
        existing_position_delete_files: Optional[DataFileList] = None,
    ) -> ConvertInputFiles:
        """Create a ConvertInputFiles for a single partition."""
        convert_input_files = ConvertInputFiles()
        convert_input_files.update(
            {
                "partition_value": partition_value,
                "all_data_files_for_dedupe": all_data_files_for_dedupe,
                "applicable_data_files": applicable_data_files,
                "applicable_equality_delete_files": applicable_equality_delete_files,
                "existing_position_delete_files": existing_position_delete_files,
            }
        )
        return convert_input_files

    @property
    def partition_value(self) -> Any:
        # Partition this file group belongs to.
        return self["partition_value"]

    @partition_value.setter
    def partition_value(self, value: Any) -> None:
        self["partition_value"] = value

    @property
    def all_data_files_for_dedupe(self) -> Optional[DataFileList]:
        # All (sequence_number, data_file) pairs considered for dedupe.
        return self["all_data_files_for_dedupe"]

    @all_data_files_for_dedupe.setter
    def all_data_files_for_dedupe(self, files: Optional[DataFileList]) -> None:
        self["all_data_files_for_dedupe"] = files

    @property
    def applicable_data_files(self) -> Optional[DataFileListGroup]:
        # Data file lists applicable to this partition's convert task.
        return self["applicable_data_files"]

    @applicable_data_files.setter
    def applicable_data_files(self, files: Optional[DataFileListGroup]) -> None:
        self["applicable_data_files"] = files

    @property
    def applicable_equality_delete_files(self) -> Optional[DataFileListGroup]:
        # Equality-delete file lists applicable to this partition.
        return self["applicable_equality_delete_files"]

    @applicable_equality_delete_files.setter
    def applicable_equality_delete_files(
        self, files: Optional[DataFileListGroup]
    ) -> None:
        self["applicable_equality_delete_files"] = files

    @property
    def existing_position_delete_files(self) -> Optional[DataFileList]:
        # Position-delete files already present for this partition.
        return self["existing_position_delete_files"]

    @existing_position_delete_files.setter
    def existing_position_delete_files(self, files: Optional[DataFileList]) -> None:
        self["existing_position_delete_files"] = files
|
@@ -0,0 +1,80 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Dict, List, Any
|
3
|
+
from pyiceberg.manifest import DataFile
|
4
|
+
|
5
|
+
|
6
|
+
class ConvertResult(Dict):
    """Dictionary-backed result of a single convert task.

    Carries the files to add/delete plus record-count and memory statistics;
    values live in the underlying dict and are read through properties.
    """

    @staticmethod
    def of(
        convert_task_index: int,
        to_be_added_files: List[DataFile],
        to_be_deleted_files: Dict[Any, List[DataFile]],
        position_delete_record_count: int,
        input_data_files_record_count: int,
        input_data_files_hash_columns_in_memory_sizes: int,
        position_delete_in_memory_sizes: int,
        position_delete_on_disk_sizes: int,
        input_data_files_on_disk_size: int,
        peak_memory_usage_bytes: int,
        memory_usage_percentage: float,
    ) -> ConvertResult:
        """Create a ConvertResult populated with all task outputs and stats."""
        convert_result = ConvertResult()
        convert_result.update(
            {
                "convert_task_index": convert_task_index,
                "to_be_added_files": to_be_added_files,
                "to_be_deleted_files": to_be_deleted_files,
                "position_delete_record_count": position_delete_record_count,
                "input_data_files_record_count": input_data_files_record_count,
                "input_data_files_hash_columns_in_memory_sizes": input_data_files_hash_columns_in_memory_sizes,
                "position_delete_in_memory_sizes": position_delete_in_memory_sizes,
                "position_delete_on_disk_sizes": position_delete_on_disk_sizes,
                "input_data_files_on_disk_size": input_data_files_on_disk_size,
                "peak_memory_usage_bytes": peak_memory_usage_bytes,
                "memory_usage_percentage": memory_usage_percentage,
            }
        )
        return convert_result

    @property
    def convert_task_index(self) -> int:
        # Index of the convert task that produced this result.
        return self["convert_task_index"]

    @property
    def to_be_added_files(self) -> List[DataFile]:
        # New files the task produced, to be committed to the table.
        return self["to_be_added_files"]

    @property
    def to_be_deleted_files(self) -> Dict[Any, List[DataFile]]:
        # Files to remove, keyed per group (values are file lists).
        return self["to_be_deleted_files"]

    @property
    def position_delete_record_count(self) -> int:
        return self["position_delete_record_count"]

    @property
    def input_data_files_record_count(self) -> int:
        return self["input_data_files_record_count"]

    @property
    def input_data_files_hash_columns_in_memory_sizes(self) -> int:
        return self["input_data_files_hash_columns_in_memory_sizes"]

    @property
    def position_delete_in_memory_sizes(self) -> int:
        return self["position_delete_in_memory_sizes"]

    @property
    def position_delete_on_disk_sizes(self) -> int:
        return self["position_delete_on_disk_sizes"]

    @property
    def input_data_files_on_disk_size(self) -> int:
        return self["input_data_files_on_disk_size"]

    @property
    def peak_memory_usage_bytes(self) -> int:
        # Peak memory observed while the task ran, in bytes.
        return self["peak_memory_usage_bytes"]

    @property
    def memory_usage_percentage(self) -> float:
        # Peak memory as a percentage of the task's allotted memory.
        return self["memory_usage_percentage"]
|