deltacat 2.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
from deltacat.constants import DEFAULT_NAMESPACE
|
2
2
|
from deltacat.utils.ray_utils.concurrency import (
|
3
3
|
invoke_parallel,
|
4
4
|
task_resource_options_provider,
|
@@ -13,14 +13,12 @@ from deltacat import logs
|
|
13
13
|
from deltacat.compute.converter.model.converter_session_params import (
|
14
14
|
ConverterSessionParams,
|
15
15
|
)
|
16
|
-
|
17
|
-
|
16
|
+
from typing import Dict, List, Any, Callable
|
18
17
|
from deltacat.compute.converter.constants import DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
|
19
18
|
from deltacat.compute.converter.steps.convert import convert
|
20
19
|
from deltacat.compute.converter.model.convert_input import ConvertInput
|
21
20
|
from deltacat.compute.converter.pyiceberg.overrides import (
|
22
21
|
fetch_all_bucket_files,
|
23
|
-
parquet_files_dict_to_iceberg_data_files,
|
24
22
|
)
|
25
23
|
from deltacat.compute.converter.utils.converter_session_utils import (
|
26
24
|
construct_iceberg_table_prefix,
|
@@ -33,48 +31,112 @@ from deltacat.compute.converter.pyiceberg.catalog import load_table
|
|
33
31
|
from deltacat.compute.converter.utils.converter_session_utils import (
|
34
32
|
group_all_files_to_each_bucket,
|
35
33
|
)
|
34
|
+
from deltacat.compute.converter.model.convert_result import ConvertResult
|
35
|
+
from deltacat.compute.converter.utils.converter_session_utils import (
|
36
|
+
_get_snapshot_action_description,
|
37
|
+
_determine_snapshot_type,
|
38
|
+
SnapshotType,
|
39
|
+
)
|
40
|
+
|
41
|
+
from pyiceberg.manifest import DataFile
|
42
|
+
from pyiceberg.table.metadata import TableMetadata
|
36
43
|
|
37
44
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
38
45
|
|
39
46
|
|
40
|
-
def converter_session(params: ConverterSessionParams, **kwargs):
|
47
|
+
def converter_session(params: ConverterSessionParams, **kwargs: Any) -> TableMetadata:
|
41
48
|
"""
|
42
|
-
Convert equality
|
43
|
-
|
44
|
-
|
49
|
+
Convert equality deletes to position deletes with option to enforce primary key uniqueness.
|
50
|
+
|
51
|
+
This function processes Iceberg table files to convert equality delete files to position delete files.
|
52
|
+
It can optionally enforce primary key uniqueness by keeping only the latest version of each
|
53
|
+
primary key across all data files.
|
54
|
+
|
55
|
+
**Memory Requirements:**
|
56
|
+
- Minimum 512MB of free memory is required to run the converter
|
57
|
+
|
58
|
+
**Process Overview:**
|
59
|
+
1. Fetches all bucket files (data files, equality deletes, position deletes)
|
60
|
+
2. Groups files by bucket for parallel processing
|
61
|
+
3. Converts equality deletes to position deletes using Ray parallel tasks
|
62
|
+
4. Enforces primary key uniqueness if enabled
|
63
|
+
5. Commits appropriate snapshot (append, replace, or delete) to the Iceberg table
|
64
|
+
|
65
|
+
|
66
|
+
Args:
|
67
|
+
params: ConverterSessionParams containing all configuration parameters
|
68
|
+
- catalog: Iceberg catalog instance
|
69
|
+
- iceberg_table_name: Name of the target Iceberg table
|
70
|
+
- enforce_primary_key_uniqueness: Whether to enforce PK uniqueness
|
71
|
+
- iceberg_warehouse_bucket_name: S3 bucket for Iceberg warehouse
|
72
|
+
- iceberg_namespace: Iceberg namespace
|
73
|
+
- merge_keys: Optional list of merge key fields (uses table identifier fields if not provided)
|
74
|
+
- compact_previous_position_delete_files: Whether to compact existing position delete files
|
75
|
+
- task_max_parallelism: Maximum number of parallel Ray tasks
|
76
|
+
- s3_client_kwargs: Additional S3 client configuration
|
77
|
+
- s3_file_system: S3 file system instance
|
78
|
+
- location_provider_prefix_override: Optional prefix override for file locations
|
79
|
+
- position_delete_for_multiple_data_files: Whether to generate position deletes for multiple data files
|
80
|
+
**kwargs: Additional keyword arguments (currently unused)
|
81
|
+
|
82
|
+
Raises:
|
83
|
+
Exception: If snapshot commitment fails or other critical errors occur
|
84
|
+
|
45
85
|
"""
|
46
86
|
|
47
87
|
catalog = params.catalog
|
48
88
|
table_name = params.iceberg_table_name
|
49
|
-
|
89
|
+
if "." not in table_name:
|
90
|
+
iceberg_namespace = params.iceberg_namespace or DEFAULT_NAMESPACE
|
91
|
+
table_name = params.iceberg_table_name
|
92
|
+
table_identifier = f"{iceberg_namespace}.{table_name}"
|
93
|
+
else:
|
94
|
+
table_identifier = table_name
|
95
|
+
identifier_parts = table_identifier.split(".")
|
96
|
+
iceberg_namespace = identifier_parts[0]
|
97
|
+
table_name = identifier_parts[1]
|
98
|
+
iceberg_table = load_table(catalog, table_identifier)
|
50
99
|
enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
|
100
|
+
iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
|
101
|
+
merge_keys = params.merge_keys
|
102
|
+
compact_previous_position_delete_files = (
|
103
|
+
params.compact_previous_position_delete_files
|
104
|
+
)
|
105
|
+
task_max_parallelism = params.task_max_parallelism
|
106
|
+
s3_client_kwargs = params.s3_client_kwargs
|
107
|
+
s3_file_system = params.filesystem
|
108
|
+
location_provider_prefix_override = params.location_provider_prefix_override
|
109
|
+
position_delete_for_multiple_data_files = (
|
110
|
+
params.position_delete_for_multiple_data_files
|
111
|
+
)
|
112
|
+
|
51
113
|
data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
|
52
114
|
iceberg_table
|
53
115
|
)
|
116
|
+
|
54
117
|
convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
|
55
118
|
data_file_dict=data_file_dict,
|
56
119
|
equality_delete_dict=equality_delete_dict,
|
57
120
|
pos_delete_dict=pos_delete_dict,
|
58
121
|
)
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
122
|
+
|
123
|
+
if not location_provider_prefix_override:
|
124
|
+
iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
|
125
|
+
iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
|
126
|
+
table_name=table_name,
|
127
|
+
iceberg_namespace=iceberg_namespace,
|
128
|
+
)
|
129
|
+
else:
|
130
|
+
iceberg_table_warehouse_prefix = location_provider_prefix_override
|
131
|
+
|
67
132
|
# Using table identifier fields as merge keys if merge keys not provided
|
68
133
|
if not merge_keys:
|
69
134
|
identifier_fields_set = iceberg_table.schema().identifier_field_names()
|
70
135
|
identifier_fields = list(identifier_fields_set)
|
71
136
|
else:
|
72
137
|
identifier_fields = merge_keys
|
73
|
-
|
74
|
-
|
75
|
-
f"Multiple identifier fields lookup not supported yet."
|
76
|
-
)
|
77
|
-
convert_options_provider = functools.partial(
|
138
|
+
|
139
|
+
convert_options_provider: Callable = functools.partial(
|
78
140
|
task_resource_options_provider,
|
79
141
|
resource_amount_provider=convert_resource_options_provider,
|
80
142
|
)
|
@@ -86,58 +148,151 @@ def converter_session(params: ConverterSessionParams, **kwargs):
|
|
86
148
|
# Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
|
87
149
|
max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD
|
88
150
|
|
89
|
-
|
90
|
-
|
91
|
-
params.position_delete_for_multiple_data_files
|
92
|
-
)
|
93
|
-
task_max_parallelism = params.task_max_parallelism
|
94
|
-
|
95
|
-
def convert_input_provider(index, item):
|
151
|
+
def convert_input_provider(index: int, item: Any) -> Dict[str, ConvertInput]:
|
152
|
+
task_opts = convert_options_provider(index, item)
|
96
153
|
return {
|
97
154
|
"convert_input": ConvertInput.of(
|
98
|
-
|
155
|
+
convert_input_files=item,
|
99
156
|
convert_task_index=index,
|
100
157
|
iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
|
101
158
|
identifier_fields=identifier_fields,
|
102
|
-
|
159
|
+
compact_previous_position_delete_files=compact_previous_position_delete_files,
|
160
|
+
table_io=iceberg_table.io,
|
161
|
+
table_metadata=iceberg_table.metadata,
|
103
162
|
enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
|
104
163
|
position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
|
105
164
|
max_parallel_data_file_download=max_parallel_data_file_download,
|
165
|
+
s3_client_kwargs=s3_client_kwargs,
|
166
|
+
filesystem=s3_file_system,
|
167
|
+
task_memory=task_opts["memory"],
|
106
168
|
)
|
107
169
|
}
|
108
170
|
|
171
|
+
logger.info(f"Getting remote convert tasks...")
|
109
172
|
# Ray remote task: convert
|
110
|
-
# Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
|
111
173
|
# TODO: Add split mechanism to split large buckets
|
112
174
|
convert_tasks_pending = invoke_parallel(
|
113
|
-
items=convert_input_files_for_all_buckets
|
175
|
+
items=convert_input_files_for_all_buckets,
|
114
176
|
ray_task=convert,
|
115
177
|
max_parallelism=task_max_parallelism,
|
116
178
|
options_provider=convert_options_provider,
|
117
179
|
kwargs_provider=convert_input_provider,
|
118
180
|
)
|
119
|
-
|
120
|
-
|
121
|
-
|
181
|
+
|
182
|
+
to_be_deleted_files_list: List[List[DataFile]] = []
|
183
|
+
logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
|
184
|
+
|
185
|
+
convert_results: List[ConvertResult] = ray.get(convert_tasks_pending)
|
186
|
+
logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")
|
187
|
+
|
188
|
+
total_position_delete_record_count = sum(
|
189
|
+
convert_result.position_delete_record_count
|
190
|
+
for convert_result in convert_results
|
191
|
+
)
|
192
|
+
total_input_data_file_record_count = sum(
|
193
|
+
convert_result.input_data_files_record_count
|
194
|
+
for convert_result in convert_results
|
195
|
+
)
|
196
|
+
total_data_file_hash_columns_in_memory_sizes = sum(
|
197
|
+
convert_result.input_data_files_hash_columns_in_memory_sizes
|
198
|
+
for convert_result in convert_results
|
199
|
+
)
|
200
|
+
total_position_delete_file_in_memory_sizes = sum(
|
201
|
+
convert_result.position_delete_in_memory_sizes
|
202
|
+
for convert_result in convert_results
|
203
|
+
)
|
204
|
+
total_position_delete_on_disk_sizes = sum(
|
205
|
+
convert_result.position_delete_on_disk_sizes
|
206
|
+
for convert_result in convert_results
|
207
|
+
)
|
208
|
+
total_input_data_files_on_disk_size = sum(
|
209
|
+
convert_result.input_data_files_on_disk_size
|
210
|
+
for convert_result in convert_results
|
211
|
+
)
|
212
|
+
|
213
|
+
# Calculate memory usage statistics
|
214
|
+
max_peak_memory_usage = max(
|
215
|
+
convert_result.peak_memory_usage_bytes for convert_result in convert_results
|
216
|
+
)
|
217
|
+
avg_memory_usage_percentage = sum(
|
218
|
+
convert_result.memory_usage_percentage for convert_result in convert_results
|
219
|
+
) / len(convert_results)
|
220
|
+
max_memory_usage_percentage = max(
|
221
|
+
convert_result.memory_usage_percentage for convert_result in convert_results
|
222
|
+
)
|
223
|
+
|
224
|
+
logger.info(
|
225
|
+
f"Aggregated stats for {table_identifier}: "
|
226
|
+
f"total position delete record count: {total_position_delete_record_count}, "
|
227
|
+
f"total input data file record count: {total_input_data_file_record_count}, "
|
228
|
+
f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
|
229
|
+
f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
|
230
|
+
f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}, "
|
231
|
+
f"total input data files on disk size: {total_input_data_files_on_disk_size}, "
|
232
|
+
f"max peak memory usage: {max_peak_memory_usage} bytes, "
|
233
|
+
f"average memory usage percentage: {avg_memory_usage_percentage:.2f}%, "
|
234
|
+
f"max memory usage percentage: {max_memory_usage_percentage:.2f}%"
|
235
|
+
)
|
236
|
+
|
237
|
+
to_be_added_files_list: List[DataFile] = []
|
122
238
|
for convert_result in convert_results:
|
123
|
-
|
124
|
-
|
239
|
+
to_be_added_files = convert_result.to_be_added_files
|
240
|
+
to_be_deleted_files = convert_result.to_be_deleted_files
|
241
|
+
|
242
|
+
to_be_deleted_files_list.extend(to_be_deleted_files.values())
|
243
|
+
to_be_added_files_list.extend(to_be_added_files)
|
125
244
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
245
|
+
logger.info(f"To be deleted files list length: {len(to_be_deleted_files_list)}")
|
246
|
+
logger.info(f"To be added files list length: {len(to_be_added_files_list)}")
|
247
|
+
|
248
|
+
# Determine snapshot type and commit
|
249
|
+
snapshot_type = _determine_snapshot_type(
|
250
|
+
to_be_deleted_files_list, to_be_added_files_list
|
130
251
|
)
|
131
252
|
|
132
|
-
if
|
133
|
-
|
134
|
-
|
135
|
-
|
253
|
+
if snapshot_type == SnapshotType.NONE:
|
254
|
+
logger.info(
|
255
|
+
_get_snapshot_action_description(
|
256
|
+
snapshot_type, to_be_deleted_files_list, to_be_added_files_list
|
257
|
+
)
|
136
258
|
)
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
259
|
+
return
|
260
|
+
|
261
|
+
logger.info(
|
262
|
+
f"Snapshot action: {_get_snapshot_action_description(snapshot_type, to_be_deleted_files_list, to_be_added_files_list)}"
|
263
|
+
)
|
264
|
+
|
265
|
+
try:
|
266
|
+
if snapshot_type == SnapshotType.APPEND:
|
267
|
+
logger.info(f"Committing append snapshot for {table_identifier}.")
|
268
|
+
updated_table_metadata = commit_append_snapshot(
|
269
|
+
iceberg_table=iceberg_table,
|
270
|
+
new_position_delete_files=to_be_added_files_list,
|
271
|
+
)
|
272
|
+
elif snapshot_type == SnapshotType.REPLACE:
|
273
|
+
logger.info(f"Committing replace snapshot for {table_identifier}.")
|
274
|
+
updated_table_metadata = commit_replace_snapshot(
|
275
|
+
iceberg_table=iceberg_table,
|
276
|
+
to_be_deleted_files=to_be_deleted_files_list,
|
277
|
+
new_position_delete_files=to_be_added_files_list,
|
278
|
+
)
|
279
|
+
elif snapshot_type == SnapshotType.DELETE:
|
280
|
+
logger.info(f"Committing delete snapshot for {table_identifier}.")
|
281
|
+
updated_table_metadata = commit_replace_snapshot(
|
282
|
+
iceberg_table=iceberg_table,
|
283
|
+
to_be_deleted_files=to_be_deleted_files_list,
|
284
|
+
new_position_delete_files=[], # No new files to add
|
285
|
+
)
|
286
|
+
else:
|
287
|
+
logger.warning(f"Unexpected snapshot type: {snapshot_type}")
|
288
|
+
return
|
289
|
+
|
290
|
+
logger.info(
|
291
|
+
f"Committed new Iceberg snapshot for {table_identifier}: {updated_table_metadata.current_snapshot_id}"
|
143
292
|
)
|
293
|
+
|
294
|
+
# Return the updated table metadata with the new snapshot
|
295
|
+
return updated_table_metadata
|
296
|
+
except Exception as e:
|
297
|
+
logger.error(f"Failed to commit snapshot for {table_identifier}: {str(e)}")
|
298
|
+
raise
|
@@ -1,20 +1,25 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
from typing import Dict, List
|
2
|
+
from typing import Dict, List, Any, Optional
|
3
3
|
from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
|
4
|
+
from fsspec import AbstractFileSystem
|
4
5
|
|
5
6
|
|
6
7
|
class ConvertInput(Dict):
|
7
8
|
@staticmethod
|
8
9
|
def of(
|
9
|
-
convert_input_files,
|
10
|
-
convert_task_index,
|
11
|
-
iceberg_table_warehouse_prefix,
|
12
|
-
identifier_fields,
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
10
|
+
convert_input_files: ConvertInputFiles,
|
11
|
+
convert_task_index: int,
|
12
|
+
iceberg_table_warehouse_prefix: str,
|
13
|
+
identifier_fields: List[str],
|
14
|
+
table_io: Any,
|
15
|
+
table_metadata: Any,
|
16
|
+
compact_previous_position_delete_files: bool,
|
17
|
+
enforce_primary_key_uniqueness: bool,
|
18
|
+
position_delete_for_multiple_data_files: bool,
|
19
|
+
max_parallel_data_file_download: int,
|
20
|
+
filesystem: Optional[AbstractFileSystem],
|
21
|
+
s3_client_kwargs: Optional[Dict[str, Any]],
|
22
|
+
task_memory: float,
|
18
23
|
) -> ConvertInput:
|
19
24
|
|
20
25
|
result = ConvertInput()
|
@@ -22,13 +27,19 @@ class ConvertInput(Dict):
|
|
22
27
|
result["convert_task_index"] = convert_task_index
|
23
28
|
result["identifier_fields"] = identifier_fields
|
24
29
|
result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
|
25
|
-
result["
|
30
|
+
result["table_io"] = table_io
|
31
|
+
result["table_metadata"] = table_metadata
|
32
|
+
result[
|
33
|
+
"compact_previous_position_delete_files"
|
34
|
+
] = compact_previous_position_delete_files
|
26
35
|
result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
|
27
36
|
result[
|
28
37
|
"position_delete_for_multiple_data_files"
|
29
38
|
] = position_delete_for_multiple_data_files
|
30
39
|
result["max_parallel_data_file_download"] = max_parallel_data_file_download
|
31
|
-
result["
|
40
|
+
result["filesystem"] = filesystem
|
41
|
+
result["s3_client_kwargs"] = s3_client_kwargs
|
42
|
+
result["task_memory"] = task_memory
|
32
43
|
|
33
44
|
return result
|
34
45
|
|
@@ -49,8 +60,16 @@ class ConvertInput(Dict):
|
|
49
60
|
return self["iceberg_table_warehouse_prefix"]
|
50
61
|
|
51
62
|
@property
|
52
|
-
def
|
53
|
-
return self["
|
63
|
+
def table_io(self) -> Any:
|
64
|
+
return self["table_io"]
|
65
|
+
|
66
|
+
@property
|
67
|
+
def table_metadata(self) -> Any:
|
68
|
+
return self["table_metadata"]
|
69
|
+
|
70
|
+
@property
|
71
|
+
def compact_previous_position_delete_files(self) -> bool:
|
72
|
+
return self["compact_previous_position_delete_files"]
|
54
73
|
|
55
74
|
@property
|
56
75
|
def enforce_primary_key_uniqueness(self) -> bool:
|
@@ -65,5 +84,13 @@ class ConvertInput(Dict):
|
|
65
84
|
return self["max_parallel_data_file_download"]
|
66
85
|
|
67
86
|
@property
|
68
|
-
def
|
69
|
-
return self["
|
87
|
+
def filesystem(self) -> Optional[AbstractFileSystem]:
|
88
|
+
return self["filesystem"]
|
89
|
+
|
90
|
+
@property
|
91
|
+
def s3_client_kwargs(self) -> Optional[Dict[str, Any]]:
|
92
|
+
return self["s3_client_kwargs"]
|
93
|
+
|
94
|
+
@property
|
95
|
+
def task_memory(self) -> float:
|
96
|
+
return self["task_memory"]
|
@@ -1,15 +1,21 @@
|
|
1
1
|
from __future__ import annotations
|
2
|
-
from typing import Dict
|
2
|
+
from typing import Dict, List, Any, Optional, Tuple
|
3
|
+
from pyiceberg.manifest import DataFile
|
4
|
+
|
5
|
+
# Type aliases to simplify nested types
|
6
|
+
DataFileWithSequence = Tuple[int, DataFile] # (sequence_number, data_file)
|
7
|
+
DataFileList = List[DataFileWithSequence] # List of data files with sequence numbers
|
8
|
+
DataFileListGroup = List[DataFileList] # Group of data file lists
|
3
9
|
|
4
10
|
|
5
11
|
class ConvertInputFiles(Dict):
|
6
12
|
@staticmethod
|
7
13
|
def of(
|
8
|
-
partition_value,
|
9
|
-
all_data_files_for_dedupe=None,
|
10
|
-
applicable_data_files=None,
|
11
|
-
applicable_equality_delete_files=None,
|
12
|
-
existing_position_delete_files=None,
|
14
|
+
partition_value: Any,
|
15
|
+
all_data_files_for_dedupe: Optional[DataFileList] = None,
|
16
|
+
applicable_data_files: Optional[DataFileListGroup] = None,
|
17
|
+
applicable_equality_delete_files: Optional[DataFileListGroup] = None,
|
18
|
+
existing_position_delete_files: Optional[DataFileList] = None,
|
13
19
|
) -> ConvertInputFiles:
|
14
20
|
|
15
21
|
result = ConvertInputFiles()
|
@@ -21,41 +27,52 @@ class ConvertInputFiles(Dict):
|
|
21
27
|
return result
|
22
28
|
|
23
29
|
@property
|
24
|
-
def partition_value(self):
|
30
|
+
def partition_value(self) -> Any:
|
25
31
|
return self["partition_value"]
|
26
32
|
|
27
33
|
@property
|
28
|
-
def all_data_files_for_dedupe(self):
|
34
|
+
def all_data_files_for_dedupe(self) -> Optional[DataFileList]:
|
29
35
|
return self["all_data_files_for_dedupe"]
|
30
36
|
|
31
37
|
@property
|
32
|
-
def applicable_data_files(self):
|
38
|
+
def applicable_data_files(self) -> Optional[DataFileListGroup]:
|
33
39
|
return self["applicable_data_files"]
|
34
40
|
|
35
41
|
@property
|
36
|
-
def applicable_equality_delete_files(
|
42
|
+
def applicable_equality_delete_files(
|
43
|
+
self,
|
44
|
+
) -> Optional[DataFileListGroup]:
|
37
45
|
return self["applicable_equality_delete_files"]
|
38
46
|
|
39
47
|
@property
|
40
|
-
def existing_position_delete_files(self):
|
48
|
+
def existing_position_delete_files(self) -> Optional[DataFileList]:
|
41
49
|
return self["existing_position_delete_files"]
|
42
50
|
|
43
51
|
@partition_value.setter
|
44
|
-
def partition_value(self, partition_value):
|
52
|
+
def partition_value(self, partition_value: Any) -> None:
|
45
53
|
self["partition_value"] = partition_value
|
46
54
|
|
47
55
|
@all_data_files_for_dedupe.setter
|
48
|
-
def all_data_files_for_dedupe(
|
56
|
+
def all_data_files_for_dedupe(
|
57
|
+
self, all_data_files_for_dedupe: Optional[DataFileList]
|
58
|
+
) -> None:
|
49
59
|
self["all_data_files_for_dedupe"] = all_data_files_for_dedupe
|
50
60
|
|
51
61
|
@applicable_data_files.setter
|
52
|
-
def applicable_data_files(
|
62
|
+
def applicable_data_files(
|
63
|
+
self, applicable_data_files: Optional[DataFileListGroup]
|
64
|
+
) -> None:
|
53
65
|
self["applicable_data_files"] = applicable_data_files
|
54
66
|
|
55
67
|
@applicable_equality_delete_files.setter
|
56
|
-
def applicable_equality_delete_files(
|
68
|
+
def applicable_equality_delete_files(
|
69
|
+
self,
|
70
|
+
applicable_equality_delete_files: Optional[DataFileListGroup],
|
71
|
+
) -> None:
|
57
72
|
self["applicable_equality_delete_files"] = applicable_equality_delete_files
|
58
73
|
|
59
74
|
@existing_position_delete_files.setter
|
60
|
-
def existing_position_delete_files(
|
75
|
+
def existing_position_delete_files(
|
76
|
+
self, existing_position_delete_files: Optional[DataFileList]
|
77
|
+
) -> None:
|
61
78
|
self["existing_position_delete_files"] = existing_position_delete_files
|
@@ -0,0 +1,80 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Dict, List, Any
|
3
|
+
from pyiceberg.manifest import DataFile
|
4
|
+
|
5
|
+
|
6
|
+
class ConvertResult(Dict):
|
7
|
+
@staticmethod
|
8
|
+
def of(
|
9
|
+
convert_task_index: int,
|
10
|
+
to_be_added_files: List[DataFile],
|
11
|
+
to_be_deleted_files: Dict[Any, List[DataFile]],
|
12
|
+
position_delete_record_count: int,
|
13
|
+
input_data_files_record_count: int,
|
14
|
+
input_data_files_hash_columns_in_memory_sizes: int,
|
15
|
+
position_delete_in_memory_sizes: int,
|
16
|
+
position_delete_on_disk_sizes: int,
|
17
|
+
input_data_files_on_disk_size: int,
|
18
|
+
peak_memory_usage_bytes: int,
|
19
|
+
memory_usage_percentage: float,
|
20
|
+
) -> ConvertResult:
|
21
|
+
|
22
|
+
result = ConvertResult()
|
23
|
+
result["convert_task_index"] = convert_task_index
|
24
|
+
result["to_be_added_files"] = to_be_added_files
|
25
|
+
result["to_be_deleted_files"] = to_be_deleted_files
|
26
|
+
result["position_delete_record_count"] = position_delete_record_count
|
27
|
+
result["input_data_files_record_count"] = input_data_files_record_count
|
28
|
+
result[
|
29
|
+
"input_data_files_hash_columns_in_memory_sizes"
|
30
|
+
] = input_data_files_hash_columns_in_memory_sizes
|
31
|
+
result["position_delete_in_memory_sizes"] = position_delete_in_memory_sizes
|
32
|
+
result["position_delete_on_disk_sizes"] = position_delete_on_disk_sizes
|
33
|
+
result["input_data_files_on_disk_size"] = input_data_files_on_disk_size
|
34
|
+
result["peak_memory_usage_bytes"] = peak_memory_usage_bytes
|
35
|
+
result["memory_usage_percentage"] = memory_usage_percentage
|
36
|
+
return result
|
37
|
+
|
38
|
+
@property
|
39
|
+
def convert_task_index(self) -> int:
|
40
|
+
return self["convert_task_index"]
|
41
|
+
|
42
|
+
@property
|
43
|
+
def to_be_added_files(self) -> List[DataFile]:
|
44
|
+
return self["to_be_added_files"]
|
45
|
+
|
46
|
+
@property
|
47
|
+
def to_be_deleted_files(self) -> Dict[Any, List[DataFile]]:
|
48
|
+
return self["to_be_deleted_files"]
|
49
|
+
|
50
|
+
@property
|
51
|
+
def position_delete_record_count(self) -> int:
|
52
|
+
return self["position_delete_record_count"]
|
53
|
+
|
54
|
+
@property
|
55
|
+
def input_data_files_record_count(self) -> int:
|
56
|
+
return self["input_data_files_record_count"]
|
57
|
+
|
58
|
+
@property
|
59
|
+
def input_data_files_hash_columns_in_memory_sizes(self) -> int:
|
60
|
+
return self["input_data_files_hash_columns_in_memory_sizes"]
|
61
|
+
|
62
|
+
@property
|
63
|
+
def position_delete_in_memory_sizes(self) -> int:
|
64
|
+
return self["position_delete_in_memory_sizes"]
|
65
|
+
|
66
|
+
@property
|
67
|
+
def position_delete_on_disk_sizes(self) -> int:
|
68
|
+
return self["position_delete_on_disk_sizes"]
|
69
|
+
|
70
|
+
@property
|
71
|
+
def input_data_files_on_disk_size(self) -> int:
|
72
|
+
return self["input_data_files_on_disk_size"]
|
73
|
+
|
74
|
+
@property
|
75
|
+
def peak_memory_usage_bytes(self) -> int:
|
76
|
+
return self["peak_memory_usage_bytes"]
|
77
|
+
|
78
|
+
@property
|
79
|
+
def memory_usage_percentage(self) -> float:
|
80
|
+
return self["memory_usage_percentage"]
|