deltacat-2.0-py3-none-any.whl → deltacat-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +117 -18
- deltacat/api.py +536 -126
- deltacat/aws/constants.py +0 -23
- deltacat/aws/s3u.py +4 -631
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -19
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +64 -5
- deltacat/catalog/delegate.py +445 -63
- deltacat/catalog/interface.py +188 -62
- deltacat/catalog/main/impl.py +2444 -282
- deltacat/catalog/model/catalog.py +208 -113
- deltacat/catalog/model/properties.py +63 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/compactor/compaction_session.py +97 -75
- deltacat/compute/compactor/model/compact_partition_params.py +75 -30
- deltacat/compute/compactor/model/compaction_session_audit_info.py +17 -0
- deltacat/compute/compactor/model/round_completion_info.py +16 -6
- deltacat/compute/compactor/repartition_session.py +8 -21
- deltacat/compute/compactor/steps/hash_bucket.py +5 -5
- deltacat/compute/compactor/steps/materialize.py +9 -7
- deltacat/compute/compactor/steps/repartition.py +12 -11
- deltacat/compute/compactor/utils/io.py +6 -5
- deltacat/compute/compactor/utils/round_completion_reader.py +117 -0
- deltacat/compute/compactor/utils/system_columns.py +3 -1
- deltacat/compute/compactor_v2/compaction_session.py +17 -14
- deltacat/compute/compactor_v2/constants.py +30 -1
- deltacat/compute/compactor_v2/model/evaluate_compaction_result.py +0 -1
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +9 -3
- deltacat/compute/compactor_v2/model/merge_file_group.py +5 -2
- deltacat/compute/compactor_v2/model/merge_input.py +33 -8
- deltacat/compute/compactor_v2/private/compaction_utils.py +167 -68
- deltacat/compute/compactor_v2/steps/hash_bucket.py +5 -2
- deltacat/compute/compactor_v2/steps/merge.py +267 -55
- deltacat/compute/compactor_v2/utils/content_type_params.py +34 -6
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/delta.py +5 -3
- deltacat/compute/compactor_v2/utils/io.py +11 -4
- deltacat/compute/compactor_v2/utils/merge.py +15 -2
- deltacat/compute/compactor_v2/utils/primary_key_index.py +28 -4
- deltacat/compute/compactor_v2/utils/task_options.py +45 -33
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +207 -52
- deltacat/compute/converter/model/convert_input.py +43 -16
- deltacat/compute/converter/model/convert_input_files.py +33 -16
- deltacat/compute/converter/model/convert_result.py +80 -0
- deltacat/compute/converter/model/converter_session_params.py +64 -19
- deltacat/compute/converter/pyiceberg/catalog.py +21 -18
- deltacat/compute/converter/pyiceberg/overrides.py +193 -65
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +148 -100
- deltacat/compute/converter/steps/convert.py +230 -75
- deltacat/compute/converter/steps/dedupe.py +46 -12
- deltacat/compute/converter/utils/convert_task_options.py +66 -22
- deltacat/compute/converter/utils/converter_session_utils.py +126 -60
- deltacat/compute/converter/utils/iceberg_columns.py +13 -8
- deltacat/compute/converter/utils/io.py +173 -13
- deltacat/compute/converter/utils/s3u.py +42 -27
- deltacat/compute/janitor.py +205 -0
- deltacat/compute/jobs/client.py +417 -0
- deltacat/compute/resource_estimation/delta.py +38 -6
- deltacat/compute/resource_estimation/model.py +8 -0
- deltacat/constants.py +49 -6
- deltacat/docs/autogen/schema/inference/generate_type_mappings.py +687 -0
- deltacat/docs/autogen/schema/inference/parse_json_type_mappings.py +673 -0
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/compactor/aws/__init__.py +1 -0
- deltacat/examples/compactor/bootstrap.py +863 -0
- deltacat/examples/compactor/compactor.py +373 -0
- deltacat/examples/compactor/explorer.py +473 -0
- deltacat/examples/compactor/gcp/__init__.py +1 -0
- deltacat/examples/compactor/job_runner.py +439 -0
- deltacat/examples/compactor/utils/__init__.py +1 -0
- deltacat/examples/compactor/utils/common.py +261 -0
- deltacat/examples/experimental/iceberg/converter/beam/app.py +226 -0
- deltacat/examples/experimental/iceberg/converter/beam/main.py +133 -0
- deltacat/examples/experimental/iceberg/converter/beam/test_workflow.py +113 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/__init__.py +3 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/common.py +174 -0
- deltacat/examples/experimental/iceberg/converter/beam/utils/spark.py +263 -0
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +66 -21
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/exceptions.py +66 -4
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +43 -12
- deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +12 -14
- deltacat/experimental/compatibility/backfill_locator_to_id_mappings.py +201 -0
- deltacat/experimental/converter_agent/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/__init__.py +0 -0
- deltacat/experimental/converter_agent/beam/managed.py +173 -0
- deltacat/experimental/converter_agent/table_monitor.py +479 -0
- deltacat/experimental/daft/__init__.py +4 -0
- deltacat/experimental/daft/daft_catalog.py +229 -0
- deltacat/experimental/storage/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/__init__.py +0 -0
- deltacat/experimental/storage/iceberg/iceberg_scan_planner.py +129 -0
- deltacat/{storage → experimental/storage}/iceberg/impl.py +6 -4
- deltacat/{storage → experimental/storage}/iceberg/model.py +7 -3
- deltacat/experimental/storage/iceberg/visitor.py +119 -0
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/experimental/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -12
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/experimental/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/experimental/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +7 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +579 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +22 -2
- deltacat/storage/interface.py +54 -32
- deltacat/storage/main/impl.py +1494 -541
- deltacat/storage/model/delta.py +27 -3
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/locator.py +6 -12
- deltacat/storage/model/manifest.py +231 -6
- deltacat/storage/model/metafile.py +224 -119
- deltacat/storage/model/namespace.py +8 -1
- deltacat/storage/model/partition.py +117 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/schema.py +2427 -159
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/sort_key.py +40 -0
- deltacat/storage/model/stream.py +9 -2
- deltacat/storage/model/table.py +12 -1
- deltacat/storage/model/table_version.py +11 -0
- deltacat/storage/model/transaction.py +1184 -208
- deltacat/storage/model/transform.py +81 -2
- deltacat/storage/model/types.py +53 -29
- deltacat/storage/util/__init__.py +0 -0
- deltacat/storage/util/scan_planner.py +26 -0
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/_io/test_cloudpickle_bug_fix.py +8 -4
- deltacat/tests/aws/test_s3u.py +2 -31
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +1972 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +103 -106
- deltacat/tests/catalog/test_default_catalog_impl.py +12152 -72
- deltacat/tests/compute/compact_partition_test_cases.py +35 -8
- deltacat/tests/compute/compactor/steps/test_repartition.py +12 -12
- deltacat/tests/compute/compactor/utils/test_io.py +124 -120
- deltacat/tests/compute/compactor/utils/test_round_completion_reader.py +254 -0
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +423 -312
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +266 -0
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +45 -0
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +270 -1
- deltacat/tests/compute/conftest.py +8 -44
- deltacat/tests/compute/converter/test_convert_session.py +697 -349
- deltacat/tests/compute/converter/utils.py +15 -6
- deltacat/tests/compute/resource_estimation/test_delta.py +145 -79
- deltacat/tests/compute/test_compact_partition_incremental.py +103 -70
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +89 -66
- deltacat/tests/compute/test_compact_partition_params.py +13 -8
- deltacat/tests/compute/test_compact_partition_rebase.py +77 -62
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +263 -193
- deltacat/tests/compute/test_janitor.py +236 -0
- deltacat/tests/compute/test_util_common.py +716 -43
- deltacat/tests/compute/test_util_constant.py +0 -1
- deltacat/tests/{storage/conftest.py → conftest.py} +1 -1
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +1 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/compatibility/__init__.py +1 -0
- deltacat/tests/experimental/compatibility/test_backfill_locator_to_id_mappings.py +582 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/storage/main/test_main_storage.py +6900 -95
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_manifest.py +129 -0
- deltacat/tests/storage/model/test_metafile_io.py +78 -173
- deltacat/tests/storage/model/test_partition_scheme.py +85 -0
- deltacat/tests/storage/model/test_schema.py +171 -0
- deltacat/tests/storage/model/test_schema_update.py +1925 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/storage/model/test_sort_scheme.py +90 -0
- deltacat/tests/storage/model/test_transaction.py +393 -48
- deltacat/tests/storage/model/test_transaction_history.py +886 -0
- deltacat/tests/test_deltacat_api.py +1036 -11
- deltacat/tests/test_exceptions.py +9 -5
- deltacat/tests/test_utils/pyarrow.py +52 -21
- deltacat/tests/test_utils/storage.py +23 -34
- deltacat/tests/types/__init__.py +0 -0
- deltacat/tests/types/test_tables.py +104 -0
- deltacat/tests/utils/exceptions.py +22 -0
- deltacat/tests/utils/main_deltacat_storage_mock.py +31 -0
- deltacat/tests/utils/ray_utils/test_dataset.py +123 -5
- deltacat/tests/utils/test_daft.py +121 -31
- deltacat/tests/utils/test_numpy.py +1193 -0
- deltacat/tests/utils/test_pandas.py +1106 -0
- deltacat/tests/utils/test_polars.py +1040 -0
- deltacat/tests/utils/test_pyarrow.py +1370 -89
- deltacat/types/media.py +345 -37
- deltacat/types/tables.py +2344 -46
- deltacat/utils/arguments.py +33 -1
- deltacat/utils/daft.py +824 -40
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +139 -9
- deltacat/utils/metafile_locator.py +2 -1
- deltacat/utils/numpy.py +118 -26
- deltacat/utils/pandas.py +577 -48
- deltacat/utils/polars.py +759 -0
- deltacat/utils/pyarrow.py +1373 -192
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/dataset.py +101 -10
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/reader_compatibility_mapping.py +3083 -0
- deltacat/utils/url.py +1325 -0
- deltacat-2.0.0.dist-info/METADATA +1163 -0
- deltacat-2.0.0.dist-info/RECORD +439 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/WHEEL +1 -1
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/compute/compactor/utils/round_completion_file.py +0 -97
- deltacat/compute/merge_on_read/__init__.py +0 -4
- deltacat/compute/merge_on_read/daft.py +0 -40
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +0 -66
- deltacat/compute/merge_on_read/utils/delta.py +0 -42
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/iceberg/iceberg_scan_planner.py +0 -28
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- deltacat/tests/compute/compactor/utils/test_round_completion_file.py +0 -231
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +0 -388
- deltacat/tests/local_deltacat_storage/__init__.py +0 -1235
- deltacat/tests/local_deltacat_storage/exceptions.py +0 -10
- deltacat/utils/s3fs.py +0 -21
- deltacat-2.0.dist-info/METADATA +0 -65
- deltacat-2.0.dist-info/RECORD +0 -347
- /deltacat/compute/{merge_on_read/model → jobs}/__init__.py +0 -0
- /deltacat/{compute/merge_on_read/utils → docs}/__init__.py +0 -0
- /deltacat/{examples/common → docs/autogen}/__init__.py +0 -0
- /deltacat/{examples/iceberg → docs/autogen/schema}/__init__.py +0 -0
- /deltacat/{storage/iceberg → docs/autogen/schema/inference}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/compactor}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/experimental}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → examples/experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/rivulet/reader → examples/experimental/iceberg/converter}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → examples/experimental/iceberg/converter/beam}/__init__.py +0 -0
- /deltacat/{storage/rivulet/writer → examples/indexer}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/catalog}/__init__.py +0 -0
- /deltacat/{storage/rivulet/parquet/data_reader.py → experimental/compatibility/__init__.py} +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info/licenses}/LICENSE +0 -0
- {deltacat-2.0.dist-info → deltacat-2.0.0.dist-info}/top_level.txt +0 -0
deltacat/compute/converter/steps/convert.py

@@ -8,36 +8,55 @@ import ray
 import logging
 from deltacat.compute.converter.model.convert_input import ConvertInput
 from deltacat.compute.converter.steps.dedupe import dedupe_data_files
-from deltacat.compute.converter.utils.
+from deltacat.compute.converter.utils.io import write_sliced_table
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
 from deltacat.compute.converter.utils.converter_session_utils import (
     partition_value_record_to_partition_value_string,
+    sort_data_files_maintaining_order,
 )
-
+from deltacat.compute.converter.pyiceberg.overrides import (
+    parquet_files_dict_to_iceberg_data_files,
+)
+from deltacat.compute.converter.model.convert_result import ConvertResult
+from pyiceberg.manifest import DataFileContent
 from deltacat import logs
+from fsspec import AbstractFileSystem
+from typing import List, Dict, Tuple, Optional, Any
+from deltacat.utils.resources import get_current_process_peak_memory_usage_in_bytes
+from deltacat.compute.converter.model.convert_input_files import (
+    DataFileList,
+    DataFileListGroup,
+)
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 @ray.remote
-def convert(convert_input: ConvertInput):
+def convert(convert_input: ConvertInput) -> ConvertResult:
     convert_input_files = convert_input.convert_input_files
     convert_task_index = convert_input.convert_task_index
     iceberg_table_warehouse_prefix = convert_input.iceberg_table_warehouse_prefix
     identifier_fields = convert_input.identifier_fields
-
+    table_io = convert_input.table_io
+    table_metadata = convert_input.table_metadata
+    compact_previous_position_delete_files = (
+        convert_input.compact_previous_position_delete_files
+    )
     position_delete_for_multiple_data_files = (
         convert_input.position_delete_for_multiple_data_files
     )
     max_parallel_data_file_download = convert_input.max_parallel_data_file_download
-
+    filesystem = convert_input.filesystem
+    s3_client_kwargs = convert_input.s3_client_kwargs
+    task_memory = convert_input.task_memory
+
     if not position_delete_for_multiple_data_files:
         raise NotImplementedError(
             f"Distributed file level position delete compute is not supported yet"
         )
-    if
+    if compact_previous_position_delete_files:
         raise NotImplementedError(f"Compact previous position delete not supported yet")
 
     logger.info(f"Starting convert task index: {convert_task_index}")
@@ -46,96 +65,214 @@ def convert(convert_input: ConvertInput):
     applicable_equality_delete_files = (
         convert_input_files.applicable_equality_delete_files
     )
+
     all_data_files_for_this_bucket = convert_input_files.all_data_files_for_dedupe
 
     partition_value_str = partition_value_record_to_partition_value_string(
         convert_input_files.partition_value
     )
     partition_value = convert_input_files.partition_value
-
-
-
+
+    if partition_value_str:
+        iceberg_table_warehouse_prefix_with_partition = (
+            f"{iceberg_table_warehouse_prefix}/{partition_value_str}"
+        )
+    else:
+        iceberg_table_warehouse_prefix_with_partition = (
+            f"{iceberg_table_warehouse_prefix}"
+        )
+
     enforce_primary_key_uniqueness = convert_input.enforce_primary_key_uniqueness
     total_pos_delete_table = []
+    data_table_after_converting_equality_delete = []
     if applicable_equality_delete_files:
         (
-            pos_delete_after_converting_equality_delete
+            pos_delete_after_converting_equality_delete,
+            data_table_after_converting_equality_delete,
         ) = compute_pos_delete_with_limited_parallelism(
             data_files_list=applicable_data_files,
             identifier_columns=identifier_fields,
             equality_delete_files_list=applicable_equality_delete_files,
             iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            convert_task_index=convert_task_index,
             max_parallel_data_file_download=max_parallel_data_file_download,
-            s3_file_system=
+            s3_file_system=filesystem,
+            s3_client_kwargs=s3_client_kwargs,
         )
         if pos_delete_after_converting_equality_delete:
             total_pos_delete_table.append(pos_delete_after_converting_equality_delete)
 
     if enforce_primary_key_uniqueness:
+        data_files_downloaded_during_convert = []
+        if applicable_data_files:
+            for file_list in applicable_data_files:
+                for file in file_list:
+                    data_files_downloaded_during_convert.append(file)
+
         data_files_to_dedupe = get_additional_applicable_data_files(
             all_data_files=all_data_files_for_this_bucket,
-            data_files_downloaded=
+            data_files_downloaded=data_files_downloaded_during_convert,
+        )
+
+        dedupe_file_size_bytes = sum(
+            data_file.file_size_in_bytes for _, data_file in data_files_to_dedupe
+        )
+        logger.info(
+            f"Total on-disk size of files to dedupe: {dedupe_file_size_bytes} bytes"
         )
-
+
+        logger.info(
+            f"[Convert task {convert_task_index}]: Got {len(data_files_to_dedupe)} files to dedupe."
+        )
+
+        (
+            pos_delete_after_dedupe,
+            data_file_to_dedupe_record_count,
+            data_file_to_dedupe_size,
+        ) = dedupe_data_files(
            data_file_to_dedupe=data_files_to_dedupe,
-            identify_column_name_concatenated=identifier_fields[0],
            identifier_columns=identifier_fields,
+            remaining_data_table_after_convert=data_table_after_converting_equality_delete,
            merge_sort_column=sc._ORDERED_RECORD_IDX_COLUMN_NAME,
+            s3_client_kwargs=s3_client_kwargs,
+        )
+        logger.info(
+            f"[Convert task {convert_task_index}]: Dedupe produced {len(pos_delete_after_dedupe)} position delete records."
        )
         total_pos_delete_table.append(pos_delete_after_dedupe)
 
     total_pos_delete = pa.concat_tables(total_pos_delete_table)
-
-
-
-        s3_table_writer_kwargs={},
-        s3_file_system=s3_file_system,
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Total position delete produced:{len(total_pos_delete)}"
    )
 
+    to_be_added_files_list = []
+    if total_pos_delete:
+        to_be_added_files_list_parquet = write_sliced_table(
+            table=total_pos_delete,
+            base_path=iceberg_table_warehouse_prefix_with_partition,
+            table_writer_kwargs={},
+            filesystem=filesystem,
+        )
+
+        to_be_added_files_dict = defaultdict()
+        to_be_added_files_dict[partition_value] = to_be_added_files_list_parquet
+
+        logger.info(
+            f"[Convert task {convert_task_index}]: Produced {len(to_be_added_files_list_parquet)} position delete files."
+        )
+        file_content_type = DataFileContent.POSITION_DELETES
+        to_be_added_files_list = parquet_files_dict_to_iceberg_data_files(
+            io=table_io,
+            table_metadata=table_metadata,
+            files_dict=to_be_added_files_dict,
+            file_content_type=file_content_type,
+        )
+
     to_be_delete_files_dict = defaultdict()
+
     if applicable_equality_delete_files:
         to_be_delete_files_dict[partition_value] = [
             equality_delete_file[1]
-            for
+            for equality_delete_list in applicable_equality_delete_files
+            for equality_delete_file in equality_delete_list
         ]
-    to_be_added_files_dict = defaultdict()
-    to_be_added_files_dict[partition_value] = to_be_added_files_list
-    return (to_be_delete_files_dict, to_be_added_files_dict)
 
+    if not enforce_primary_key_uniqueness:
+        data_file_to_dedupe_record_count = 0
+        data_file_to_dedupe_size = 0
+
+    peak_memory_usage_bytes = (
+        get_current_process_peak_memory_usage_in_bytes()
+    )  # Convert KB to bytes
+    memory_usage_percentage = (peak_memory_usage_bytes / task_memory) * 100
+
+    logger.info(
+        f"[Convert task {convert_task_index}]: Memory usage stats - "
+        f"Peak memory usage: {peak_memory_usage_bytes} bytes, "
+        f"Allocated task memory: {convert_input.task_memory} bytes, "
+        f"Usage percentage: {memory_usage_percentage:.2f}%"
+    )
 
-
-
+    convert_res = ConvertResult.of(
+        convert_task_index=convert_task_index,
+        to_be_added_files=to_be_added_files_list,
+        to_be_deleted_files=to_be_delete_files_dict,
+        position_delete_record_count=len(total_pos_delete),
+        input_data_files_record_count=data_file_to_dedupe_record_count,
+        input_data_files_hash_columns_in_memory_sizes=data_file_to_dedupe_size,
+        position_delete_in_memory_sizes=int(total_pos_delete.nbytes),
+        position_delete_on_disk_sizes=sum(
+            file.file_size_in_bytes for file in to_be_added_files_list
+        ),
+        input_data_files_on_disk_size=dedupe_file_size_bytes,
+        peak_memory_usage_bytes=peak_memory_usage_bytes,
+        memory_usage_percentage=memory_usage_percentage,
+    )
+    return convert_res
+
+
+def get_additional_applicable_data_files(
+    all_data_files: DataFileList,
+    data_files_downloaded: DataFileList,
+) -> DataFileList:
+    data_file_to_dedupe = []
+    assert len(set(all_data_files)) >= len(set(data_files_downloaded)), (
+        f"Length of all data files ({len(set(all_data_files))}) should never be less than "
+        f"the length of candidate equality delete data files ({len(set(data_files_downloaded))})"
+    )
     if data_files_downloaded:
-
+        # set1.difference(set2) returns elements in set1 but not in set2
+        data_file_to_dedupe.extend(
+            list(set(data_file_to_dedupe).difference(set(data_files_downloaded)))
+        )
+    else:
+        data_file_to_dedupe = all_data_files
     return data_file_to_dedupe
 
 
 def filter_rows_to_be_deleted(
-    equality_delete_table
-
-
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    identifier_column = sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME
     if equality_delete_table and data_file_table:
         equality_deletes = pc.is_in(
             data_file_table[identifier_column],
             equality_delete_table[identifier_column],
         )
+        data_file_record_remaining = pc.invert(
+            pc.is_in(
+                data_file_table[identifier_column],
+                equality_delete_table[identifier_column],
+            )
+        )
         position_delete_table = data_file_table.filter(equality_deletes)
-
-
-
-
+        remaining_data_table = data_file_table.filter(data_file_record_remaining)
+
+        position_delete_table = position_delete_table.drop(
+            [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME]
        )
-
+        assert len(position_delete_table) + len(remaining_data_table) == len(
+            data_file_table
+        ), (
+            f"Expected undeleted data file record count plus length of pos deletes to match original data file record count of {len(data_file_table)}, "
+            f"but found {len(position_delete_table)} pos deletes + {len(remaining_data_table)} equality deletes."
+        )
+
+    return position_delete_table, remaining_data_table
 
 
 def compute_pos_delete_converting_equality_deletes(
-    equality_delete_table,
-    data_file_table,
-    identifier_columns,
-    iceberg_table_warehouse_prefix_with_partition,
-    s3_file_system,
-):
-    new_position_delete_table = filter_rows_to_be_deleted(
+    equality_delete_table: Optional[pa.Table],
+    data_file_table: Optional[pa.Table],
+    identifier_columns: List[str],
+    iceberg_table_warehouse_prefix_with_partition: str,
+    s3_file_system: Optional[AbstractFileSystem],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    new_position_delete_table, remaining_data_table = filter_rows_to_be_deleted(
         data_file_table=data_file_table,
         equality_delete_table=equality_delete_table,
         identifier_columns=identifier_columns,
@@ -144,44 +281,47 @@ def compute_pos_delete_converting_equality_deletes(
         logger.info(
             f"Length of position delete table after converting from equality deletes:{len(new_position_delete_table)}"
         )
+        return new_position_delete_table, remaining_data_table
+    elif not remaining_data_table:
+        return None, None
     else:
-        return None
-    return new_position_delete_table
 
+        return None, remaining_data_table
 
-def
-
-
-
-
-
-
+
+def compute_pos_delete_with_limited_parallelism(
+    data_files_list: DataFileListGroup,
+    identifier_columns: List[str],
+    equality_delete_files_list: DataFileListGroup,
+    iceberg_table_warehouse_prefix_with_partition: str,
+    convert_task_index: int,
+    max_parallel_data_file_download: int,
+    s3_file_system: Optional[AbstractFileSystem],
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[Optional[pa.Table], Optional[pa.Table]]:
+    assert len(data_files_list) == len(equality_delete_files_list), (
+        f"Number of lists of data files should equal to number of list of equality delete files, "
+        f"But got {len(data_files_list)} data files lists vs {len(equality_delete_files_list)}."
    )
-    return compacted_table, equality_delete_table
 
-
-def compute_pos_delete_with_limited_parallelism(
-    data_files_list,
-    identifier_columns,
-    equality_delete_files_list,
-    iceberg_table_warehouse_prefix_with_partition,
-    max_parallel_data_file_download,
-    s3_file_system,
-):
+    new_pos_delete_table_total = []
     for data_files, equality_delete_files in zip(
         data_files_list, equality_delete_files_list
     ):
         data_table_total = []
+
+        # Sort data files by file sequence number first, then file path to
+        # make sure files having same sequence number are deterministically sorted
+        data_files = sort_data_files_maintaining_order(data_files=data_files)
+
         for data_file in data_files:
             data_table = download_data_table_and_append_iceberg_columns(
-
+                file=data_file[1],
                 columns_to_download=identifier_columns,
                 additional_columns_to_append=[
                     sc._FILE_PATH_COLUMN_NAME,
                     sc._ORDERED_RECORD_IDX_COLUMN_NAME,
                 ],
-
+                s3_client_kwargs=s3_client_kwargs,
             )
             data_table_total.append(data_table)
         data_table_total = pa.concat_tables(data_table_total)
@@ -189,23 +329,38 @@ def compute_pos_delete_with_limited_parallelism(
         equality_delete_table_total = []
         for equality_delete in equality_delete_files:
             equality_delete_table = download_data_table_and_append_iceberg_columns(
-
+                file=equality_delete[1],
                 columns_to_download=identifier_columns,
+                s3_client_kwargs=s3_client_kwargs,
             )
             equality_delete_table_total.append(equality_delete_table)
         equality_delete_table_total = pa.concat_tables(equality_delete_table_total)
 
-
-
-
-
-
-
-
-
-
+        (
+            new_pos_delete_table,
+            remaining_data_table,
+        ) = compute_pos_delete_converting_equality_deletes(
+            equality_delete_table=equality_delete_table_total,
+            data_file_table=data_table_total,
+            iceberg_table_warehouse_prefix_with_partition=iceberg_table_warehouse_prefix_with_partition,
+            identifier_columns=identifier_columns,
+            s3_file_system=s3_file_system,
+        )
+        new_pos_delete_table_total.append(new_pos_delete_table)
+
+    if new_pos_delete_table_total:
+        new_pos_delete_table_total = pa.concat_tables(new_pos_delete_table_total)
 
     logger.info(
-        f"
+        f"[Convert task {convert_task_index}]: Find deletes got {len(data_table_total)} data table records, "
+        f"{len(equality_delete_table_total)} equality deletes as input, "
+        f"Produced {len(new_pos_delete_table_total)} position deletes based off find deletes input."
     )
-
+
+    if not new_pos_delete_table_total:
+        logger.info("No records deleted based on equality delete convertion")
+
+    if not remaining_data_table:
+        logger.info("No data table remaining after converting equality deletes")
+
+    return new_pos_delete_table_total, remaining_data_table
deltacat/compute/converter/steps/dedupe.py

@@ -4,20 +4,33 @@ import deltacat.compute.converter.utils.iceberg_columns as sc
 from deltacat.compute.converter.utils.io import (
     download_data_table_and_append_iceberg_columns,
 )
+from deltacat.compute.converter.utils.converter_session_utils import (
+    sort_data_files_maintaining_order,
+)
+import logging
+from deltacat import logs
+from typing import List, Dict, Tuple, Optional, Any
+from pyiceberg.manifest import DataFile
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def dedupe_data_files(
-    data_file_to_dedupe,
-
-
-    merge_sort_column,
-
+    data_file_to_dedupe: List[Tuple[int, DataFile]],
+    identifier_columns: List[str],
+    remaining_data_table_after_convert: Optional[pa.Table],
+    merge_sort_column: str,
+    s3_client_kwargs: Optional[Dict[str, Any]],
+) -> Tuple[pa.Table, int, int]:
     data_file_table = []
+    if remaining_data_table_after_convert:
+        data_file_table.append(remaining_data_table_after_convert)
 
-
-
+    data_file_to_dedupe = sort_data_files_maintaining_order(
+        data_files=data_file_to_dedupe
+    )
+    downloaded_data_file_record_count = 0
     for file_tuple in data_file_to_dedupe:
-        sequence_number = file_tuple[0]
         data_file = file_tuple[1]
         data_file_to_dedupe_table = download_data_table_and_append_iceberg_columns(
             file=data_file,
@@ -26,12 +39,26 @@ def dedupe_data_files(
                 sc._FILE_PATH_COLUMN_NAME,
                 sc._ORDERED_RECORD_IDX_COLUMN_NAME,
             ],
-
+            s3_client_kwargs=s3_client_kwargs,
        )
+        logger.info(
+            f"Length of downloaded data file table: {len(data_file_to_dedupe_table)}"
+        )
+        downloaded_data_file_record_count += len(data_file_to_dedupe_table)
         data_file_table.append(data_file_to_dedupe_table)
 
     final_data_to_dedupe = pa.concat_tables(data_file_table)
 
+    dedupe_input_record_count = downloaded_data_file_record_count
+    if remaining_data_table_after_convert:
+        dedupe_input_record_count += len(remaining_data_table_after_convert)
+    assert len(final_data_to_dedupe) == dedupe_input_record_count, (
+        f"Mismatch record count while performing table concat, Got {len(final_data_to_dedupe)} in final table, "
+        f"while input table length is: {dedupe_input_record_count}"
+    )
+
+    logger.info(f"Length of pyarrow table to dedupe:{len(final_data_to_dedupe)}")
+
     record_idx_iterator = iter(range(len(final_data_to_dedupe)))
 
     # Append global record index to used as aggregate column
@@ -40,7 +67,7 @@ def dedupe_data_files(
     )
 
     final_data_table_indices = final_data_to_dedupe.group_by(
-
+        sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME, use_threads=False
     ).aggregate([(sc._GLOBAL_RECORD_IDX_COLUMN_NAME, "max")])
 
     pos_delete_indices = pc.invert(
@@ -55,6 +82,13 @@ def dedupe_data_files(
     final_data_table_to_delete = final_data_to_dedupe.filter(pos_delete_indices)
 
     final_data_table_to_delete = final_data_table_to_delete.drop(
-        [
+        [sc._IDENTIFIER_COLUMNS_HASH_COLUMN_NAME, sc._GLOBAL_RECORD_IDX_COLUMN_NAME]
+    )
+    logger.info(
+        f"Deduped {len(final_data_table_to_delete)} Records based off identifier columns."
+    )
+    return (
+        final_data_table_to_delete,
+        len(final_data_to_dedupe),
+        int(final_data_to_dedupe.nbytes),
     )
-    return final_data_table_to_delete
deltacat/compute/converter/utils/convert_task_options.py

@@ -1,26 +1,36 @@
-from typing import Optional, Dict
+from typing import Optional, Dict, List, Tuple, Any
 from deltacat.exceptions import RetryableError
+from pyiceberg.manifest import DataFile
+from deltacat.compute.converter.model.convert_input_files import ConvertInputFiles
 
-AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES =
+AVERAGE_FILE_PATH_COLUMN_SIZE_BYTES = 160
 AVERAGE_POS_COLUMN_SIZE_BYTES = 4
 XXHASH_BYTE_PER_RECORD = 8
-MEMORY_BUFFER_RATE =
+MEMORY_BUFFER_RATE = 2
+# Worst case 2 as no duplicates exists across all pk
+PYARROW_AGGREGATE_MEMORY_MULTIPLIER = 2
+# Observed base memory usage at the beginning of each worker process
+BASE_MEMORY_BUFFER = 0.3 * 1024 * 1024 * 1024
 
 
-def estimate_fixed_hash_columns(
+def estimate_fixed_hash_columns(
+    hash_value_size_bytes_per_record: int, total_record_count: int
+) -> int:
     return hash_value_size_bytes_per_record * total_record_count
 
 
-def get_total_record_from_iceberg_files(
+def get_total_record_from_iceberg_files(
+    iceberg_files_list: List[Tuple[int, DataFile]]
+) -> int:
     total_record_count = 0
-
-
+    # file are in form of tuple (sequence_number, DataFile)
+    total_record_count += sum(file[1].record_count for file in iceberg_files_list)
     return total_record_count
 
 
 def estimate_iceberg_pos_delete_additional_columns(
-    include_columns, num_of_record_count
-):
+    include_columns: List[str], num_of_record_count: int
+) -> int:
     total_additional_columns_sizes = 0
     if "file_path" in include_columns:
         total_additional_columns_sizes += (
@@ -33,7 +43,10 @@ def estimate_iceberg_pos_delete_additional_columns(
     return total_additional_columns_sizes
 
 
-def estimate_convert_remote_option_resources(
+def estimate_convert_remote_option_resources(
+    data_files: List[Tuple[int, DataFile]],
+    equality_delete_files: List[Tuple[int, DataFile]],
+) -> float:
     data_file_record_count = get_total_record_from_iceberg_files(data_files)
     equality_delete_record_count = get_total_record_from_iceberg_files(
         equality_delete_files
@@ -50,9 +63,9 @@ def estimate_convert_remote_option_resources(data_files, equality_delete_files):
 
 def _get_task_options(
     memory: float,
-    ray_custom_resources: Optional[Dict] = None,
+    ray_custom_resources: Optional[Dict[str, Any]] = None,
     scheduling_strategy: str = "SPREAD",
-) -> Dict:
+) -> Dict[str, Any]:
 
     # NOTE: With DEFAULT scheduling strategy in Ray 2.20.0, autoscaler does
     # not spin up enough nodes fast and hence we see only approximately
@@ -68,7 +81,8 @@ def _get_task_options(
         task_opts["resources"] = ray_custom_resources
 
     task_opts["max_retries"] = 3
-
+    task_opts["num_cpus"] = 1
+    task_opts["resources"] = {"convert_task": 1}
     # List of possible botocore exceptions are available at
     # https://github.com/boto/botocore/blob/develop/botocore/exceptions.py
     task_opts["retry_exceptions"] = [RetryableError]
@@ -76,13 +90,43 @@ def _get_task_options(
     return task_opts
 
 
-def
-
-
-
-
-
-
+def estimate_dedupe_memory(
+    all_data_files_for_dedupe: List[Tuple[int, DataFile]]
+) -> float:
+    dedupe_record_count = get_total_record_from_iceberg_files(all_data_files_for_dedupe)
+    produced_pos_memory_required = estimate_iceberg_pos_delete_additional_columns(
+        ["file_path", "pos"], dedupe_record_count
+    )
+    download_pk_memory_required = estimate_fixed_hash_columns(
+        XXHASH_BYTE_PER_RECORD, dedupe_record_count
+    )
+    memory_required_by_dedupe = (
+        produced_pos_memory_required + download_pk_memory_required
+    ) * PYARROW_AGGREGATE_MEMORY_MULTIPLIER
+    memory_with_buffer = memory_required_by_dedupe * MEMORY_BUFFER_RATE
+    return memory_with_buffer
+
+
+def convert_resource_options_provider(
+    index: int, convert_input_files: ConvertInputFiles
+) -> Dict[str, Any]:
+    applicable_data_files = convert_input_files.applicable_data_files
+    applicable_equality_delete_files = (
+        convert_input_files.applicable_equality_delete_files
    )
-
+    all_data_files_for_dedupe = convert_input_files.all_data_files_for_dedupe
+    total_memory_required = 0
+    total_memory_required += BASE_MEMORY_BUFFER
+    if applicable_data_files and applicable_equality_delete_files:
+        memory_requirement_for_convert_equality_deletes = (
+            estimate_convert_remote_option_resources(
+                applicable_data_files, applicable_equality_delete_files
+            )
+        )
+        total_memory_required += memory_requirement_for_convert_equality_deletes
+    if all_data_files_for_dedupe:
+        memory_requirement_for_dedupe = estimate_dedupe_memory(
+            all_data_files_for_dedupe
+        )
+        total_memory_required += memory_requirement_for_dedupe
+    return _get_task_options(memory=total_memory_required)