deltacat 2.0.0b7__py3-none-any.whl → 2.0.0b10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. deltacat/__init__.py +27 -6
  2. deltacat/api.py +478 -123
  3. deltacat/aws/s3u.py +2 -2
  4. deltacat/benchmarking/conftest.py +1 -1
  5. deltacat/catalog/main/impl.py +12 -6
  6. deltacat/catalog/model/catalog.py +65 -47
  7. deltacat/catalog/model/properties.py +1 -3
  8. deltacat/compute/__init__.py +14 -0
  9. deltacat/compute/converter/constants.py +5 -0
  10. deltacat/compute/converter/converter_session.py +78 -36
  11. deltacat/compute/converter/model/convert_input.py +24 -4
  12. deltacat/compute/converter/model/convert_result.py +61 -0
  13. deltacat/compute/converter/model/converter_session_params.py +52 -10
  14. deltacat/compute/converter/pyiceberg/overrides.py +181 -62
  15. deltacat/compute/converter/steps/convert.py +84 -36
  16. deltacat/compute/converter/steps/dedupe.py +25 -4
  17. deltacat/compute/converter/utils/convert_task_options.py +42 -13
  18. deltacat/compute/converter/utils/iceberg_columns.py +5 -0
  19. deltacat/compute/converter/utils/io.py +82 -11
  20. deltacat/compute/converter/utils/s3u.py +13 -4
  21. deltacat/compute/jobs/__init__.py +0 -0
  22. deltacat/compute/jobs/client.py +404 -0
  23. deltacat/constants.py +4 -4
  24. deltacat/daft/daft_scan.py +7 -3
  25. deltacat/daft/translator.py +126 -0
  26. deltacat/examples/basic_logging.py +5 -3
  27. deltacat/examples/hello_world.py +4 -2
  28. deltacat/examples/indexer/__init__.py +0 -0
  29. deltacat/examples/indexer/aws/__init__.py +0 -0
  30. deltacat/examples/indexer/gcp/__init__.py +0 -0
  31. deltacat/examples/indexer/indexer.py +163 -0
  32. deltacat/examples/indexer/job_runner.py +199 -0
  33. deltacat/io/__init__.py +13 -0
  34. deltacat/io/dataset/__init__.py +0 -0
  35. deltacat/io/dataset/deltacat_dataset.py +91 -0
  36. deltacat/io/datasink/__init__.py +0 -0
  37. deltacat/io/datasink/deltacat_datasink.py +207 -0
  38. deltacat/io/datasource/__init__.py +0 -0
  39. deltacat/io/datasource/deltacat_datasource.py +580 -0
  40. deltacat/io/reader/__init__.py +0 -0
  41. deltacat/io/reader/deltacat_read_api.py +172 -0
  42. deltacat/storage/__init__.py +2 -0
  43. deltacat/storage/model/expression/__init__.py +47 -0
  44. deltacat/storage/model/expression/expression.py +656 -0
  45. deltacat/storage/model/expression/visitor.py +248 -0
  46. deltacat/storage/model/metafile.py +74 -42
  47. deltacat/storage/model/scan/push_down.py +32 -5
  48. deltacat/storage/model/types.py +5 -3
  49. deltacat/storage/rivulet/__init__.py +4 -4
  50. deltacat/tests/_io/reader/__init__.py +0 -0
  51. deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
  52. deltacat/tests/compute/converter/test_convert_session.py +209 -46
  53. deltacat/tests/local_deltacat_storage/__init__.py +1 -0
  54. deltacat/tests/storage/model/test_expression.py +327 -0
  55. deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +2 -1
  56. deltacat/tests/storage/rivulet/test_dataset.py +1 -1
  57. deltacat/tests/storage/rivulet/test_manifest.py +1 -1
  58. deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +1 -1
  59. deltacat/tests/test_deltacat_api.py +50 -9
  60. deltacat/types/media.py +141 -43
  61. deltacat/types/tables.py +35 -7
  62. deltacat/utils/daft.py +2 -2
  63. deltacat/utils/filesystem.py +39 -9
  64. deltacat/utils/polars.py +128 -0
  65. deltacat/utils/pyarrow.py +151 -15
  66. deltacat/utils/ray_utils/concurrency.py +1 -1
  67. deltacat/utils/ray_utils/runtime.py +56 -4
  68. deltacat/utils/url.py +1284 -0
  69. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/METADATA +9 -6
  70. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/RECORD +73 -48
  71. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/LICENSE +0 -0
  72. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/WHEEL +0 -0
  73. {deltacat-2.0.0b7.dist-info → deltacat-2.0.0b10.dist-info}/top_level.txt +0 -0

deltacat/catalog/main/impl.py
@@ -1,6 +1,8 @@
  from typing import Any, Dict, List, Optional, Union, Tuple
  import logging

+ import deltacat as dc
+
  from deltacat.catalog import CatalogProperties
  from deltacat.exceptions import (
      NamespaceAlreadyExistsError,
@@ -34,17 +36,17 @@ from deltacat.types.tables import TableWriteMode
  from deltacat.compute.merge_on_read import MERGE_FUNC_BY_DISTRIBUTED_DATASET_TYPE
  from deltacat import logs
  from deltacat.constants import DEFAULT_NAMESPACE
- from deltacat.storage import metastore as storage_impl

  logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

  """
- This is the default implementation for the Catalog interface, using DeltaCAT native storage
+ Default Catalog interface implementation using DeltaCAT native storage.

- Note that, when this catalog implementation gets called through the normal pattern of `delegate.py`, all functions
- will be called the kwarg "inner" equal to the `CatalogProperties` this was initialized with.
+ When this is used by `delegate.py`, the `Catalog` implementation's `inner`
+ property will be set to the value returned from `initialize`.

- `CatalogProperties` has all state required to implement catalog functions, such as metastore root URI
+ `CatalogProperties` has all state required to implement catalog functions,
+ such as metastore root URI.
  """


@@ -56,6 +58,10 @@ def initialize(config: CatalogProperties = None, *args, **kwargs) -> CatalogProp
      returns CatalogProperties as the "inner" state value for a DC native catalog
      """
      if config is not None:
+         if not isinstance(config, CatalogProperties):
+             raise ValueError(
+                 f"Expected `CatalogProperties` but found `{type(config)}`."
+             )
          return config
      else:
          return CatalogProperties(*args, **kwargs)
@@ -717,4 +723,4 @@ def _get_storage(**kwargs):
      if properties is not None and properties.storage is not None:
          return properties.storage
      else:
-         return storage_impl
+         return dc.storage.metastore
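
A minimal sketch of the new `initialize` guard (module path and keyword names are taken from this diff; the `root` value is a placeholder):

    from deltacat.catalog import CatalogProperties
    from deltacat.catalog.main import impl as catalog_impl

    # A CatalogProperties instance is returned unchanged as the catalog's "inner" state.
    inner = catalog_impl.initialize(CatalogProperties(root="/tmp/.deltacat"))

    # Anything else is now rejected instead of being passed through silently.
    try:
        catalog_impl.initialize(config={"root": "/tmp/.deltacat"})
    except ValueError as err:
        print(err)  # Expected `CatalogProperties` but found `<class 'dict'>`.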

deltacat/catalog/model/catalog.py
@@ -10,7 +10,7 @@ import ray

  from deltacat import logs
  from deltacat.annotations import ExperimentalAPI
- from deltacat.catalog.main import impl as DeltacatCatalog
+ from deltacat.catalog.main import impl as DeltaCatCatalog
  from deltacat.catalog.iceberg import impl as IcebergCatalog
  from deltacat.catalog import CatalogProperties
  from deltacat.catalog.iceberg import IcebergCatalogConfig
@@ -22,17 +22,14 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))


  class Catalog:
-     def __init__(self, impl: ModuleType = DeltacatCatalog, *args, **kwargs):
+     def __init__(self, impl: ModuleType = DeltaCatCatalog, *args, **kwargs):
          """
          Constructor for a Catalog.

-         The args and kwargs here will be plumbed through to the catalog initialize function, and the results
-         are stored in Catalog.inner. Any state which is required (like: metastore root URI, pyiceberg native catalog)
-         MUST be returned by initialize.
-
-         Note: all initialization configuration MUST be pickle-able. When `Catalog` is pickled, _inner is excluded.
-         Instead, we only pass impl/args/kwargs, which are pickled and then _inner is re-constituted by calling __init__.
-         See `ray.util.register_serializer` in Catalogs class.
+         Invokes `impl.initialize(*args, **kwargs)` and stores its return value
+         in the `inner` property, which captures all state required to
+         deterministically reconstruct this Catalog instance on any node (and
+         must therefore be pickleable by Ray cloudpickle).
          """
          if not isinstance(self, Catalog):
              # self may contain the tuple returned from __reduce__ (ray pickle bug?)
@@ -68,7 +65,7 @@ class Catalog:

          Uses CatalogProperties as configuration
          """
-         return cls(impl=DeltacatCatalog, *args, **{"config": config, **kwargs})
+         return cls(impl=DeltaCatCatalog, *args, **{"config": config, **kwargs})

      @property
      def impl(self):
@@ -104,25 +101,17 @@ class Catalogs:
          self,
          catalogs: Union[Catalog, Dict[str, Catalog]],
          default: Optional[str] = None,
-         *args,
-         **kwargs,
      ):
-         if default and default not in catalogs:
-             raise ValueError(
-                 f"Catalog {default} not found " f"in catalogs to register: {catalogs}"
-             )
-         if not catalogs:
-             raise ValueError(
-                 f"No catalogs given to register. "
-                 f"Please specify one or more catalogs."
-             )
-
-         # if user only provides single Catalog, override it to be a map with default key
          if isinstance(catalogs, Catalog):
              catalogs = {DEFAULT_CATALOG: catalogs}
-
+         elif not isinstance(catalogs, dict):
+             raise ValueError(f"Expected Catalog or dict, but found: {catalogs}")
          self.catalogs: Dict[str, Catalog] = catalogs
          if default:
+             if default not in catalogs:
+                 raise ValueError(
+                     f"Default catalog `{default}` not found in: {catalogs}"
+                 )
              self.default_catalog = self.catalogs[default]
          elif len(catalogs) == 1:
              self.default_catalog = list(self.catalogs.values())[0]
@@ -140,7 +129,7 @@
          if set_default:
              self.default_catalog = catalog

-     def get(self, name) -> Catalog:
+     def get(self, name) -> Optional[Catalog]:
          return self.catalogs.get(name)

      def default(self) -> Optional[Catalog]:
@@ -149,7 +138,7 @@

  def is_initialized(*args, **kwargs) -> bool:
      """
-     Check if DeltaCAT is initialized
+     Check if DeltaCAT is initialized.
      """
      global all_catalogs

@@ -162,22 +151,36 @@ def is_initialized(*args, **kwargs) -> bool:
      return all_catalogs is not None


+ def raise_if_not_initialized(
+     err_msg: str = "DeltaCAT is not initialized. Please call `deltacat.init()` and try again.",
+ ) -> None:
+     """
+     Raises a RuntimeError with the given error message if DeltaCAT is not
+     initialized.
+
+     :param err_msg: Custom error message to raise if DeltaCAT is not
+         initialized. If unspecified, the default error message is used.
+     """
+     if not is_initialized():
+         raise RuntimeError(err_msg)
+
+
  def init(
-     catalogs: Union[Dict[str, Catalog], Catalog],
+     catalogs: Union[Dict[str, Catalog], Catalog] = {},
      default: Optional[str] = None,
      ray_init_args: Dict[str, Any] = None,
-     *args,
+     *,
      force_reinitialize=False,
-     **kwargs,
  ) -> None:
      """
      Initialize DeltaCAT catalogs.

-     :param catalogs: Either a single Catalog instance or a map of string to Catalog instance
-     :param default: The Catalog to use by default. If only one Catalog is provided, it will
-         be set as the default
-     :param ray_init_args: kwargs to pass to ray initialization
-     :param force_reinitialize: if True, force the reinitialization of Ray. If false, will do nothing if ray already initialized
+     :param catalogs: A single Catalog instance or a map of catalog names to
+         Catalog instances.
+     :param default: The name of the default Catalog. If only one Catalog is
+         provided, it will always be the default.
+     :param ray_init_args: Keyword arguments to pass to `ray.init()`.
+     :param force_reinitialize: Whether to force Ray reinitialization.
      """
      global all_catalogs

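With this change, `catalogs` may be omitted and `force_reinitialize` is keyword-only. A usage sketch, assuming `init` and `raise_if_not_initialized` are re-exported from the top-level `deltacat` package (the `deltacat/__init__.py` changes are not expanded in this diff):

    import deltacat as dc

    # Catalogs can now be registered later; init() no longer requires them up front.
    dc.init()

    # force_reinitialize must be passed by keyword after this change.
    dc.init(ray_init_args={"ignore_reinit_error": True}, force_reinitialize=True)

    # New guard helper: raises RuntimeError if deltacat.init() was never called.
    dc.raise_if_not_initialized()
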
@@ -195,6 +198,8 @@ def init(
      ray.util.register_serializer(
          Catalog, serializer=Catalog.__reduce__, deserializer=Catalog.__init__
      )
+     # TODO(pdames): If no catalogs are provided then re-initialize DeltaCAT
+     #  with all catalogs from the last session
      all_catalogs = Catalogs.remote(catalogs=catalogs, default=default)


@@ -216,7 +221,6 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
              "`deltacat.init(catalogs={...})` to register one or more "
              "catalogs then retry."
          )
-
      if name is not None:
          catalog = ray.get(all_catalogs.get.remote(name))
          if not catalog:
@@ -225,17 +229,16 @@ def get_catalog(name: Optional[str] = None, **kwargs) -> Catalog:
                  f"Catalog '{name}' not found. Available catalogs: "
                  f"{available_catalogs}."
              )
-         return catalog
-
      else:
          catalog = ray.get(all_catalogs.default.remote())
          if not catalog:
              available_catalogs = ray.get(all_catalogs.all.remote()).values()
              raise ValueError(
-                 f"Call to get_catalog without name set failed because there is no default Catalog set. Available catalogs: "
+                 f"Call to get_catalog without name set failed because there "
+                 f"is no default Catalog set. Available catalogs: "
                  f"{available_catalogs}."
              )
-         return catalog
+     return catalog


  def put_catalog(
@@ -246,23 +249,37 @@ def put_catalog(
      ray_init_args: Dict[str, Any] = None,
      fail_if_exists: bool = False,
      **kwargs,
- ) -> None:
+ ) -> Catalog:
      """
      Add a named catalog to the global map of named catalogs. Initializes ray if not already initialized.

      Args:
-         name: name of catalog
-         catalog: catalog instance to use, if provided
-         default: Make this the default catalog if multiple catalogs are available.
-             ignored if this is the only catalog available, since it will always be the default catalog.
-         ray_init_args: ray initialization args (used only if ray not already initialized)
-         fail_if_exists: if True, raises KeyError if the catalog name already exists. Otherwise, overwrite catalog
+         name: Name of the catalog.
+         catalog: Catalog instance to use. If none is provided, then all
+             additional keyword arguments will be forwarded to
+             `CatalogProperties` for a default DeltaCAT native Catalog.
+         default: Make this the default catalog if multiple catalogs are
+             available. If only one catalog is available, it will always be the
+             default.
+         ray_init_args: Ray initialization args (used only if ray not already
+             initialized)
+         fail_if_exists: if True, raises an error if a catalog with the given
+             name already exists. If False, inserts or replaces the given
+             catalog name.
+         kwargs: Additional keyword arguments to forward to `CatalogProperties`
+             for a default DeltaCAT native Catalog.
+
+     Returns:
+         The catalog put in the named catalog map.
      """
      global all_catalogs

+     if not catalog:
+         catalog = Catalog(**kwargs)
+
      # Initialize, if necessary
      if not is_initialized():
-         # NOTE - since we are initializing with a single catalog, it will be set to the default
+         # We are initializing a single catalog - make it the default
          if not default:
              logger.info(
                  f"Calling put_catalog with set_as_default=False, "
@@ -288,3 +305,4 @@

      # Add the catalog (which may overwrite existing if fail_if_exists=False)
      ray.get(all_catalogs.put.remote(name, catalog, default))
+     return catalog
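
Since `put_catalog` now returns the registered `Catalog` and builds a default DeltaCAT native catalog from leftover keyword arguments, a hedged usage sketch (assuming `put_catalog` and `get_catalog` remain re-exported at the package root; Ray is started locally on first use):

    import deltacat as dc

    # With no `catalog` argument, remaining kwargs (e.g. `root`) are forwarded to
    # CatalogProperties and a default DeltaCAT native Catalog is registered.
    catalog = dc.put_catalog("my_catalog", root="/tmp/.deltacat")

    # Retrieval by name is unchanged; an unknown name still raises ValueError.
    same_catalog = dc.get_catalog("my_catalog")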

deltacat/catalog/model/properties.py
@@ -45,7 +45,7 @@ class CatalogProperties:
      Attributes:
          root (str): URI string of the root path where catalog metadata and data
              files are stored. Root is determined (in precedence order) by:
-             1. check kwargs for "root"
+             1. check "root" input argument
              2. check env variable "DELTACAT_ROOT"
              3. default to ${cwd}/.deltacat

@@ -61,8 +61,6 @@
          root: Optional[str] = None,
          filesystem: Optional[pyarrow.fs.FileSystem] = None,
          storage=None,
-         *args,
-         **kwargs,
      ):
          """
          Initialize a CatalogProperties instance.
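
The documented root-resolution order can be restated as a small standalone helper (a hypothetical illustration of the precedence rules above, not the library's actual implementation):

    import os

    def resolve_catalog_root(root=None):
        # 1. an explicit "root" argument wins
        if root is not None:
            return root
        # 2. otherwise fall back to the DELTACAT_ROOT environment variable
        env_root = os.environ.get("DELTACAT_ROOT")
        if env_root:
            return env_root
        # 3. otherwise default to ${cwd}/.deltacat
        return os.path.join(os.getcwd(), ".deltacat")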

deltacat/compute/__init__.py
@@ -0,0 +1,14 @@
+ from deltacat.compute.jobs.client import (
+     DeltaCatJobClient,
+     job_client,
+     local_job_client,
+ )
+
+ from ray.job_submission import JobStatus
+
+ __all__ = [
+     "job_client",
+     "local_job_client",
+     "DeltaCatJobClient",
+     "JobStatus",
+ ]
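
The new `deltacat.compute` package simply re-exports the job client added in `deltacat/compute/jobs/client.py` together with Ray's `JobStatus`; client constructor signatures are not shown in this diff, so only the imports are sketched here:

    from deltacat.compute import DeltaCatJobClient, JobStatus, job_client, local_job_client

    # JobStatus comes straight from ray.job_submission, so terminal states can be
    # checked exactly as with Ray's JobSubmissionClient.
    TERMINAL_STATES = {JobStatus.SUCCEEDED, JobStatus.FAILED, JobStatus.STOPPED}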

deltacat/compute/converter/constants.py
@@ -2,3 +2,8 @@ DEFAULT_CONVERTER_TASK_MAX_PARALLELISM = 4096

  # Safe limit ONLY considering CPU limit, typically 32 for a 8x-large worker
  DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD = 30
+
+
+ # Unique identifier delimiter to ensure different primary keys don't end up with the same hash when concatenated.
+ # e.g.: pk values ("12", "1") and ("1", "21") would both become "121" without a delimiter.
+ IDENTIFIER_FIELD_DELIMITER = "c303282d"
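
To see the collision the delimiter guards against (a standalone sketch; the join shown here is illustrative, since the identifier-concatenation code itself is not part of this diff):

    IDENTIFIER_FIELD_DELIMITER = "c303282d"

    # Without a delimiter, two distinct composite primary keys collide:
    assert "12" + "1" == "1" + "21" == "121"

    # With the delimiter, the concatenated identifiers stay distinct:
    left = IDENTIFIER_FIELD_DELIMITER.join(["12", "1"])   # "12c303282d1"
    right = IDENTIFIER_FIELD_DELIMITER.join(["1", "21"])  # "1c303282d21"
    assert left != right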

deltacat/compute/converter/converter_session.py
@@ -1,4 +1,3 @@
- # from pyiceberg.typedef import EMPTY_DICT, Identifier, Properties
  from deltacat.utils.ray_utils.concurrency import (
      invoke_parallel,
      task_resource_options_provider,
@@ -20,7 +19,6 @@ from deltacat.compute.converter.steps.convert import convert
  from deltacat.compute.converter.model.convert_input import ConvertInput
  from deltacat.compute.converter.pyiceberg.overrides import (
      fetch_all_bucket_files,
-     parquet_files_dict_to_iceberg_data_files,
  )
  from deltacat.compute.converter.utils.converter_session_utils import (
      construct_iceberg_table_prefix,
@@ -48,32 +46,46 @@ def converter_session(params: ConverterSessionParams, **kwargs):
      table_name = params.iceberg_table_name
      iceberg_table = load_table(catalog, table_name)
      enforce_primary_key_uniqueness = params.enforce_primary_key_uniqueness
+     iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
+     iceberg_namespace = params.iceberg_namespace
+     merge_keys = params.merge_keys
+     compact_previous_position_delete_files = (
+         params.compact_previous_position_delete_files
+     )
+     task_max_parallelism = params.task_max_parallelism
+     s3_client_kwargs = params.s3_client_kwargs
+     s3_file_system = params.s3_file_system
+     location_provider_prefix_override = params.location_provider_prefix_override
+     position_delete_for_multiple_data_files = (
+         params.position_delete_for_multiple_data_files
+     )
+
      data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
          iceberg_table
      )
+
      convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
          data_file_dict=data_file_dict,
          equality_delete_dict=equality_delete_dict,
          pos_delete_dict=pos_delete_dict,
      )
-     iceberg_warehouse_bucket_name = params.iceberg_warehouse_bucket_name
-     iceberg_namespace = params.iceberg_namespace
-     iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
-         iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
-         table_name=table_name,
-         iceberg_namespace=iceberg_namespace,
-     )
-     merge_keys = params.merge_keys
+
+     if not location_provider_prefix_override:
+         iceberg_table_warehouse_prefix = construct_iceberg_table_prefix(
+             iceberg_warehouse_bucket_name=iceberg_warehouse_bucket_name,
+             table_name=table_name,
+             iceberg_namespace=iceberg_namespace,
+         )
+     else:
+         iceberg_table_warehouse_prefix = location_provider_prefix_override
+
      # Using table identifier fields as merge keys if merge keys not provided
      if not merge_keys:
          identifier_fields_set = iceberg_table.schema().identifier_field_names()
          identifier_fields = list(identifier_fields_set)
      else:
          identifier_fields = merge_keys
-     if len(identifier_fields) > 1:
-         raise NotImplementedError(
-             f"Multiple identifier fields lookup not supported yet."
-         )
+
      convert_options_provider = functools.partial(
          task_resource_options_provider,
          resource_amount_provider=convert_resource_options_provider,
@@ -86,58 +98,88 @@ def converter_session(params: ConverterSessionParams, **kwargs):
      # Note that approach 2 will ideally require shared object store to avoid download equality delete files * number of child tasks times.
      max_parallel_data_file_download = DEFAULT_MAX_PARALLEL_DATA_FILE_DOWNLOAD

-     compact_small_files = params.compact_small_files
-     position_delete_for_multiple_data_files = (
-         params.position_delete_for_multiple_data_files
-     )
-     task_max_parallelism = params.task_max_parallelism
-
      def convert_input_provider(index, item):
          return {
              "convert_input": ConvertInput.of(
-                 files_for_each_bucket=item,
+                 convert_input_files=item,
                  convert_task_index=index,
                  iceberg_table_warehouse_prefix=iceberg_table_warehouse_prefix,
                  identifier_fields=identifier_fields,
-                 compact_small_files=compact_small_files,
+                 compact_previous_position_delete_files=compact_previous_position_delete_files,
+                 table_io=iceberg_table.io,
+                 table_metadata=iceberg_table.metadata,
                  enforce_primary_key_uniqueness=enforce_primary_key_uniqueness,
                  position_delete_for_multiple_data_files=position_delete_for_multiple_data_files,
                  max_parallel_data_file_download=max_parallel_data_file_download,
+                 s3_client_kwargs=s3_client_kwargs,
+                 s3_file_system=s3_file_system,
              )
          }

+     logger.info(f"Getting remote convert tasks...")
      # Ray remote task: convert
-     # Assuming that memory consume by each bucket doesn't exceed one node's memory limit.
      # TODO: Add split mechanism to split large buckets
      convert_tasks_pending = invoke_parallel(
-         items=convert_input_files_for_all_buckets.items(),
+         items=convert_input_files_for_all_buckets,
          ray_task=convert,
          max_parallelism=task_max_parallelism,
         options_provider=convert_options_provider,
          kwargs_provider=convert_input_provider,
      )
+
      to_be_deleted_files_list = []
-     to_be_added_files_dict_list = []
+     logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
+
      convert_results = ray.get(convert_tasks_pending)
-     for convert_result in convert_results:
-         to_be_deleted_files_list.extend(convert_result[0].values())
-         to_be_added_files_dict_list.append(convert_result[1])
+     logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")

-     new_position_delete_files = parquet_files_dict_to_iceberg_data_files(
-         io=iceberg_table.io,
-         table_metadata=iceberg_table.metadata,
-         files_dict_list=to_be_added_files_dict_list,
+     total_position_delete_record_count = sum(
+         convert_result.position_delete_record_count
+         for convert_result in convert_results
+     )
+     total_input_data_file_record_count = sum(
+         convert_result.input_data_files_record_count
+         for convert_result in convert_results
+     )
+     total_data_file_hash_columns_in_memory_sizes = sum(
+         convert_result.input_data_files_hash_columns_in_memory_sizes
+         for convert_result in convert_results
+     )
+     total_position_delete_file_in_memory_sizes = sum(
+         convert_result.position_delete_in_memory_sizes
+         for convert_result in convert_results
+     )
+     total_position_delete_on_disk_sizes = sum(
+         convert_result.position_delete_on_disk_sizes
+         for convert_result in convert_results
      )

-     if not to_be_deleted_files_list:
+     to_be_added_files_list = []
+     for convert_result in convert_results:
+         to_be_added_files = convert_result.to_be_added_files
+         to_be_deleted_files = convert_result.to_be_deleted_files
+
+         to_be_deleted_files_list.extend(to_be_deleted_files.values())
+         to_be_added_files_list.extend(to_be_added_files)
+
+     if not to_be_deleted_files_list and to_be_added_files_list:
          commit_append_snapshot(
              iceberg_table=iceberg_table,
-             new_position_delete_files=new_position_delete_files,
+             new_position_delete_files=to_be_added_files_list,
          )
      else:
          commit_replace_snapshot(
              iceberg_table=iceberg_table,
-             # equality_delete_files + data file that all rows are deleted
              to_be_deleted_files_list=to_be_deleted_files_list,
-             new_position_delete_files=new_position_delete_files,
+             new_position_delete_files=to_be_added_files_list,
          )
+     logger.info(
+         f"Aggregated stats for {table_name}: "
+         f"total position delete record count: {total_position_delete_record_count}, "
+         f"total input data file record_count: {total_input_data_file_record_count}, "
+         f"total data file hash columns in memory sizes: {total_data_file_hash_columns_in_memory_sizes}, "
+         f"total position delete file in memory sizes: {total_position_delete_file_in_memory_sizes}, "
+         f"total position delete file on disk sizes: {total_position_delete_on_disk_sizes}."
+     )
+
+     logger.info(f"Committed new Iceberg snapshot.")
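
Restated outside the session flow, the new snapshot-commit rule reduces to the following (the helper name is illustrative, not part of DeltaCAT):

    def choose_snapshot_commit(to_be_deleted_files_list, to_be_added_files_list):
        # Pure additions of new position delete files -> append snapshot;
        # anything that removes or rewrites existing files -> replace snapshot.
        if not to_be_deleted_files_list and to_be_added_files_list:
            return "append"
        return "replace"

    assert choose_snapshot_commit([], ["pos-delete-0.parquet"]) == "append"
    assert choose_snapshot_commit(["stale-file.parquet"], ["pos-delete-0.parquet"]) == "replace"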

deltacat/compute/converter/model/convert_input.py
@@ -10,11 +10,14 @@ class ConvertInput(Dict):
          convert_task_index,
          iceberg_table_warehouse_prefix,
          identifier_fields,
-         compact_small_files,
+         table_io,
+         table_metadata,
+         compact_previous_position_delete_files,
          enforce_primary_key_uniqueness,
          position_delete_for_multiple_data_files,
          max_parallel_data_file_download,
          s3_file_system,
+         s3_client_kwargs,
      ) -> ConvertInput:

          result = ConvertInput()
@@ -22,13 +25,18 @@ class ConvertInput(Dict):
          result["convert_task_index"] = convert_task_index
          result["identifier_fields"] = identifier_fields
          result["iceberg_table_warehouse_prefix"] = iceberg_table_warehouse_prefix
-         result["compact_small_files"] = compact_small_files
+         result["table_io"] = table_io
+         result["table_metadata"] = table_metadata
+         result[
+             "compact_previous_position_delete_files"
+         ] = compact_previous_position_delete_files
          result["enforce_primary_key_uniqueness"] = enforce_primary_key_uniqueness
          result[
              "position_delete_for_multiple_data_files"
          ] = position_delete_for_multiple_data_files
          result["max_parallel_data_file_download"] = max_parallel_data_file_download
          result["s3_file_system"] = s3_file_system
+         result["s3_client_kwargs"] = s3_client_kwargs

          return result

@@ -49,8 +57,16 @@ class ConvertInput(Dict):
          return self["iceberg_table_warehouse_prefix"]

      @property
-     def compact_small_files(self) -> bool:
-         return self["compact_small_files"]
+     def table_io(self):
+         return self["table_io"]
+
+     @property
+     def table_metadata(self):
+         return self["table_metadata"]
+
+     @property
+     def compact_previous_position_delete_files(self) -> bool:
+         return self["compact_previous_position_delete_files"]

      @property
      def enforce_primary_key_uniqueness(self) -> bool:
@@ -67,3 +83,7 @@ class ConvertInput(Dict):
      @property
      def s3_file_system(self):
          return self["s3_file_system"]
+
+     @property
+     def s3_client_kwargs(self):
+         return self["s3_client_kwargs"]
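
A construction sketch for the updated `ConvertInput` (keyword names are taken from the `of(...)` signature above and its call site in `converter_session.py`; `ConvertInput` is a plain `Dict` subclass, so `None` placeholders are enough to demonstrate the new accessors):

    from deltacat.compute.converter.model.convert_input import ConvertInput

    convert_input = ConvertInput.of(
        convert_input_files=None,                 # real callers pass one bucket's grouped files
        convert_task_index=0,
        iceberg_table_warehouse_prefix="s3://warehouse/namespace/table",
        identifier_fields=["pk"],
        table_io=None,                            # real callers pass iceberg_table.io
        table_metadata=None,                      # ...and iceberg_table.metadata
        compact_previous_position_delete_files=False,
        enforce_primary_key_uniqueness=True,
        position_delete_for_multiple_data_files=True,
        max_parallel_data_file_download=30,
        s3_file_system=None,
        s3_client_kwargs={"region_name": "us-east-1"},
    )
    assert convert_input.compact_previous_position_delete_files is False
    assert convert_input.s3_client_kwargs == {"region_name": "us-east-1"}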

deltacat/compute/converter/model/convert_result.py
@@ -0,0 +1,61 @@
+ from __future__ import annotations
+ from typing import Dict
+
+
+ class ConvertResult(Dict):
+     @staticmethod
+     def of(
+         convert_task_index,
+         to_be_added_files,
+         to_be_deleted_files,
+         position_delete_record_count,
+         input_data_files_record_count,
+         input_data_files_hash_columns_in_memory_sizes,
+         position_delete_in_memory_sizes,
+         position_delete_on_disk_sizes,
+     ) -> ConvertResult:
+
+         result = ConvertResult()
+         result["convert_task_index"] = convert_task_index
+         result["to_be_added_files"] = to_be_added_files
+         result["to_be_deleted_files"] = to_be_deleted_files
+         result["position_delete_record_count"] = position_delete_record_count
+         result["input_data_files_record_count"] = input_data_files_record_count
+         result[
+             "input_data_files_hash_columns_in_memory_sizes"
+         ] = input_data_files_hash_columns_in_memory_sizes
+         result["position_delete_in_memory_sizes"] = position_delete_in_memory_sizes
+         result["position_delete_on_disk_sizes"] = position_delete_on_disk_sizes
+         return result
+
+     @property
+     def convert_task_index(self) -> int:
+         return self["convert_task_index"]
+
+     @property
+     def to_be_added_files(self):
+         return self["to_be_added_files"]
+
+     @property
+     def to_be_deleted_files(self):
+         return self["to_be_deleted_files"]
+
+     @property
+     def position_delete_record_count(self):
+         return self["position_delete_record_count"]
+
+     @property
+     def input_data_files_record_count(self):
+         return self["input_data_files_record_count"]
+
+     @property
+     def input_data_files_hash_columns_in_memory_sizes(self):
+         return self["input_data_files_hash_columns_in_memory_sizes"]
+
+     @property
+     def position_delete_in_memory_sizes(self):
+         return self["position_delete_in_memory_sizes"]
+
+     @property
+     def position_delete_on_disk_sizes(self):
+         return self["position_delete_on_disk_sizes"]
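
A small usage sketch for the new `ConvertResult` container, mirroring the aggregation loop in `converter_session.py` above (all values are illustrative):

    from deltacat.compute.converter.model.convert_result import ConvertResult

    results = [
        ConvertResult.of(
            convert_task_index=i,
            to_be_added_files=[f"s3://warehouse/pos-delete-{i}.parquet"],
            to_be_deleted_files={},
            position_delete_record_count=10 * i,
            input_data_files_record_count=100 * i,
            input_data_files_hash_columns_in_memory_sizes=1_000 * i,
            position_delete_in_memory_sizes=500 * i,
            position_delete_on_disk_sizes=250 * i,
        )
        for i in range(1, 3)
    ]

    total_position_deletes = sum(r.position_delete_record_count for r in results)
    assert total_position_deletes == 30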