deltacat 0.1.10.dev0__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -15
- deltacat/aws/clients.py +12 -31
- deltacat/aws/constants.py +1 -1
- deltacat/aws/redshift/__init__.py +7 -2
- deltacat/aws/redshift/model/manifest.py +54 -50
- deltacat/aws/s3u.py +176 -187
- deltacat/catalog/delegate.py +151 -185
- deltacat/catalog/interface.py +78 -97
- deltacat/catalog/model/catalog.py +21 -21
- deltacat/catalog/model/table_definition.py +11 -9
- deltacat/compute/compactor/__init__.py +12 -16
- deltacat/compute/compactor/compaction_session.py +237 -166
- deltacat/compute/compactor/model/delta_annotated.py +60 -44
- deltacat/compute/compactor/model/delta_file_envelope.py +5 -6
- deltacat/compute/compactor/model/delta_file_locator.py +10 -8
- deltacat/compute/compactor/model/materialize_result.py +6 -7
- deltacat/compute/compactor/model/primary_key_index.py +38 -34
- deltacat/compute/compactor/model/pyarrow_write_result.py +3 -4
- deltacat/compute/compactor/model/round_completion_info.py +25 -19
- deltacat/compute/compactor/model/sort_key.py +18 -15
- deltacat/compute/compactor/steps/dedupe.py +119 -94
- deltacat/compute/compactor/steps/hash_bucket.py +48 -47
- deltacat/compute/compactor/steps/materialize.py +86 -92
- deltacat/compute/compactor/steps/rehash/rehash_bucket.py +13 -13
- deltacat/compute/compactor/steps/rehash/rewrite_index.py +5 -5
- deltacat/compute/compactor/utils/io.py +59 -47
- deltacat/compute/compactor/utils/primary_key_index.py +91 -80
- deltacat/compute/compactor/utils/round_completion_file.py +22 -23
- deltacat/compute/compactor/utils/system_columns.py +33 -45
- deltacat/compute/metastats/meta_stats.py +235 -157
- deltacat/compute/metastats/model/partition_stats_dict.py +7 -10
- deltacat/compute/metastats/model/stats_cluster_size_estimator.py +13 -5
- deltacat/compute/metastats/stats.py +95 -64
- deltacat/compute/metastats/utils/io.py +100 -53
- deltacat/compute/metastats/utils/pyarrow_memory_estimation_function.py +5 -2
- deltacat/compute/metastats/utils/ray_utils.py +38 -33
- deltacat/compute/stats/basic.py +107 -69
- deltacat/compute/stats/models/delta_column_stats.py +11 -8
- deltacat/compute/stats/models/delta_stats.py +59 -32
- deltacat/compute/stats/models/delta_stats_cache_result.py +4 -1
- deltacat/compute/stats/models/manifest_entry_stats.py +12 -6
- deltacat/compute/stats/models/stats_result.py +24 -14
- deltacat/compute/stats/utils/intervals.py +16 -9
- deltacat/compute/stats/utils/io.py +86 -51
- deltacat/compute/stats/utils/manifest_stats_file.py +24 -33
- deltacat/constants.py +4 -13
- deltacat/io/__init__.py +2 -2
- deltacat/io/aws/redshift/redshift_datasource.py +157 -143
- deltacat/io/dataset.py +14 -17
- deltacat/io/read_api.py +36 -33
- deltacat/logs.py +94 -42
- deltacat/storage/__init__.py +18 -8
- deltacat/storage/interface.py +196 -213
- deltacat/storage/model/delta.py +45 -51
- deltacat/storage/model/list_result.py +12 -8
- deltacat/storage/model/namespace.py +4 -5
- deltacat/storage/model/partition.py +42 -42
- deltacat/storage/model/stream.py +29 -30
- deltacat/storage/model/table.py +14 -14
- deltacat/storage/model/table_version.py +32 -31
- deltacat/storage/model/types.py +1 -0
- deltacat/tests/stats/test_intervals.py +11 -24
- deltacat/tests/utils/__init__.py +0 -0
- deltacat/tests/utils/test_record_batch_tables.py +284 -0
- deltacat/types/media.py +3 -4
- deltacat/types/tables.py +31 -21
- deltacat/utils/common.py +5 -11
- deltacat/utils/numpy.py +20 -22
- deltacat/utils/pandas.py +73 -100
- deltacat/utils/performance.py +3 -9
- deltacat/utils/placement.py +259 -230
- deltacat/utils/pyarrow.py +302 -89
- deltacat/utils/ray_utils/collections.py +2 -1
- deltacat/utils/ray_utils/concurrency.py +27 -28
- deltacat/utils/ray_utils/dataset.py +28 -28
- deltacat/utils/ray_utils/performance.py +5 -9
- deltacat/utils/ray_utils/runtime.py +9 -10
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/METADATA +1 -1
- deltacat-0.1.12.dist-info/RECORD +110 -0
- deltacat-0.1.10.dev0.dist-info/RECORD +0 -108
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/LICENSE +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/WHEEL +0 -0
- {deltacat-0.1.10.dev0.dist-info → deltacat-0.1.12.dist-info}/top_level.txt +0 -0
deltacat/compute/compactor/utils/primary_key_index.py:

@@ -8,14 +8,8 @@ import pyarrow as pa
 import ray
 import s3fs
 from ray import cloudpickle
-from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
 from ray.types import ObjectRef
 
-from deltacat.storage import Manifest, PartitionLocator
-from deltacat.utils.ray_utils.concurrency import invoke_parallel
-from deltacat.compute.compactor import PyArrowWriteResult, \
-    RoundCompletionInfo, PrimaryKeyIndexMeta, PrimaryKeyIndexLocator, \
-    PrimaryKeyIndexVersionMeta, PrimaryKeyIndexVersionLocator
 from deltacat import logs
 from deltacat.aws import s3u
 from deltacat.compute.compactor import (
@@ -30,29 +24,31 @@ from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
 from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
 from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor.utils import system_columns as sc
+from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
 from deltacat.storage import Manifest, PartitionLocator
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.types.tables import get_table_slicer, get_table_writer
 from deltacat.utils.common import ReadKwargsProvider
-from deltacat.utils.ray_utils.concurrency import
-    invoke_parallel
-)
+from deltacat.utils.ray_utils.concurrency import invoke_parallel
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
 def rehash(
-
-
-
-
-
-
-
-
-
-
-
+    options_provider: Callable[[int, Any], Dict[str, Any]],
+    s3_bucket: str,
+    source_partition_locator: PartitionLocator,
+    old_rci: RoundCompletionInfo,
+    new_hash_bucket_count: int,
+    hash_bucket_index_group_count: int,
+    records_per_primary_key_index_file: int,
+    delete_old_primary_key_index: bool,
+) -> RoundCompletionInfo:
+
+    logger.info(
+        f"Rehashing primary key index. Old round completion info: "
+        f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
+    )
 
     # collect old primary key index information
     old_pki_version_locator = old_rci.primary_key_index_version_locator
@@ -60,10 +56,12 @@ def rehash(
     old_pki_meta = old_pkiv_meta.primary_key_index_meta
     old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
     if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
-        raise ValueError(
-
-
-
+        raise ValueError(
+            f"Primary key index rehash failed. Old hash bucket "
+            f"count ({new_hash_bucket_count}) is "
+            f"equal to new hash bucket count. Partition: "
+            f"{old_compacted_partition_locator}."
+        )
 
     # generate a new unique primary key index version locator to rehash into
     new_pki_meta = PrimaryKeyIndexMeta.of(
@@ -78,7 +76,8 @@ def rehash(
         new_hash_bucket_count,
     )
     rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_pki_version_meta
+        new_pki_version_meta
+    )
 
     # launch a rehash task for each bucket of the old primary key index version
     old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
@@ -124,6 +123,7 @@ def rehash(
         PyArrowWriteResult.union(pki_stats),
         old_rci.sort_keys_bit_width,
         rehashed_pki_version_locator,
+        old_rci.rebase_source_partition_locator,
     )
     rcf.write_round_completion_file(
         s3_bucket,
@@ -136,41 +136,48 @@ def rehash(
             s3_bucket,
             old_pki_version_locator,
         )
-    logger.info(
-
+    logger.info(
+        f"Rehashed primary key index. New round completion info: "
+        f"{round_completion_info}."
+    )
     return round_completion_info
 
 
 def download_hash_bucket_entries(
-
-
-
-
-
-
-    pk_index_manifest_s3_url =
-        .get_pkiv_hb_index_manifest_s3_url(
+    s3_bucket: str,
+    hash_bucket_index: int,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
+) -> List[pa.Table]:
+
+    pk_index_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
            s3_bucket,
            hash_bucket_index,
        )
+    )
     result = s3u.download(pk_index_manifest_s3_url, False)
-    logger.info(
-
-
+    logger.info(
+        f"Downloading primary key index hash bucket manifest entries: "
+        f"{pk_index_manifest_s3_url}. Primary key index version "
+        f"locator: {primary_key_index_version_locator}"
+    )
     pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
-    tables = s3u.download_manifest_entries(
-
+    tables = s3u.download_manifest_entries(
+        pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
+    )
     if not tables:
         logger.warning(
             f"Primary key index manifest is empty at: "
             f"{pk_index_manifest_s3_url}. Primary key index version "
-            f"locator: {primary_key_index_version_locator}"
+            f"locator: {primary_key_index_version_locator}"
+        )
     return tables
 
 
 def delete_primary_key_index_version(
-
-
+    s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
+) -> None:
 
     logger.info(f"Deleting primary key index: {pki_version_locator}")
     s3u.delete_files_by_prefix(
@@ -181,8 +188,8 @@ def delete_primary_key_index_version(
 
 
 def group_record_indices_by_hash_bucket(
-
-
+    pki_table: pa.Table, num_buckets: int
+) -> np.ndarray:
 
     hash_bucket_to_indices = np.empty([num_buckets], dtype="object")
     record_index = 0
@@ -196,11 +203,10 @@ def group_record_indices_by_hash_bucket(
 
 
 def group_hash_bucket_indices(
-
-
-        num_groups: int) -> Tuple[np.ndarray, List[ObjectRef]]:
+    hash_bucket_object_groups: np.ndarray, num_buckets: int, num_groups: int
+) -> Tuple[np.ndarray, List[ObjectRef]]:
     """
-    Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
+    Groups all the ObjectRef that belongs to a particular hash bucket group and hash bucket index.
     """
 
     object_refs = []
@@ -214,8 +220,7 @@ def group_hash_bucket_indices(
         if obj:
             hb_group = hb_index % num_groups
             if hb_group_to_object[hb_group] is None:
-                hb_group_to_object[hb_group] = np.empty(
-                    [num_buckets], dtype="object")
+                hb_group_to_object[hb_group] = np.empty([num_buckets], dtype="object")
             hb_group_to_object[hb_group][hb_index] = obj
 
     for hb_group, obj in enumerate(hb_group_to_object):
@@ -225,21 +230,19 @@ def group_hash_bucket_indices(
         pickled_obj_ref = cloudpickle.dumps(obj_ref)
         object_refs.append(pickled_obj_ref)
         hash_bucket_group_to_obj_id[hb_group] = pickled_obj_ref
-        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
-        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
-        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
-        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
-        # The object now has a permanent reference and the data can't be freed from Ray’s object store.
-        # Manually deleting the untrackable object references offsets these permanent references and
-        # helps to allow these objects to be garbage collected normally.
+        # NOTE: The cloudpickle.dumps API call creates an out of band object reference to the object_ref variable.
+        # After pickling, Ray cannot track the serialized copy of the object or determine when the ObjectRef has been deserialized
+        # (e.g., if the ObjectRef is deserialized by a non-Ray process).
+        # Thus the object_ref cannot be tracked by Ray's distributed reference counter, even if it goes out of scope.
+        # The object now has a permanent reference and the data can't be freed from Ray’s object store.
+        # Manually deleting the untrackable object references offsets these permanent references and
+        # helps to allow these objects to be garbage collected normally.
         del obj_ref
         del pickled_obj_ref
     return hash_bucket_group_to_obj_id, object_refs
 
 
-def pk_digest_to_hash_bucket_index(
-        digest,
-        num_buckets: int) -> int:
+def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
     """
     Deterministically get the hash bucket a particular digest belongs to
    based on number of total hash buckets.
@@ -249,11 +252,12 @@ def pk_digest_to_hash_bucket_index(
 
 
 def write_primary_key_index_files(
-
-
-
-
-
+    table: pa.Table,
+    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
+    s3_bucket: str,
+    hb_index: int,
+    records_per_index_file: int,
+) -> PyArrowWriteResult:
     """
     Writes primary key index files for the given hash bucket index out to the
     specified S3 bucket at the path identified by the given primary key index
@@ -262,19 +266,24 @@ def write_primary_key_index_files(
 
     TODO(raghumdani): Support writing primary key index to any data catalog
     """
-    logger.info(
-
-
+    logger.info(
+        f"Writing primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}."
+    )
     s3_file_system = s3fs.S3FileSystem(
         anon=False,
         s3_additional_kwargs={
             "ContentType": ContentType.PARQUET.value,
             "ContentEncoding": ContentEncoding.IDENTITY.value,
         },
-        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
+        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
+    )
+    pkiv_hb_index_s3_url_base = (
+        primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
+            s3_bucket, hb_index
+        )
     )
-    pkiv_hb_index_s3_url_base = primary_key_index_version_locator\
-        .get_pkiv_hb_index_s3_url_base(s3_bucket, hb_index)
     manifest_entries = s3u.upload_sliced_table(
         table,
         pkiv_hb_index_s3_url_base,
@@ -284,19 +293,21 @@ def write_primary_key_index_files(
         get_table_slicer(table),
     )
     manifest = Manifest.of(manifest_entries)
-    pkiv_hb_index_s3_manifest_s3_url =
-        .get_pkiv_hb_index_manifest_s3_url(
-
-
-        str(json.dumps(manifest))
+    pkiv_hb_index_s3_manifest_s3_url = (
+        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
+            s3_bucket, hb_index
+        )
     )
+    s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
     result = PyArrowWriteResult.of(
         len(manifest_entries),
        table.nbytes,
        manifest.meta.content_length,
        len(table),
    )
-    logger.info(
-
-
+    logger.info(
+        f"Wrote primary key index files for hash bucket {hb_index}. "
+        f"Primary key index version locator: "
+        f"{primary_key_index_version_locator}. Result: {result}"
+    )
     return result
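
The NOTE retained in `group_hash_bucket_indices` carries the key reasoning in this file: once an `ObjectRef` is serialized out of band with `cloudpickle.dumps`, Ray's distributed reference counter can no longer see that copy, so the backing object stays pinned in the object store unless the local handles are deleted. A minimal standalone sketch of that pattern (not deltacat code; the payload and variable names are illustrative):

```python
import numpy as np
import ray
from ray import cloudpickle

ray.init(ignore_reinit_error=True)

payload = np.arange(1_000_000)
obj_ref = ray.put(payload)                    # tracked by Ray's reference counter
pickled_obj_ref = cloudpickle.dumps(obj_ref)  # out-of-band copy Ray cannot track

handoff = pickled_obj_ref  # e.g. stashed in a result dict for a later task to consume

# The serialized copy pins the object in the object store; deleting the local,
# trackable handles offsets that permanent reference so the memory can eventually
# be reclaimed, mirroring the `del obj_ref` / `del pickled_obj_ref` calls above.
del obj_ref
del pickled_obj_ref

restored_ref = cloudpickle.loads(handoff)  # a consumer rehydrates the reference
print(ray.get(restored_ref).shape)         # (1000000,)
```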
|
@@ -1,35 +1,35 @@
|
|
1
|
-
import logging
|
2
1
|
import json
|
2
|
+
import logging
|
3
3
|
|
4
|
-
from deltacat.storage import PartitionLocator
|
5
|
-
from deltacat.compute.compactor import RoundCompletionInfo
|
6
4
|
from deltacat import logs
|
5
|
+
from deltacat.compute.compactor import RoundCompletionInfo
|
6
|
+
from deltacat.storage import PartitionLocator
|
7
7
|
|
8
8
|
logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
|
9
9
|
|
10
10
|
|
11
11
|
def get_round_completion_file_s3_url(
|
12
|
-
|
13
|
-
|
14
|
-
pki_root_path: str) -> str:
|
12
|
+
bucket: str, source_partition_locator: PartitionLocator, pki_root_path: str
|
13
|
+
) -> str:
|
15
14
|
|
16
15
|
base_url = source_partition_locator.path(f"s3://{bucket}")
|
17
16
|
return f"{base_url}/{pki_root_path}.json"
|
18
17
|
|
19
18
|
|
20
19
|
def read_round_completion_file(
|
21
|
-
|
22
|
-
|
23
|
-
|
20
|
+
bucket: str,
|
21
|
+
source_partition_locator: PartitionLocator,
|
22
|
+
primary_key_index_root_path: str,
|
23
|
+
) -> RoundCompletionInfo:
|
24
24
|
|
25
25
|
from deltacat.aws import s3u as s3_utils
|
26
|
+
|
26
27
|
round_completion_file_url = get_round_completion_file_s3_url(
|
27
28
|
bucket,
|
28
29
|
source_partition_locator,
|
29
30
|
primary_key_index_root_path,
|
30
31
|
)
|
31
|
-
logger.info(
|
32
|
-
f"reading round completion file from: {round_completion_file_url}")
|
32
|
+
logger.info(f"reading round completion file from: {round_completion_file_url}")
|
33
33
|
round_completion_info = None
|
34
34
|
result = s3_utils.download(round_completion_file_url, False)
|
35
35
|
if result:
|
@@ -40,24 +40,23 @@ def read_round_completion_file(
|
|
40
40
|
|
41
41
|
|
42
42
|
def write_round_completion_file(
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
43
|
+
bucket: str,
|
44
|
+
source_partition_locator: PartitionLocator,
|
45
|
+
primary_key_index_root_path: str,
|
46
|
+
round_completion_info: RoundCompletionInfo,
|
47
|
+
) -> str:
|
47
48
|
|
48
49
|
from deltacat.aws import s3u as s3_utils
|
49
|
-
|
50
|
-
|
50
|
+
|
51
|
+
logger.info(f"writing round completion file contents: {round_completion_info}")
|
51
52
|
round_completion_file_s3_url = get_round_completion_file_s3_url(
|
52
53
|
bucket,
|
53
54
|
source_partition_locator,
|
54
55
|
primary_key_index_root_path,
|
55
56
|
)
|
56
|
-
logger.info(
|
57
|
-
f"writing round completion file to: {round_completion_file_s3_url}")
|
57
|
+
logger.info(f"writing round completion file to: {round_completion_file_s3_url}")
|
58
58
|
s3_utils.upload(
|
59
|
-
round_completion_file_s3_url,
|
60
|
-
str(json.dumps(round_completion_info))
|
59
|
+
round_completion_file_s3_url, str(json.dumps(round_completion_info))
|
61
60
|
)
|
62
|
-
logger.info(
|
63
|
-
|
61
|
+
logger.info(f"round completion file written to: {round_completion_file_s3_url}")
|
62
|
+
return round_completion_file_s3_url
|
deltacat/compute/compactor/utils/system_columns.py:

@@ -1,10 +1,11 @@
-import pyarrow as pa
-import numpy as np
 from itertools import repeat
 from typing import Union
 
-
+import numpy as np
+import pyarrow as pa
+
 from deltacat.compute.compactor import DeltaFileEnvelope
+from deltacat.storage import DeltaType
 
 _SYS_COL_UUID = "4000f124-dfbd-48c6-885b-7b22621a6d41"
 
@@ -65,10 +66,7 @@ _IS_SOURCE_COLUMN_FIELD = pa.field(
 
 
 def get_pk_hash_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
-    return pa.array(
-        obj,
-        _PK_HASH_COLUMN_TYPE
-    )
+    return pa.array(obj, _PK_HASH_COLUMN_TYPE)
 
 
 def pk_hash_column_np(table: pa.Table) -> np.ndarray:
@@ -79,6 +77,10 @@ def pk_hash_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_PK_HASH_COLUMN_NAME]
 
 
+def delta_type_column_np(table: pa.Table) -> np.ndarray:
+    return table[_DELTA_TYPE_COLUMN_NAME].to_numpy()
+
+
 def delta_type_column(table: pa.Table) -> pa.ChunkedArray:
     return table[_DELTA_TYPE_COLUMN_NAME]
 
@@ -101,8 +103,7 @@ def stream_position_column_np(table: pa.Table) -> np.ndarray:
     return table[_PARTITION_STREAM_POSITION_COLUMN_NAME].to_numpy()
 
 
-def get_file_index_column_array(obj)
-    -> Union[pa.Array, pa.ChunkedArray]:
+def get_file_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
         _ORDERED_FILE_IDX_COLUMN_TYPE,
@@ -113,8 +114,7 @@ def file_index_column_np(table: pa.Table) -> np.ndarray:
     return table[_ORDERED_FILE_IDX_COLUMN_NAME].to_numpy()
 
 
-def get_record_index_column_array(obj) ->
-    Union[pa.Array, pa.ChunkedArray]:
+def get_record_index_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
     return pa.array(
         obj,
         _ORDERED_RECORD_IDX_COLUMN_TYPE,
@@ -144,7 +144,8 @@ def get_is_source_column_array(obj) -> Union[pa.Array, pa.ChunkedArray]:
 
 
 def project_delta_file_metadata_on_table(
-
+    delta_file_envelope: DeltaFileEnvelope,
+) -> pa.Table:
 
     table = delta_file_envelope.table
 
@@ -181,42 +182,33 @@ def project_delta_file_metadata_on_table(
     return table
 
 
-def append_stream_position_column(
-        table: pa.Table,
-        stream_positions):
+def append_stream_position_column(table: pa.Table, stream_positions):
 
     table = table.append_column(
         _PARTITION_STREAM_POSITION_COLUMN_FIELD,
-        get_stream_position_column_array(stream_positions)
+        get_stream_position_column_array(stream_positions),
     )
     return table
 
 
-def append_file_idx_column(
-        table: pa.Table,
-        ordered_file_indices):
+def append_file_idx_column(table: pa.Table, ordered_file_indices):
 
     table = table.append_column(
         _ORDERED_FILE_IDX_COLUMN_FIELD,
-        get_file_index_column_array(ordered_file_indices)
+        get_file_index_column_array(ordered_file_indices),
     )
     return table
 
 
-def append_pk_hash_column(
-        table: pa.Table,
-        pk_hashes) -> pa.Table:
+def append_pk_hash_column(table: pa.Table, pk_hashes) -> pa.Table:
 
     table = table.append_column(
-        _PK_HASH_COLUMN_FIELD,
-        get_pk_hash_column_array(pk_hashes)
+        _PK_HASH_COLUMN_FIELD, get_pk_hash_column_array(pk_hashes)
     )
     return table
 
 
-def append_record_idx_col(
-        table: pa.Table,
-        ordered_record_indices) -> pa.Table:
+def append_record_idx_col(table: pa.Table, ordered_record_indices) -> pa.Table:
 
     table = table.append_column(
         _ORDERED_RECORD_IDX_COLUMN_FIELD,
@@ -225,9 +217,7 @@ def append_record_idx_col(
     return table
 
 
-def append_dedupe_task_idx_col(
-        table: pa.Table,
-        dedupe_task_indices) -> pa.Table:
+def append_dedupe_task_idx_col(table: pa.Table, dedupe_task_indices) -> pa.Table:
 
     table = table.append_column(
         _DEDUPE_TASK_IDX_COLUMN_FIELD,
@@ -244,9 +234,7 @@ def delta_type_from_field(delta_type_field: bool) -> DeltaType:
     return DeltaType.UPSERT if delta_type_field else DeltaType.DELETE
 
 
-def append_delta_type_col(
-        table: pa.Table,
-        delta_types) -> pa.Table:
+def append_delta_type_col(table: pa.Table, delta_types) -> pa.Table:
 
     table = table.append_column(
         _DELTA_TYPE_COLUMN_FIELD,
@@ -255,9 +243,7 @@ def append_delta_type_col(
     return table
 
 
-def append_is_source_col(
-        table: pa.Table,
-        booleans) -> pa.Table:
+def append_is_source_col(table: pa.Table, booleans) -> pa.Table:
 
     table = table.append_column(
         _IS_SOURCE_COLUMN_FIELD,
@@ -267,11 +253,13 @@ def append_is_source_col(
 
 
 def get_minimal_hb_schema() -> pa.schema:
-    return pa.schema(
-
-
-
-
-
-
-
+    return pa.schema(
+        [
+            _PK_HASH_COLUMN_FIELD,
+            _ORDERED_RECORD_IDX_COLUMN_FIELD,
+            _ORDERED_FILE_IDX_COLUMN_FIELD,
+            _PARTITION_STREAM_POSITION_COLUMN_FIELD,
+            _DELTA_TYPE_COLUMN_FIELD,
+            _IS_SOURCE_COLUMN_FIELD,
+        ]
+    )