deltacat 0.1.18b11__py3-none-any.whl → 0.1.18b13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deltacat/__init__.py CHANGED
@@ -43,7 +43,7 @@ from deltacat.types.tables import TableWriteMode
 
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
-__version__ = "0.1.18b11"
+__version__ = "0.1.18b13"
 
 
 __all__ = [
deltacat/compute/compactor/repartition_session.py CHANGED
@@ -144,7 +144,9 @@ def repartition(
     logger.info(f"repartition {repar_end - repar_start} seconds")
     logger.info(f"Got {len(ordered_deltas)} task results.")
     # ordered_deltas are ordered as [cold1, cold2, coldN, hot1, hot2, hotN]
-    merged_delta = Delta.merge_deltas(ordered_deltas)
+    merged_delta = Delta.merge_deltas(
+        ordered_deltas, stream_position=last_stream_position_to_compact
+    )
    compacted_delta = deltacat_storage.commit_delta(
        merged_delta, properties=kwargs.get("properties", {})
    )
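
Note on the repartition_session.py change above: a minimal sketch of what passing stream_position to Delta.merge_deltas accomplishes, assuming the argument simply pins the merged delta to last_stream_position_to_compact instead of letting it inherit a position from the per-task deltas. The SimpleDelta class and merge helper below are illustrative stand-ins, not DeltaCat APIs:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class SimpleDelta:
        # Hypothetical stand-in for deltacat.storage.Delta, for illustration only.
        manifest_entries: List[str]
        stream_position: Optional[int] = None

    def merge(deltas: List[SimpleDelta], stream_position: Optional[int] = None) -> SimpleDelta:
        # Mirrors the intent of Delta.merge_deltas(ordered_deltas, stream_position=...):
        # concatenate the inputs and, when provided, stamp the result with an explicit
        # stream position instead of deriving one from the inputs.
        entries = [e for d in deltas for e in d.manifest_entries]
        return SimpleDelta(entries, stream_position)

    merged = merge(
        [SimpleDelta(["cold1"]), SimpleDelta(["hot1"])],
        stream_position=42,  # e.g., last_stream_position_to_compact
    )
    print(merged.stream_position)  # 42
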
deltacat/compute/compactor/steps/repartition.py CHANGED
@@ -2,6 +2,7 @@ import importlib
 import logging
 from contextlib import nullcontext
 import pyarrow.compute as pc
+from deltacat.constants import SIGNED_INT64_MIN_VALUE, SIGNED_INT64_MAX_VALUE
 import pyarrow as pa
 from typing import List, Optional
 from deltacat.types.media import StorageType, ContentType
@@ -93,7 +94,9 @@ def repartition_range(
     if not all(column in table.column_names for table in tables):
         raise ValueError(f"Column {column} does not exist in the table")
     partition_ranges.sort()
-    partition_ranges = [-float("Inf")] + partition_ranges + [float("Inf")]
+    partition_ranges = (
+        [SIGNED_INT64_MIN_VALUE] + partition_ranges + [SIGNED_INT64_MAX_VALUE]
+    )
     partitioned_tables_list = [[] for _ in range(len(partition_ranges) - 1)]
 
     total_record_count = 0
@@ -106,6 +109,7 @@ def repartition_range(
             pa.field(col_name_int64, pa.int64()),
             pc.cast(table[column], pa.int64()),
         )
+        null_row_table = table_new.filter(pc.field(col_name_int64).is_null())
         # Iterate over pairs of values in partition_ranges
         for i, (lower_limit, upper_limit) in enumerate(
             zip(partition_ranges[:-1], partition_ranges[1:]), start=0
@@ -117,12 +121,19 @@ def repartition_range(
                     & (pc.field(col_name_int64) <= pc.scalar(upper_limit))
                 )
             )
+            if i == 0:
+                partitioned_tables_list[i].append(null_row_table)
+
     partition_table_length = 0
     # After re-grouping the tables by specified ranges, for each group, we need concat and stage the tables
     partition_deltas: List[Delta] = []
     for partition_tables in partitioned_tables_list:
         if len(partition_tables) > 0:
-            partition_table: pa.Table = pa.concat_tables(partition_tables)
+            print(f"column to be dropped: {col_name_int64}")
+            partition_table: pa.Table = pa.concat_tables(partition_tables).drop(
+                [col_name_int64]
+            )
+            assert col_name_int64 not in partition_table.schema.names
             if len(partition_table) > 0:
                 partition_table_length += len(partition_table)
                 partition_delta: Delta = deltacat_storage.stage_delta(
@@ -136,6 +147,7 @@ def repartition_range(
     assert (
         partition_table_length == total_record_count
     ), f"Repartitioned table should have the same number of records {partition_table_length} as the original table {total_record_count}"
+
     return RepartitionResult(
         range_deltas=partition_deltas,
     )
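
Taken together, the repartition_range changes above do three things: replace the float Inf sentinels with int64 bounds so the open-ended first and last ranges are expressed in the same type the partition column is cast to, route rows whose partition column is null into the first range bucket instead of silently dropping them, and drop the temporary int64 helper column (asserting it is gone) before staging each delta. A small standalone sketch of the filter and null behavior, using only pyarrow calls that already appear in the diff; the column name and values are made up:

    import pyarrow as pa
    import pyarrow.compute as pc

    SIGNED_INT64_MIN_VALUE = -(2**63)  # mirrors the new deltacat.constants values
    SIGNED_INT64_MAX_VALUE = 2**63 - 1

    table = pa.table({"last_updated": pa.array([None, 1, 5, 9], type=pa.int64())})

    # First range: (int64 min, 5]. A null value never satisfies a range predicate,
    # which is why the new code appends the null-row table to bucket 0 explicitly.
    in_first_range = table.filter(
        (pc.field("last_updated") > pc.scalar(SIGNED_INT64_MIN_VALUE))
        & (pc.field("last_updated") <= pc.scalar(5))
    )
    null_rows = table.filter(pc.field("last_updated").is_null())
    print(len(in_first_range), len(null_rows))  # 2 1
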
deltacat/compute/compactor/utils/primary_key_index.py CHANGED
@@ -1,180 +1,21 @@
-import json
 import logging
-from collections import defaultdict
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import numpy as np
 import pyarrow as pa
-import ray
-import s3fs
 from ray.types import ObjectRef
 
 from deltacat import logs
 from deltacat.aws import s3u
 from deltacat.compute.compactor import (
-    PrimaryKeyIndexLocator,
-    PrimaryKeyIndexMeta,
     PrimaryKeyIndexVersionLocator,
-    PrimaryKeyIndexVersionMeta,
-    PyArrowWriteResult,
-    RoundCompletionInfo,
 )
-from deltacat.compute.compactor.steps.rehash import rehash_bucket as rb
-from deltacat.compute.compactor.steps.rehash import rewrite_index as ri
-from deltacat.compute.compactor.utils import round_completion_file as rcf
 from deltacat.compute.compactor.utils import system_columns as sc
-from deltacat.constants import PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG
-from deltacat.storage import Manifest, PartitionLocator
-from deltacat.types.media import ContentEncoding, ContentType
-from deltacat.types.tables import get_table_slicer, get_table_writer
-from deltacat.utils.common import ReadKwargsProvider
-from deltacat.utils.ray_utils.concurrency import invoke_parallel
 from deltacat.io.object_store import IObjectStore
 
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-def rehash(
-    options_provider: Callable[[int, Any], Dict[str, Any]],
-    s3_bucket: str,
-    source_partition_locator: PartitionLocator,
-    old_rci: RoundCompletionInfo,
-    new_hash_bucket_count: int,
-    hash_bucket_index_group_count: int,
-    records_per_primary_key_index_file: int,
-    delete_old_primary_key_index: bool,
-) -> RoundCompletionInfo:
-
-    logger.info(
-        f"Rehashing primary key index. Old round completion info: "
-        f"{old_rci}. New hash bucket count: {new_hash_bucket_count}"
-    )
-
-    # collect old primary key index information
-    old_pki_version_locator = old_rci.primary_key_index_version_locator
-    old_pkiv_meta = old_pki_version_locator.primary_key_index_version_meta
-    old_pki_meta = old_pkiv_meta.primary_key_index_meta
-    old_compacted_partition_locator = old_pki_meta.compacted_partition_locator
-    if old_pkiv_meta.hash_bucket_count == new_hash_bucket_count:
-        raise ValueError(
-            f"Primary key index rehash failed. Old hash bucket "
-            f"count ({new_hash_bucket_count}) is "
-            f"equal to new hash bucket count. Partition: "
-            f"{old_compacted_partition_locator}."
-        )
-
-    # generate a new unique primary key index version locator to rehash into
-    new_pki_meta = PrimaryKeyIndexMeta.of(
-        old_compacted_partition_locator,
-        old_pki_meta.primary_keys,
-        old_pki_meta.sort_keys,
-        old_pki_meta.primary_key_index_algorithm_version,
-    )
-    new_pki_locator = PrimaryKeyIndexLocator.of(new_pki_meta)
-    new_pki_version_meta = PrimaryKeyIndexVersionMeta.of(
-        new_pki_meta,
-        new_hash_bucket_count,
-    )
-    rehashed_pki_version_locator = PrimaryKeyIndexVersionLocator.generate(
-        new_pki_version_meta
-    )
-
-    # launch a rehash task for each bucket of the old primary key index version
-    old_hash_bucket_count = old_pkiv_meta.hash_bucket_count
-    hb_tasks_pending = invoke_parallel(
-        items=range(old_hash_bucket_count),
-        ray_task=rb.rehash_bucket,
-        max_parallelism=None,
-        options_provider=options_provider,
-        s3_bucket=s3_bucket,
-        old_pki_version_locator=old_pki_version_locator,
-        num_buckets=new_hash_bucket_count,
-        num_groups=hash_bucket_index_group_count,
-    )
-    logger.info(f"Getting {len(hb_tasks_pending)} rehash bucket results...")
-    hb_results = ray.get([t[0] for t in hb_tasks_pending])
-    logger.info(f"Got {len(hb_results)} rehash bucket results.")
-    all_hash_group_idx_to_obj_id = defaultdict(list)
-    for hash_group_idx_to_obj_id in hb_results:
-        for hash_group_index, object_id in enumerate(hash_group_idx_to_obj_id):
-            if object_id:
-                all_hash_group_idx_to_obj_id[hash_group_index].append(object_id)
-    hash_group_count = len(all_hash_group_idx_to_obj_id)
-    logger.info(f"Rehash bucket groups created: {hash_group_count}")
-
-    # write primary key index files for each rehashed output bucket
-    pki_stats_promises = invoke_parallel(
-        items=all_hash_group_idx_to_obj_id.values(),
-        ray_task=ri.rewrite_index,
-        max_parallelism=None,
-        options_provider=options_provider,
-        s3_bucket=s3_bucket,
-        new_primary_key_index_version_locator=rehashed_pki_version_locator,
-        max_records_per_index_file=records_per_primary_key_index_file,
-    )
-    logger.info(f"Getting {len(pki_stats_promises)} rewrite index results...")
-    pki_stats = ray.get([t[0] for t in pki_stats_promises])
-    logger.info(f"Got {len(pki_stats)} rewrite index results.")
-
-    round_completion_info = RoundCompletionInfo.of(
-        old_rci.high_watermark,
-        old_rci.compacted_delta_locator,
-        old_rci.compacted_pyarrow_write_result,
-        PyArrowWriteResult.union(pki_stats),
-        old_rci.sort_keys_bit_width,
-        rehashed_pki_version_locator,
-        old_rci.rebase_source_partition_locator,
-    )
-    rcf.write_round_completion_file(
-        s3_bucket,
-        source_partition_locator,
-        new_pki_locator.primary_key_index_root_path,
-        round_completion_info,
-    )
-    if delete_old_primary_key_index:
-        delete_primary_key_index_version(
-            s3_bucket,
-            old_pki_version_locator,
-        )
-    logger.info(
-        f"Rehashed primary key index. New round completion info: "
-        f"{round_completion_info}."
-    )
-    return round_completion_info
-
-
-def download_hash_bucket_entries(
-    s3_bucket: str,
-    hash_bucket_index: int,
-    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
-) -> List[pa.Table]:
-
-    pk_index_manifest_s3_url = (
-        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
-            s3_bucket,
-            hash_bucket_index,
-        )
-    )
-    result = s3u.download(pk_index_manifest_s3_url, False)
-    logger.info(
-        f"Downloading primary key index hash bucket manifest entries: "
-        f"{pk_index_manifest_s3_url}. Primary key index version "
-        f"locator: {primary_key_index_version_locator}"
-    )
-    pk_index_manifest = Manifest(json.loads(result["Body"].read().decode("utf-8")))
-    tables = s3u.download_manifest_entries(
-        pk_index_manifest, file_reader_kwargs_provider=file_reader_kwargs_provider
-    )
-    if not tables:
-        logger.warning(
-            f"Primary key index manifest is empty at: "
-            f"{pk_index_manifest_s3_url}. Primary key index version "
-            f"locator: {primary_key_index_version_locator}"
-        )
-    return tables
-
-
 def delete_primary_key_index_version(
     s3_bucket: str, pki_version_locator: PrimaryKeyIndexVersionLocator
 ) -> None:
@@ -243,65 +84,3 @@ def pk_digest_to_hash_bucket_index(digest, num_buckets: int) -> int:
     """
 
     return int.from_bytes(digest, "big") % num_buckets
-
-
-def write_primary_key_index_files(
-    table: pa.Table,
-    primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    s3_bucket: str,
-    hb_index: int,
-    records_per_index_file: int,
-) -> PyArrowWriteResult:
-    """
-    Writes primary key index files for the given hash bucket index out to the
-    specified S3 bucket at the path identified by the given primary key index
-    version locator. Output is written as 1 or more Parquet files with the
-    given maximum number of records per file.
-
-    TODO(raghumdani): Support writing primary key index to any data catalog
-    """
-    logger.info(
-        f"Writing primary key index files for hash bucket {hb_index}. "
-        f"Primary key index version locator: "
-        f"{primary_key_index_version_locator}."
-    )
-    s3_file_system = s3fs.S3FileSystem(
-        anon=False,
-        s3_additional_kwargs={
-            "ContentType": ContentType.PARQUET.value,
-            "ContentEncoding": ContentEncoding.IDENTITY.value,
-        },
-        config_kwargs=PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG,
-    )
-    pkiv_hb_index_s3_url_base = (
-        primary_key_index_version_locator.get_pkiv_hb_index_s3_url_base(
-            s3_bucket, hb_index
-        )
-    )
-    manifest_entries = s3u.upload_sliced_table(
-        table,
-        pkiv_hb_index_s3_url_base,
-        s3_file_system,
-        records_per_index_file,
-        get_table_writer(table),
-        get_table_slicer(table),
-    )
-    manifest = Manifest.of(manifest_entries)
-    pkiv_hb_index_s3_manifest_s3_url = (
-        primary_key_index_version_locator.get_pkiv_hb_index_manifest_s3_url(
-            s3_bucket, hb_index
-        )
-    )
-    s3u.upload(pkiv_hb_index_s3_manifest_s3_url, str(json.dumps(manifest)))
-    result = PyArrowWriteResult.of(
-        len(manifest_entries),
-        table.nbytes,
-        manifest.meta.content_length,
-        len(table),
-    )
-    logger.info(
-        f"Wrote primary key index files for hash bucket {hb_index}. "
-        f"Primary key index version locator: "
-        f"{primary_key_index_version_locator}. Result: {result}"
-    )
-    return result
deltacat/constants.py CHANGED
@@ -36,6 +36,9 @@ BYTES_PER_GIBIBYTE = 2**30
 BYTES_PER_TEBIBYTE = 2**40
 BYTES_PER_PEBIBYTE = 2**50
 
+SIGNED_INT64_MIN_VALUE = -(2**63)
+SIGNED_INT64_MAX_VALUE = 2**63 - 1
+
 # Inflation multiplier from snappy-compressed parquet to pyarrow.
 # This should be kept larger than actual average inflation multipliers.
 # Note that this is a very rough guess since actual observed pyarrow
@@ -49,8 +52,4 @@ PYARROW_INFLATION_MULTIPLIER = 2.5
 # Inflation multiplier from snappy-compressed parquet to pyarrow for all columns.
 PYARROW_INFLATION_MULTIPLIER_ALL_COLUMNS = 6
 
-PRIMARY_KEY_INDEX_WRITE_BOTO3_CONFIG = {
-    "retries": {"max_attempts": 25, "mode": "standard"}
-}
-
 MEMORY_TO_HASH_BUCKET_COUNT_RATIO = 0.0512 * BYTES_PER_TEBIBYTE
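
A brief illustration of why dedicated int64 sentinels were added to constants.py (this is my reading of the change, so treat the motivation as an assumption): float("Inf") has no int64 representation, while the new constants are exactly the bounds of a signed 64-bit integer and therefore of Arrow's int64 type:

    import pyarrow as pa
    import pyarrow.compute as pc

    SIGNED_INT64_MIN_VALUE = -(2**63)
    SIGNED_INT64_MAX_VALUE = 2**63 - 1

    # Both sentinels round-trip through Arrow's int64 type unchanged.
    assert pa.scalar(SIGNED_INT64_MAX_VALUE, type=pa.int64()).as_py() == 2**63 - 1

    # Infinity does not: a (default, safe) cast to int64 is rejected.
    try:
        pc.cast(pa.scalar(float("Inf")), pa.int64())
    except pa.ArrowInvalid as err:
        print(f"float('Inf') cannot serve as an int64 range bound: {err}")
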
deltacat/tests/test_repartition.py CHANGED
@@ -188,6 +188,36 @@ class TestRepartitionRange(unittest.TestCase):
         )
         self.assertEqual(len(result.range_deltas), 2)
 
+    def test_null_rows_are_not_dropped(self):
+        # Add null value to the first table
+        tables_with_null = [
+            pa.table(
+                {
+                    "last_updated": [
+                        None,
+                        1678665487112746,
+                        1678665487112747,
+                        1678665487112748,
+                    ]
+                }
+            ),
+            self.tables[1],
+        ]
+
+        result = repartition_range(
+            tables_with_null,
+            self.destination_partition,
+            self.repartition_args,
+            self.max_records_per_output_file,
+            self.repartitioned_file_content_type,
+            self.deltacat_storage,
+        )
+
+        # Assuming range_deltas is a list of DataFrames,
+        # check that the first DataFrame has the null value in the 'last_updated' column
+        # This may need to be adjusted depending on the actual structure of range_deltas
+        self.assertEqual(len(result.range_deltas), 2)
+
 
 if __name__ == "__main__":
     unittest.main()
deltacat/utils/placement.py CHANGED
@@ -229,8 +229,13 @@ class PlacementGroupManager:
 
     def get_current_node_resource_key(self) -> str:
         # on ec2: address="172.31.34.51:6379"
-        # on manta: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
-        current_node_name = ray.experimental.internal_kv.global_gcs_client.address[:-5]
+        # on AWS Glue for Ray: address = "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379"
+        (
+            current_node_name,
+            _,
+        ) = ray.experimental.internal_kv.global_gcs_client.address.rsplit(
+            ":", 1
+        )  # using rsplit split on the last occurence of delimiter ":"
         for node in ray.nodes():
             if node["NodeName"] == current_node_name:
                 # Found the node.
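
The placement.py fix replaces a fixed-width slice (address[:-5], which assumes the port is exactly four digits) with rsplit on the final colon, so the colons inside an IPv6 address survive. A quick illustration with made-up addresses:

    addresses = [
        "172.31.34.51:6379",                             # IPv4 + port
        "2600:1f10:4674:6815:aadb:2dc8:de61:bc8e:6379",  # IPv6 + port
    ]
    for address in addresses:
        node_name, port = address.rsplit(":", 1)
        # Only the trailing ":<port>" segment is removed, regardless of how many
        # colons appear earlier in the address.
        print(node_name, port)
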
deltacat-0.1.18b13.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.1.18b11
+Version: 0.1.18b13
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
deltacat-0.1.18b13.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
-deltacat/__init__.py,sha256=-pROroKFHbLQAMruWQRdiPV5IEfyY12EgCXKDrSBkbw,1811
-deltacat/constants.py,sha256=oMU8ypqvDBTG54-6MLGWrt9iJKTN-HKsSWxEWnWp77c,1969
+deltacat/__init__.py,sha256=_t2_FxNTDhr42lxts3cV8iHgCrw_PAT3pIx7MHSA5Ro,1811
+deltacat/constants.py,sha256=_6oRI-3yp5c8J1qKGQZrt89I9-ttT_gSSvVsJ0h8Duc,1939
 deltacat/exceptions.py,sha256=x7qem7FLujXf-DzPsNcQ-XYkW3cF3A0YGIbxkcpz0Mw,146
 deltacat/logs.py,sha256=yyve_6Y4bLWAdCOnxFOPrSR9FRXwZuh68_rRoPpmg08,5633
 deltacat/aws/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,7 +18,7 @@ deltacat/catalog/model/table_definition.py,sha256=tKrM1mmaQlvxqXrLt3QJVZK5BZfaJn
 deltacat/compute/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/__init__.py,sha256=kmWC-Qnw861k7mPhLH4fQEL6CaMeBql2AipHeFqJ2uI,1127
 deltacat/compute/compactor/compaction_session.py,sha256=21Ai6esOqw9nhXIpbVQteLvROIPeiqpDg1iBsOclais,25946
-deltacat/compute/compactor/repartition_session.py,sha256=t76aZ-bZxqPOjkTfCH3wHXR93DYkwXQxojqUdCdERfQ,6923
+deltacat/compute/compactor/repartition_session.py,sha256=IYBygwvoAGAY6uftZ3C4bAW0VKPfGuKjkdbpr6_FnCo,6986
 deltacat/compute/compactor/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/model/compact_partition_params.py,sha256=QvjH10IsA8O6ufVzwPz-mcw326BT-Zbs29wFGCcGerA,5677
 deltacat/compute/compactor/model/compaction_session_audit_info.py,sha256=TKgFFdd38cplihdMtHja-cBTwk3dflEipc8smWtZlGg,25231
@@ -37,13 +37,10 @@ deltacat/compute/compactor/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 deltacat/compute/compactor/steps/dedupe.py,sha256=R6p43mOUWgA1t468FS8JU-Wlrr96tt0ccwa0uytuaRY,10063
 deltacat/compute/compactor/steps/hash_bucket.py,sha256=ZzJQWulSOMve7bDZX7ZRuYAl4bSC4U5SJzPhpeGpKB0,9769
 deltacat/compute/compactor/steps/materialize.py,sha256=mXxKSaPL7iYtqP-eiJlFwi8kuywFmiU5FLS2-DW5314,13964
-deltacat/compute/compactor/steps/repartition.py,sha256=lpvxhiTC27MKqUXPN70H5L-FcLA1-yCCElERQq74Zig,9487
-deltacat/compute/compactor/steps/rehash/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/compute/compactor/steps/rehash/rehash_bucket.py,sha256=yh-sBuUI3hqw2vk_nK9o-KDrgSww4oSvAz2hBxTkv8s,1765
-deltacat/compute/compactor/steps/rehash/rewrite_index.py,sha256=-HVM08pk5ROHEgDP-FVty55-a_0dsGRiSnPlNJw7C6Q,1838
+deltacat/compute/compactor/steps/repartition.py,sha256=EH843SI33fporpxQbeBmEQdvogSYmVYih6hUkxXlZ9w,9953
 deltacat/compute/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/compute/compactor/utils/io.py,sha256=itraIfLGUFfVFrW-XHnsEEa9GNIJR4VCnav0LyjHons,16543
-deltacat/compute/compactor/utils/primary_key_index.py,sha256=Y8MBkDMS4N9xgJpuqWcdqpdNbfrfycIABrKlGZwfoRM,11359
+deltacat/compute/compactor/utils/primary_key_index.py,sha256=ldcgWqnwCfnGSmUWpe68zvFO7SfOXCrytLTISQ3KwNY,2866
 deltacat/compute/compactor/utils/round_completion_file.py,sha256=DmZfHeAXlQn0DDdcsIHZROHWfyBCKTY3pNUdHzalqkE,2284
 deltacat/compute/compactor/utils/system_columns.py,sha256=I36NAEGwRegv56ouVLwTCCisyoOupDCbbaxtoFDzYTE,8121
 deltacat/compute/metastats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -96,7 +93,7 @@ deltacat/storage/model/table.py,sha256=IOu1ZOrdRkVDB-FOxYMRvnNf5TukIDfbdHWTqHYN_
 deltacat/storage/model/table_version.py,sha256=j57er3zlN0_2kwVMpWZ3iouABO-Kl8_Txi0UWIZ0dtk,7034
 deltacat/storage/model/types.py,sha256=-9yPA5wjZf9jOd-iErf4sN-YD-6fbl2z8m8t1lGa0I0,2061
 deltacat/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-deltacat/tests/test_repartition.py,sha256=xzqdfRzZS-bA1yBdPNxelecTFe2MtON5Lrd-jTGZ4Xk,7245
+deltacat/tests/test_repartition.py,sha256=dzFkmSB9QmrqJWj2JVxhHS-sefiOVljTd0vVoGFS_L0,8265
 deltacat/tests/compactor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 deltacat/tests/compactor/test_compact_partition_params.py,sha256=0h0cXNg-1NslQ98Nld7brD1WHHhzzBZR1x16kUd7MdA,8848
 deltacat/tests/compactor/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,7 +120,7 @@ deltacat/utils/metrics.py,sha256=1CHb5f9SXvTeKljjGawK6wmyij0HN9X6ixMiTssbT_w,467
 deltacat/utils/numpy.py,sha256=ZiGREobTVT6IZXgPxkSUpLJFN2Hn8KEZcrqybLDXCIA,2027
 deltacat/utils/pandas.py,sha256=eGOpiZE1zLznTtuwoN80j4PBp1_bUV8SE4c951r0a3o,9561
 deltacat/utils/performance.py,sha256=rC3CPfroZP3T5TbRNZXB9GRBr0F9i2KUeZYL45JBgCU,610
-deltacat/utils/placement.py,sha256=JE6OsW16VonlMhdH5B2IYuLJxItoYguaKpZNgbpMNLw,11066
+deltacat/utils/placement.py,sha256=6ppSypvmkVH5twN-UdAmDaNLJkBaGnJ2DDMv5NmNv4o,11210
 deltacat/utils/pyarrow.py,sha256=dgAruwOpWYSlnJ5w8iJz_NWpfQoZHA_iG-F7CBDieko,18245
 deltacat/utils/resources.py,sha256=fA53NiJOd5rLMtwvuTnqTyq4g59deD6NCGDbX5yIlg8,2908
 deltacat/utils/ray_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -132,8 +129,8 @@ deltacat/utils/ray_utils/concurrency.py,sha256=AyL7hpvYjkmsz-KcpYjVgPpNsmu-x8-rl
 deltacat/utils/ray_utils/dataset.py,sha256=SIljK3UkSqQ6Ntit_iSiYt9yYjN_gGrCTX6_72XdQ3w,3244
 deltacat/utils/ray_utils/performance.py,sha256=d7JFM7vTXHzkGx9qNQcZzUWajnqINvYRwaM088_FpsE,464
 deltacat/utils/ray_utils/runtime.py,sha256=xOVkqL6o8qGsewGvzhMKxmCcqcFZDnNILuz5IGMgxSc,4991
-deltacat-0.1.18b11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-deltacat-0.1.18b11.dist-info/METADATA,sha256=H0JKD8faSlKHvi44zSoCE9cA8IXKzwxydHL49eVt3vI,1558
-deltacat-0.1.18b11.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
-deltacat-0.1.18b11.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
-deltacat-0.1.18b11.dist-info/RECORD,,
+deltacat-0.1.18b13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+deltacat-0.1.18b13.dist-info/METADATA,sha256=BxNxkho94qIqKJWLh4ShgkrA1BPKKQd1so2e3YV8z5U,1558
+deltacat-0.1.18b13.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
+deltacat-0.1.18b13.dist-info/top_level.txt,sha256=RWdIcid4Bv2i2ozLVh-70kJpyB61xEKXod9XXGpiono,9
+deltacat-0.1.18b13.dist-info/RECORD,,
deltacat/compute/compactor/steps/rehash/rehash_bucket.py DELETED
@@ -1,57 +0,0 @@
-import logging
-from typing import List, Tuple
-
-import numpy as np
-import pyarrow as pa
-import ray
-from ray.types import ObjectRef
-
-from deltacat import logs
-from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator
-from deltacat.compute.compactor.utils import primary_key_index as pki
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-def group_file_records_by_pk_hash_bucket(
-    pki_table: pa.Table, num_buckets: int
-) -> np.ndarray:
-    # generate the new table for each new hash bucket
-    hash_bucket_to_indices = pki.group_record_indices_by_hash_bucket(
-        pki_table,
-        num_buckets,
-    )
-    hash_bucket_to_table = np.empty([num_buckets], dtype="object")
-    for hash_bucket, indices in enumerate(hash_bucket_to_indices):
-        if indices:
-            hash_bucket_to_table[hash_bucket] = pki_table.take(indices)
-    return hash_bucket_to_table
-
-
-@ray.remote(num_cpus=1, num_returns=2)
-def rehash_bucket(
-    hash_bucket_index: int,
-    s3_bucket: str,
-    old_pki_version_locator: PrimaryKeyIndexVersionLocator,
-    num_buckets: int,
-    num_groups: int,
-) -> Tuple[np.ndarray, List[ObjectRef]]:
-
-    logger.info(f"Starting rehash bucket task...")
-    tables = pki.download_hash_bucket_entries(
-        s3_bucket,
-        hash_bucket_index,
-        old_pki_version_locator,
-    )
-    prior_pk_index_table = pa.concat_tables(tables)
-    hash_bucket_to_table = group_file_records_by_pk_hash_bucket(
-        prior_pk_index_table,
-        num_buckets,
-    )
-    hash_bucket_group_to_obj_id, object_refs = pki.group_hash_bucket_indices(
-        hash_bucket_to_table,
-        num_buckets,
-        num_groups,
-    )
-    logger.info(f"Finished rehash bucket task...")
-    return hash_bucket_group_to_obj_id, object_refs
deltacat/compute/compactor/steps/rehash/rewrite_index.py DELETED
@@ -1,48 +0,0 @@
-import logging
-from collections import defaultdict
-from typing import Any, List, Tuple
-
-import pyarrow as pa
-import ray
-from ray import cloudpickle
-from ray.types import ObjectRef
-
-from deltacat import logs
-from deltacat.compute.compactor import PrimaryKeyIndexVersionLocator, PyArrowWriteResult
-from deltacat.compute.compactor.utils import primary_key_index as pki
-
-logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
-
-
-@ray.remote(num_cpus=1, num_returns=2)
-def rewrite_index(
-    object_ids: List[Any],
-    s3_bucket: str,
-    new_primary_key_index_version_locator: PrimaryKeyIndexVersionLocator,
-    max_records_per_index_file: int,
-) -> Tuple[PyArrowWriteResult, List[ObjectRef]]:
-
-    logger.info(f"Starting rewrite primary key index task...")
-    object_refs = [cloudpickle.loads(obj_id_pkl) for obj_id_pkl in object_ids]
-    logger.info(f"Getting table groups object refs...")
-    table_groups_list = ray.get(object_refs)
-    logger.info(f"Got {len(table_groups_list)} table groups object refs...")
-    hb_index_to_tables = defaultdict(list)
-    for table_groups in table_groups_list:
-        for hb_index, table in enumerate(table_groups):
-            if table is not None:
-                hb_index_to_tables[hb_index].append(table)
-    logger.info(f"Running {len(hb_index_to_tables)} rewrite index rounds...")
-    pki_stats = []
-    for hb_index, tables in hb_index_to_tables.items():
-        table = pa.concat_tables(tables)
-        hb_pki_stats = pki.write_primary_key_index_files(
-            table,
-            new_primary_key_index_version_locator,
-            s3_bucket,
-            hb_index,
-            max_records_per_index_file,
-        )
-        pki_stats.append(hb_pki_stats)
-    logger.info(f"Finished rewrite primary key index task...")
-    return PyArrowWriteResult.union(pki_stats), object_refs