deltacat 0.2.9__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +1 -1
- deltacat/aws/redshift/__init__.py +4 -0
- deltacat/aws/redshift/model/manifest.py +93 -1
- deltacat/aws/s3u.py +250 -111
- deltacat/catalog/default_catalog_impl/__init__.py +369 -0
- deltacat/compute/compactor_v2/compaction_session.py +175 -152
- deltacat/compute/compactor_v2/model/hash_bucket_input.py +6 -0
- deltacat/compute/compactor_v2/model/merge_file_group.py +213 -0
- deltacat/compute/compactor_v2/model/merge_input.py +8 -24
- deltacat/compute/compactor_v2/model/merge_result.py +1 -0
- deltacat/compute/compactor_v2/steps/hash_bucket.py +4 -56
- deltacat/compute/compactor_v2/steps/merge.py +106 -171
- deltacat/compute/compactor_v2/utils/delta.py +97 -0
- deltacat/compute/compactor_v2/utils/merge.py +126 -0
- deltacat/compute/compactor_v2/utils/task_options.py +47 -4
- deltacat/compute/merge_on_read/__init__.py +4 -0
- deltacat/compute/merge_on_read/daft.py +40 -0
- deltacat/compute/merge_on_read/model/__init__.py +0 -0
- deltacat/compute/merge_on_read/model/merge_on_read_params.py +66 -0
- deltacat/compute/merge_on_read/utils/__init__.py +0 -0
- deltacat/compute/merge_on_read/utils/delta.py +42 -0
- deltacat/storage/interface.py +10 -2
- deltacat/storage/model/types.py +3 -11
- deltacat/tests/catalog/__init__.py +0 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +98 -0
- deltacat/tests/compute/compact_partition_test_cases.py +126 -1
- deltacat/tests/compute/test_compact_partition_incremental.py +4 -1
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +9 -2
- deltacat/tests/local_deltacat_storage/__init__.py +19 -2
- deltacat/tests/test_utils/pyarrow.py +33 -14
- deltacat/tests/utils/test_daft.py +42 -2
- deltacat/types/media.py +5 -0
- deltacat/types/tables.py +7 -1
- deltacat/utils/daft.py +78 -13
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA +2 -2
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/RECORD +39 -27
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/LICENSE +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/WHEEL +0 -0
- {deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/top_level.txt +0 -0
deltacat/tests/compute/compact_partition_test_cases.py
CHANGED
@@ -442,6 +442,33 @@ INCREMENTAL_TEST_CASES: Dict[str, IncrementalCompactionTestCaseParams] = {
         drop_duplicates=True,
         skip_enabled_compact_partition_drivers=None,
     ),
+    "12-incremental-decimal-single-hash-bucket": IncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[SortKey.of(key_name="sk_col_1")],
+        partition_keys=ZERO_VALUED_PARTITION_KEYS_PARAM,
+        partition_values=ZERO_VALUED_PARTITION_VALUES_PARAM,
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([0.1] * 4 + [0.2] * 4 + [0.3] * 4 + [0.4] * 4 + [0.5] * 4),
+                pa.array([i for i in range(20)]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([0.1, 0.2, 0.3, 0.4, 0.5]),
+                pa.array([3, 7, 11, 15, 19]),
+            ],
+            names=["pk_col_1", "sk_col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
 }
 
 REBASE_THEN_INCREMENTAL_TEST_CASES = {
@@ -1091,6 +1118,104 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
         skip_enabled_compact_partition_drivers=[CompactorVersion.V1],
     ),
     "14-rebase-then-empty-incremental-delta": RebaseThenIncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas=None,
+        incremental_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=3,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
+    "15-rebase-then-incremental-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
+        primary_keys={"pk_col_1"},
+        sort_keys=[
+            SortKey.of(key_name="sk_col_1"),
+            SortKey.of(key_name="sk_col_2"),
+        ],
+        partition_keys=[PartitionKey.of("region_id", PartitionKeyType.INT)],
+        partition_values=["1"],
+        input_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        input_deltas_delta_type=DeltaType.UPSERT,
+        rebase_expected_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(0, 10)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(10, 20)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        incremental_deltas_delta_type=DeltaType.UPSERT,
+        expected_terminal_compact_partition_result=pa.Table.from_arrays(
+            [
+                pa.array([str(i) for i in range(10)]),
+                pa.array([i for i in range(20, 30)]),
+                pa.array(["foo"] * 10),
+                pa.array([i / 10 for i in range(40, 50)]),
+            ],
+            names=["pk_col_1", "sk_col_1", "sk_col_2", "col_1"],
+        ),
+        do_create_placement_group=False,
+        records_per_compacted_file=DEFAULT_MAX_RECORDS_PER_FILE,
+        hash_bucket_count=1,
+        read_kwargs_provider=None,
+        drop_duplicates=True,
+        skip_enabled_compact_partition_drivers=None,
+    ),
+    "16-rebase-then-empty-incremental-delta-hash-bucket-single": RebaseThenIncrementalCompactionTestCaseParams(
         primary_keys={"pk_col_1"},
         sort_keys=[
             SortKey.of(key_name="sk_col_1"),
@@ -1137,9 +1262,9 @@ REBASE_THEN_INCREMENTAL_TEST_CASES = {
     ),
 }
 
-
 INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(INCREMENTAL_TEST_CASES)
 
+
 REBASE_THEN_INCREMENTAL_TEST_CASES = with_compactor_version_func_test_param(
     REBASE_THEN_INCREMENTAL_TEST_CASES
 )
deltacat/tests/compute/test_compact_partition_incremental.py
CHANGED
@@ -7,6 +7,7 @@ from typing import Any, Callable, Dict, List, Optional, Set
 from boto3.resources.base import ServiceResource
 import pyarrow as pa
 from pytest_benchmark.fixture import BenchmarkFixture
+from deltacat.types.media import StorageType
 
 from deltacat.tests.compute.test_util_common import (
     get_rcf,
@@ -269,7 +270,9 @@ def test_compact_partition_incremental(
         **compaction_audit_obj
     )
 
-    tables = ds.download_delta(
+    tables = ds.download_delta(
+        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+    )
     actual_compacted_table = pa.concat_tables(tables)
     sorting_cols: List[Any] = [(val, "ascending") for val in primary_keys]
     # the compacted table may contain multiple files and chunks
deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py
CHANGED
@@ -32,6 +32,7 @@ from deltacat.tests.compute.compact_partition_test_cases import (
     REBASE_THEN_INCREMENTAL_TEST_CASES,
 )
 from typing import Any, Callable, Dict, List, Optional, Set
+from deltacat.types.media import StorageType
 
 DATABASE_FILE_PATH_KEY, DATABASE_FILE_PATH_VALUE = (
     "db_file_path",
@@ -272,7 +273,9 @@ def test_compact_partition_rebase_then_incremental(
     compacted_delta_locator: DeltaLocator = get_compacted_delta_locator_from_rcf(
         setup_s3_resource, rcf_file_s3_uri
     )
-    tables = ds.download_delta(
+    tables = ds.download_delta(
+        compacted_delta_locator, storage_type=StorageType.LOCAL, **ds_mock_kwargs
+    )
     actual_rebase_compacted_table = pa.concat_tables(tables)
     # if no primary key is specified then sort by sort_key for consistent assertion
     sorting_cols: List[Any] = (
@@ -341,7 +344,11 @@ def test_compact_partition_rebase_then_incremental(
         **compaction_audit_obj
     )
 
-    tables = ds.download_delta(
+    tables = ds.download_delta(
+        compacted_delta_locator_incremental,
+        storage_type=StorageType.LOCAL,
+        **ds_mock_kwargs,
+    )
     actual_compacted_table = pa.concat_tables(tables)
     expected_terminal_compact_partition_result = (
         expected_terminal_compact_partition_result.combine_chunks().sort_by(
deltacat/tests/local_deltacat_storage/__init__.py
CHANGED
@@ -1,10 +1,12 @@
 from typing import Any, Callable, Dict, List, Optional, Set, Union, Tuple
 
 import pyarrow as pa
+import daft
 import json
 import sqlite3
 from sqlite3 import Cursor, Connection
 import uuid
+import ray
 import io
 
 from deltacat.tests.test_utils.storage import create_empty_delta
@@ -38,7 +40,13 @@ from deltacat.storage import (
     ManifestEntry,
     ManifestEntryList,
 )
-from deltacat.types.media import
+from deltacat.types.media import (
+    ContentType,
+    StorageType,
+    TableType,
+    ContentEncoding,
+    DistributedDatasetType,
+)
 from deltacat.utils.common import ReadKwargsProvider
 
 SQLITE_CUR_ARG = "sqlite3_cur"
@@ -337,9 +345,10 @@ def download_delta(
     columns: Optional[List[str]] = None,
     file_reader_kwargs_provider: Optional[ReadKwargsProvider] = None,
     ray_options_provider: Callable[[int, Any], Dict[str, Any]] = None,
+    distributed_dataset_type: DistributedDatasetType = DistributedDatasetType.RAY_DATASET,
     *args,
     **kwargs,
-) -> Union[LocalDataset, DistributedDataset]:
+) -> Union[LocalDataset, DistributedDataset]:  # type: ignore
     result = []
     manifest = get_delta_manifest(delta_like, *args, **kwargs)
 
@@ -356,6 +365,14 @@ def download_delta(
             )
         )
 
+    if storage_type == StorageType.DISTRIBUTED:
+        if distributed_dataset_type is DistributedDatasetType.DAFT:
+            return daft.from_arrow(result)
+        elif distributed_dataset_type is DistributedDatasetType.RAY_DATASET:
+            return ray.data.from_arrow(result)
+        else:
+            raise ValueError(f"Dataset type {distributed_dataset_type} not supported!")
+
     return result
 
 
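The download_delta hunks above make the local storage stub honor StorageType.DISTRIBUTED together with the new distributed_dataset_type switch, returning either a Ray Dataset or a Daft DataFrame instead of a list of local tables. A minimal sketch of how a caller might exercise the Daft branch; the helper name, delta_locator, and db_kwargs are illustrative placeholders, not identifiers from this diff:

from typing import Any, Dict

import deltacat.tests.local_deltacat_storage as ds
from deltacat.types.media import DistributedDatasetType, StorageType


def read_compacted_delta_as_daft(delta_locator, db_kwargs: Dict[str, Any]):
    # delta_locator: a DeltaLocator from the test fixtures; db_kwargs: the sqlite
    # storage kwargs. Both are supplied by the caller in this sketch.
    return ds.download_delta(
        delta_locator,
        storage_type=StorageType.DISTRIBUTED,
        distributed_dataset_type=DistributedDatasetType.DAFT,
        **db_kwargs,
    )

With the default distributed_dataset_type of DistributedDatasetType.RAY_DATASET, the same call returns ray.data.from_arrow(result) instead.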
deltacat/tests/test_utils/pyarrow.py
CHANGED
@@ -1,14 +1,18 @@
-from typing import List
+from typing import List, Optional
 import pyarrow as pa
-from deltacat.storage import Delta, Partition
+from deltacat.storage import Delta, Partition, PartitionLocator
 import deltacat.tests.local_deltacat_storage as ds
 
 
 def create_delta_from_csv_file(
-    namespace: str,
+    namespace: str,
+    file_paths: List[str],
+    table_name: Optional[str] = None,
+    *args,
+    **kwargs
 ) -> Delta:
     staged_partition = stage_partition_from_file_paths(
-        namespace, file_paths, *args, **kwargs
+        namespace, file_paths, *args, table_name=table_name, **kwargs
     )
 
     committed_delta = commit_delta_to_staged_partition(
@@ -19,10 +23,15 @@ def create_delta_from_csv_file(
 
 
 def stage_partition_from_file_paths(
-    namespace: str,
+    namespace: str,
+    file_paths: List[str],
+    table_name: Optional[str] = None,
+    *args,
+    **kwargs
 ) -> Partition:
     ds.create_namespace(namespace, {}, **kwargs)
-    table_name
+    if table_name is None:
+        table_name = "-".join(file_paths).replace("/", "_")
     ds.create_table_version(namespace, table_name, "1", **kwargs)
     stream = ds.get_stream(namespace, table_name, "1", **kwargs)
     staged_partition = ds.stage_partition(stream, [], **kwargs)
@@ -31,19 +40,29 @@ def stage_partition_from_file_paths(
 
 
 def commit_delta_to_staged_partition(
     staged_partition, file_paths: List[str], *args, **kwargs
+) -> Delta:
+    committed_delta = commit_delta_to_partition(
+        staged_partition, file_paths=file_paths, *args, **kwargs
+    )
+    ds.commit_partition(staged_partition, **kwargs)
+    return committed_delta
+
+
+def commit_delta_to_partition(
+    partition: Partition, file_paths: List[str], *args, **kwargs
 ) -> Delta:
     tables = []
 
+    if isinstance(partition, PartitionLocator):
+        partition = ds.get_partition(
+            partition.stream_locator, partition.partition_values, *args, **kwargs
+        )
+
     for file_path in file_paths:
         table = pa.csv.read_csv(file_path)
         tables.append(table)
-    deltas = []
 
-
-
-    deltas.append(delta)
+    table = pa.concat_tables(tables)
+    staged_delta = ds.stage_delta(table, partition, **kwargs)
 
-
-    committed_delta = ds.commit_delta(merged_delta, **kwargs)
-    ds.commit_partition(staged_partition, **kwargs)
-    return committed_delta
+    return ds.commit_delta(staged_delta, **kwargs)
deltacat/tests/utils/test_daft.py
CHANGED
@@ -1,6 +1,6 @@
 import unittest
 from deltacat.types.media import ContentEncoding, ContentType
-from deltacat.utils.daft import daft_s3_file_to_table
+from deltacat.utils.daft import daft_s3_file_to_table, s3_files_to_dataframe
 
 from deltacat.utils.pyarrow import ReadKwargsProviderPyArrowSchemaOverride
 from deltacat.types.partial_download import PartialParquetParameters
@@ -9,7 +9,7 @@ import pyarrow as pa
 from pyarrow import parquet as pq
 
 
-class TestDaftParquetReader(unittest.TestCase):
+class TestDaftS3FileToTable(unittest.TestCase):
     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
 
     def test_read_from_s3_all_columns(self):
@@ -121,5 +121,45 @@ class TestDaftParquetReader(unittest.TestCase):
         self.assertEqual(table.num_rows, 10)
 
 
+class TestDaftS3FilesToDataFrame(unittest.TestCase):
+    MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"
+
+    def test_read_from_s3_all_columns(self):
+        df = s3_files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True},
+        )
+
+        table = df.to_arrow()
+        self.assertEqual(table.schema.names, ["a", "b"])
+        self.assertEqual(table.num_rows, 100)
+
+    def test_does_not_read_from_s3_if_not_materialized(self):
+        df = s3_files_to_dataframe(
+            uris=[self.MVP_PATH],
+            content_encoding=ContentEncoding.IDENTITY.value,
+            content_type=ContentType.PARQUET.value,
+            ray_init_options={"local_mode": True},
+        )
+
+        self.assertRaises(RuntimeError, lambda: len(df))
+        df.collect()
+        self.assertEqual(len(df), 100)
+
+    def test_raises_error_if_not_supported_content_type(self):
+
+        self.assertRaises(
+            AssertionError,
+            lambda: s3_files_to_dataframe(
+                uris=[self.MVP_PATH],
+                content_encoding=ContentEncoding.IDENTITY.value,
+                content_type=ContentType.UNESCAPED_TSV.value,
+                ray_init_options={"local_mode": True},
+            ),
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
deltacat/types/media.py
CHANGED
deltacat/types/tables.py
CHANGED
@@ -15,10 +15,11 @@ from ray.data.read_api import (
 )
 
 import deltacat.storage as dcs
-from deltacat.types.media import TableType
+from deltacat.types.media import TableType, DistributedDatasetType
 from deltacat.utils import numpy as np_utils
 from deltacat.utils import pandas as pd_utils
 from deltacat.utils import pyarrow as pa_utils
+from deltacat.utils import daft as daft_utils
 from deltacat.utils.ray_utils import dataset as ds_utils
 
 TABLE_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
@@ -78,6 +79,11 @@ TABLE_TYPE_TO_DATASET_CREATE_FUNC_REFS: Dict[str, Callable] = {
 }
 
 
+DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC: Dict[int, Callable] = {
+    DistributedDatasetType.DAFT.value: daft_utils.s3_files_to_dataframe
+}
+
+
 class TableWriteMode(str, Enum):
     """
     Enum controlling how a given dataset will be written to a table.
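The new DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC registry follows the same lookup pattern as the existing TABLE_TYPE_TO_READER_FUNC table: resolve a reader by dataset-type value, then call it with the S3 read arguments. A rough sketch of that dispatch, using a placeholder URI that is not taken from this diff:

from deltacat.types.media import ContentEncoding, ContentType, DistributedDatasetType
from deltacat.types.tables import DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC

# Resolve the Daft reader registered above and point it at a placeholder parquet file.
reader = DISTRIBUTED_DATASET_TYPE_TO_READER_FUNC[DistributedDatasetType.DAFT.value]
df = reader(
    uris=["s3://example-bucket/example.parquet"],  # placeholder URI
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
)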
deltacat/utils/daft.py
CHANGED
@@ -1,8 +1,9 @@
 import logging
-from typing import Optional, List
-
+from typing import Optional, List, Any, Dict, Callable
+import daft
+import ray
 from daft.table import read_parquet_into_pyarrow
-from daft import TimeUnit
+from daft import TimeUnit, DataFrame
 from daft.io import IOConfig, S3Config
 import pyarrow as pa
 
@@ -22,6 +23,66 @@ from deltacat.types.partial_download import (
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
+def s3_files_to_dataframe(
+    uris: List[str],
+    content_type: str,
+    content_encoding: str,
+    column_names: Optional[List[str]] = None,
+    include_columns: Optional[List[str]] = None,
+    read_func_kwargs_provider: Optional[ReadKwargsProvider] = None,
+    ray_options_provider: Optional[Callable[[int, Any], Dict[str, Any]]] = None,
+    s3_client_kwargs: Optional[Any] = None,
+    ray_init_options: Optional[Dict[str, Any]] = None,
+) -> DataFrame:
+
+    if ray_init_options is None:
+        ray_init_options = {}
+
+    assert (
+        content_type == ContentType.PARQUET.value
+    ), f"daft native reader currently only supports parquet, got {content_type}"
+
+    assert (
+        content_encoding == ContentEncoding.IDENTITY.value
+    ), f"daft native reader currently only supports identity encoding, got {content_encoding}"
+
+    if not ray.is_initialized():
+        ray.init(address="auto", ignore_reinit_error=True, **ray_init_options)
+
+    daft.context.set_runner_ray(noop_if_initialized=True)
+
+    if s3_client_kwargs is None:
+        s3_client_kwargs = {}
+
+    kwargs = {}
+    if read_func_kwargs_provider is not None:
+        kwargs = read_func_kwargs_provider(content_type, kwargs)
+
+    # TODO(raghumdani): pass in coerce_int96_timestamp arg
+    # https://github.com/Eventual-Inc/Daft/issues/1894
+
+    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
+
+    logger.debug(
+        f"Preparing to read S3 object from {len(uris)} files into daft dataframe"
+    )
+
+    df, latency = timed_invocation(
+        daft.read_parquet, path=uris, io_config=io_config, use_native_downloader=True
+    )
+
+    logger.debug(f"Time to create daft dataframe from {len(uris)} files is {latency}s")
+
+    columns_to_read = include_columns or column_names
+
+    logger.debug(f"Taking columns {columns_to_read} from the daft df.")
+
+    if columns_to_read:
+        return df.select(*columns_to_read)
+    else:
+        return df
+
+
 def daft_s3_file_to_table(
     s3_url: str,
     content_type: str,
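The s3_files_to_dataframe reader added above is exercised end to end by the TestDaftS3FilesToDataFrame cases earlier in this diff. As a quick standalone sketch (the bucket path and column names below are placeholders, not values from this diff), a call looks like:

from deltacat.types.media import ContentEncoding, ContentType
from deltacat.utils.daft import s3_files_to_dataframe

# Placeholder URIs and columns; s3_client_kwargs would normally carry AWS credentials.
df = s3_files_to_dataframe(
    uris=["s3://example-bucket/compacted/part-0.parquet"],
    content_type=ContentType.PARQUET.value,
    content_encoding=ContentEncoding.IDENTITY.value,
    include_columns=["pk_col_1", "sk_col_1"],
)
table = df.to_arrow()  # materializes the lazy Daft DataFrame into a pyarrow.Table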
@@ -55,16 +116,7 @@ def daft_s3_file_to_table(
     ):
         row_groups = partial_file_download_params.row_groups_to_download
 
-    io_config = IOConfig(
-        s3=S3Config(
-            key_id=s3_client_kwargs.get("aws_access_key_id"),
-            access_key=s3_client_kwargs.get("aws_secret_access_key"),
-            session_token=s3_client_kwargs.get("aws_session_token"),
-            retry_mode="adaptive",
-            num_tries=BOTO_MAX_RETRIES,
-            max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
-        )
-    )
+    io_config = _get_s3_io_config(s3_client_kwargs=s3_client_kwargs)
 
     logger.debug(f"Preparing to read S3 object from {s3_url} into daft table")
 
@@ -95,3 +147,16 @@ def daft_s3_file_to_table(
         return coerce_pyarrow_table_to_schema(pa_table, input_schema)
     else:
         return pa_table
+
+
+def _get_s3_io_config(s3_client_kwargs) -> IOConfig:
+    return IOConfig(
+        s3=S3Config(
+            key_id=s3_client_kwargs.get("aws_access_key_id"),
+            access_key=s3_client_kwargs.get("aws_secret_access_key"),
+            session_token=s3_client_kwargs.get("aws_session_token"),
+            retry_mode="adaptive",
+            num_tries=BOTO_MAX_RETRIES,
+            max_connections=DAFT_MAX_S3_CONNECTIONS_PER_FILE,
+        )
+    )
{deltacat-0.2.9.dist-info → deltacat-1.0.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deltacat
-Version: 0.2.9
+Version: 1.0.0
 Summary: A scalable, fast, ACID-compliant Data Catalog powered by Ray.
 Home-page: https://github.com/ray-project/deltacat
 Author: Ray Team
@@ -27,7 +27,7 @@ Requires-Dist: tenacity ==8.1.0
 Requires-Dist: typing-extensions ==4.4.0
 Requires-Dist: pymemcache ==4.0.0
 Requires-Dist: redis ==4.6.0
-Requires-Dist: getdaft ==0.2.
+Requires-Dist: getdaft ==0.2.16
 Requires-Dist: schedule ==1.2.0
 
 # DeltaCAT