deltacat 2.0.0b9__py3-none-any.whl → 2.0.0b11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +41 -16
- deltacat/api.py +478 -123
- deltacat/aws/s3u.py +2 -2
- deltacat/benchmarking/benchmark_engine.py +4 -2
- deltacat/benchmarking/conftest.py +1 -1
- deltacat/benchmarking/test_benchmark_pipeline.py +6 -4
- deltacat/catalog/__init__.py +62 -5
- deltacat/catalog/main/impl.py +26 -10
- deltacat/catalog/model/catalog.py +165 -109
- deltacat/catalog/model/properties.py +25 -24
- deltacat/compute/__init__.py +14 -0
- deltacat/compute/converter/constants.py +5 -0
- deltacat/compute/converter/converter_session.py +78 -36
- deltacat/compute/converter/model/convert_input.py +24 -4
- deltacat/compute/converter/model/convert_result.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +52 -10
- deltacat/compute/converter/pyiceberg/overrides.py +181 -62
- deltacat/compute/converter/steps/convert.py +84 -36
- deltacat/compute/converter/steps/dedupe.py +25 -4
- deltacat/compute/converter/utils/convert_task_options.py +42 -13
- deltacat/compute/converter/utils/iceberg_columns.py +5 -0
- deltacat/compute/converter/utils/io.py +82 -11
- deltacat/compute/converter/utils/s3u.py +13 -4
- deltacat/compute/jobs/client.py +406 -0
- deltacat/constants.py +5 -6
- deltacat/env.py +10 -0
- deltacat/examples/basic_logging.py +6 -6
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_bucket_writer.py +3 -5
- deltacat/examples/{iceberg → experimental/iceberg}/iceberg_reader.py +2 -4
- deltacat/examples/hello_world.py +4 -2
- deltacat/examples/indexer/indexer.py +163 -0
- deltacat/examples/indexer/job_runner.py +198 -0
- deltacat/experimental/catalog/iceberg/__init__.py +6 -0
- deltacat/{catalog → experimental/catalog}/iceberg/iceberg_catalog_config.py +1 -1
- deltacat/{catalog → experimental/catalog}/iceberg/impl.py +27 -9
- deltacat/{storage → experimental/storage}/iceberg/iceberg_scan_planner.py +1 -1
- deltacat/{storage → experimental/storage}/iceberg/impl.py +1 -1
- deltacat/experimental/storage/rivulet/__init__.py +11 -0
- deltacat/{storage → experimental/storage}/rivulet/arrow/serializer.py +7 -4
- deltacat/{storage → experimental/storage}/rivulet/dataset.py +13 -9
- deltacat/{storage → experimental/storage}/rivulet/dataset_executor.py +12 -20
- deltacat/experimental/storage/rivulet/feather/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/feather/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/feather/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/fs/file_provider.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/fs/file_store.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/fs/output_file.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/logical_plan.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/metastore/delta.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/metastore/json_sst.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst.py +2 -2
- deltacat/{storage → experimental/storage}/rivulet/metastore/sst_interval_tree.py +3 -3
- deltacat/experimental/storage/rivulet/parquet/__init__.py +7 -0
- deltacat/{storage → experimental/storage}/rivulet/parquet/file_reader.py +7 -5
- deltacat/{storage → experimental/storage}/rivulet/parquet/serializer.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/block_scanner.py +20 -9
- deltacat/{storage → experimental/storage}/rivulet/reader/data_reader.py +3 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/data_scan.py +5 -3
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_metastore.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/reader/dataset_reader.py +8 -6
- deltacat/{storage → experimental/storage}/rivulet/reader/pyarrow_data_reader.py +4 -1
- deltacat/{storage → experimental/storage}/rivulet/reader/reader_type_registrar.py +4 -4
- deltacat/{storage → experimental/storage}/rivulet/schema/schema.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer.py +1 -1
- deltacat/{storage → experimental/storage}/rivulet/serializer_factory.py +9 -5
- deltacat/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/experimental/storage/rivulet/shard/range_shard.py +129 -0
- deltacat/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/{storage → experimental/storage}/rivulet/writer/memtable_dataset_writer.py +20 -9
- deltacat/io/__init__.py +13 -0
- deltacat/io/dataset/__init__.py +0 -0
- deltacat/io/dataset/deltacat_dataset.py +91 -0
- deltacat/io/datasink/__init__.py +0 -0
- deltacat/io/datasink/deltacat_datasink.py +207 -0
- deltacat/io/datasource/__init__.py +0 -0
- deltacat/io/datasource/deltacat_datasource.py +580 -0
- deltacat/io/reader/__init__.py +0 -0
- deltacat/io/reader/deltacat_read_api.py +172 -0
- deltacat/storage/__init__.py +2 -0
- deltacat/storage/model/expression/__init__.py +47 -0
- deltacat/storage/model/expression/expression.py +656 -0
- deltacat/storage/model/expression/visitor.py +248 -0
- deltacat/storage/model/metafile.py +74 -42
- deltacat/storage/model/scan/push_down.py +32 -5
- deltacat/storage/model/shard.py +6 -2
- deltacat/storage/model/types.py +5 -3
- deltacat/tests/_io/reader/__init__.py +0 -0
- deltacat/tests/_io/reader/test_deltacat_read_api.py +0 -0
- deltacat/tests/catalog/data/__init__.py +0 -0
- deltacat/tests/catalog/main/__init__.py +0 -0
- deltacat/tests/catalog/main/test_catalog_impl_namespace_operations.py +130 -0
- deltacat/tests/catalog/main/test_catalog_impl_table_operations.py +436 -0
- deltacat/tests/catalog/model/__init__.py +0 -0
- deltacat/tests/catalog/model/test_table_definition.py +16 -0
- deltacat/tests/catalog/test_catalogs.py +52 -98
- deltacat/tests/catalog/test_default_catalog_impl.py +1 -2
- deltacat/tests/compute/converter/test_convert_session.py +209 -46
- deltacat/tests/daft/__init__.py +0 -0
- deltacat/tests/daft/test_model.py +97 -0
- deltacat/tests/experimental/__init__.py +0 -0
- deltacat/tests/experimental/catalog/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/__init__.py +0 -0
- deltacat/tests/experimental/catalog/iceberg/test_iceberg_catalog.py +71 -0
- deltacat/tests/experimental/daft/__init__.py +0 -0
- deltacat/tests/experimental/daft/test_deltacat_daft_integration.py +136 -0
- deltacat/tests/experimental/storage/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/conftest.py +3 -3
- deltacat/tests/experimental/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/fs/test_file_location_provider.py +3 -2
- deltacat/tests/experimental/storage/rivulet/reader/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/reader/query_expression.py +80 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_data_scan.py +119 -0
- deltacat/tests/experimental/storage/rivulet/reader/test_dataset_metastore.py +71 -0
- deltacat/tests/experimental/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/schema/test_schema.py +1 -1
- deltacat/tests/experimental/storage/rivulet/shard/__init__.py +0 -0
- deltacat/tests/experimental/storage/rivulet/shard/test_range_shard.py +162 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/test_dataset.py +6 -4
- deltacat/tests/{storage → experimental/storage}/rivulet/test_manifest.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_sst_interval_tree.py +5 -5
- deltacat/tests/{storage → experimental/storage}/rivulet/test_utils.py +8 -6
- deltacat/tests/experimental/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_write_then_read.py +11 -9
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_dataset_writer.py +2 -2
- deltacat/tests/{storage → experimental/storage}/rivulet/writer/test_memtable_dataset_writer.py +7 -7
- deltacat/tests/local_deltacat_storage/__init__.py +1 -0
- deltacat/tests/storage/model/test_expression.py +327 -0
- deltacat/tests/storage/model/test_shard.py +3 -1
- deltacat/tests/test_deltacat_api.py +50 -9
- deltacat/types/media.py +141 -43
- deltacat/types/tables.py +35 -7
- deltacat/utils/daft.py +531 -5
- deltacat/utils/export.py +3 -1
- deltacat/utils/filesystem.py +39 -9
- deltacat/utils/polars.py +128 -0
- deltacat/utils/pyarrow.py +151 -15
- deltacat/utils/ray_utils/concurrency.py +1 -1
- deltacat/utils/ray_utils/runtime.py +56 -4
- deltacat/utils/url.py +1284 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/METADATA +11 -9
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/RECORD +168 -123
- deltacat/catalog/iceberg/__init__.py +0 -4
- deltacat/daft/daft_scan.py +0 -111
- deltacat/daft/model.py +0 -258
- deltacat/examples/common/fixtures.py +0 -15
- deltacat/storage/rivulet/__init__.py +0 -11
- deltacat/storage/rivulet/feather/__init__.py +0 -5
- deltacat/storage/rivulet/parquet/__init__.py +0 -5
- /deltacat/{daft → compute/jobs}/__init__.py +0 -0
- /deltacat/examples/{common → experimental}/__init__.py +0 -0
- /deltacat/examples/{iceberg → experimental/iceberg}/__init__.py +0 -0
- /deltacat/{storage/iceberg → examples/indexer}/__init__.py +0 -0
- /deltacat/{storage/rivulet/arrow → examples/indexer/aws}/__init__.py +0 -0
- /deltacat/{storage/rivulet/fs → examples/indexer/gcp}/__init__.py +0 -0
- /deltacat/{storage/rivulet/metastore → experimental/catalog}/__init__.py +0 -0
- /deltacat/{catalog → experimental/catalog}/iceberg/overrides.py +0 -0
- /deltacat/{storage/rivulet/reader → experimental/storage}/__init__.py +0 -0
- /deltacat/{storage/rivulet/schema → experimental/storage/iceberg}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/iceberg/model.py +0 -0
- /deltacat/{storage/rivulet/writer → experimental/storage/rivulet/arrow}/__init__.py +0 -0
- /deltacat/{tests/storage/rivulet → experimental/storage/rivulet/fs}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/fs/input_file.py +0 -0
- /deltacat/{tests/storage/rivulet/fs → experimental/storage/rivulet/metastore}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/Table.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/mvp/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/parquet/data_reader.py +0 -0
- /deltacat/{tests/storage/rivulet/schema → experimental/storage/rivulet/reader}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/reader/query_expression.py +0 -0
- /deltacat/{tests/storage/rivulet/writer → experimental/storage/rivulet/schema}/__init__.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/schema/datatype.py +0 -0
- /deltacat/{storage → experimental/storage}/rivulet/writer/dataset_writer.py +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/LICENSE +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/WHEEL +0 -0
- {deltacat-2.0.0b9.dist-info → deltacat-2.0.0b11.dist-info}/top_level.txt +0 -0
deltacat/utils/polars.py
ADDED
@@ -0,0 +1,128 @@
+import logging
+from typing import Optional, List, Dict, Callable, Union
+
+import polars as pl
+
+from fsspec import AbstractFileSystem
+from ray.data.datasource import FilenameProvider
+
+from deltacat import logs
+
+from deltacat.types.media import ContentType
+
+logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
+
+
+def write_json(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_ndjson(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_ndjson(f, **write_kwargs)
+
+
+def write_csv(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_csv(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_csv(f, **write_kwargs)
+
+
+def write_avro(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_avro(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_avro(f, **write_kwargs)
+
+
+def write_parquet(
+    table: pl.DataFrame,
+    path: str,
+    *,
+    filesystem: Optional[AbstractFileSystem] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem:
+        table.write_parquet(path, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            table.write_parquet(f, **write_kwargs)
+
+
+CONTENT_TYPE_TO_PL_WRITE_FUNC: Dict[str, Callable] = {
+    # TODO (pdames): add support for other delimited text content types as
+    # pyarrow adds support for custom delimiters, escaping, and None value
+    # representations to pyarrow.csv.WriteOptions.
+    ContentType.AVRO.value: write_avro,
+    ContentType.CSV.value: write_csv,
+    ContentType.PARQUET.value: write_parquet,
+    ContentType.JSON.value: write_json,
+}
+
+
+def slice_table(table: pl.DataFrame, max_len: Optional[int]) -> List[pl.DataFrame]:
+    """
+    Iteratively create 0-copy table slices.
+    """
+    if max_len is None:
+        return [table]
+    tables = []
+    offset = 0
+    records_remaining = len(table)
+    while records_remaining > 0:
+        records_this_entry = min(max_len, records_remaining)
+        tables.append(table.slice(offset, records_this_entry))
+        records_remaining -= records_this_entry
+        offset += records_this_entry
+    return tables
+
+
+def dataframe_size(table: pl.DataFrame) -> int:
+    return table.estimated_size()
+
+
+def dataframe_to_file(
+    table: pl.DataFrame,
+    base_path: str,
+    file_system: Optional[AbstractFileSystem],
+    block_path_provider: Union[Callable, FilenameProvider],
+    content_type: str = ContentType.PARQUET.value,
+    **kwargs,
+) -> None:
+    """
+    Writes the given Polars DataFrame to a file.
+    """
+    writer = CONTENT_TYPE_TO_PL_WRITE_FUNC.get(content_type)
+    if not writer:
+        raise NotImplementedError(
+            f"Polars writer for content type '{content_type}' not "
+            f"implemented. Known content types: "
+            f"{CONTENT_TYPE_TO_PL_WRITE_FUNC.keys()}"
+        )
+    path = block_path_provider(base_path)
+    logger.debug(f"Writing table: {table} with kwargs: {kwargs} to path: {path}")
+    writer(table, path, filesystem=file_system, **kwargs)
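As a quick orientation to the new module, below is a minimal usage sketch (not part of the diff) that routes a Polars DataFrame through the content-type dispatch shown above. The output directory and the lambda-based block path provider are illustrative assumptions.

import os

import polars as pl

from deltacat.types.media import ContentType
from deltacat.utils.polars import dataframe_to_file, slice_table

# Illustrative local output directory (an assumption, not from the diff).
base = "/tmp/deltacat_polars_example"
os.makedirs(base, exist_ok=True)

df = pl.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

# Split into zero-copy slices of at most 2 rows, then write each slice to
# its own Parquet file via the content-type dispatch in dataframe_to_file.
for i, chunk in enumerate(slice_table(df, max_len=2)):
    dataframe_to_file(
        chunk,
        base,
        None,  # no fsspec filesystem: fall back to Polars' native path I/O
        lambda base_path, i=i: f"{base_path}/part-{i}.parquet",
        content_type=ContentType.PARQUET.value,
    )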
deltacat/utils/pyarrow.py
CHANGED
@@ -13,11 +13,14 @@ from deltacat.exceptions import ContentTypeValidationError
 import pyarrow as pa
 import numpy as np
 import pyarrow.compute as pc
+import pyarrow.fs as pafs
+
 from fsspec import AbstractFileSystem
 from pyarrow import csv as pacsv
 from pyarrow import feather as paf
 from pyarrow import json as pajson
 from pyarrow import parquet as papq
+from pyarrow import orc as paorc
 from ray.data.datasource import FilenameProvider
 from deltacat.utils.s3fs import create_s3_file_system

@@ -40,8 +43,10 @@ from deltacat.utils.arguments import (
     sanitize_kwargs_to_callable,
     sanitize_kwargs_by_supported_kwargs,
 )
+from deltacat.utils.filesystem import resolve_path_and_filesystem
 from functools import lru_cache

+
 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
@@ -103,6 +108,82 @@ def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
         raise e


+# TODO(pdames): Remove deprecated S3-only readers.
+def read_csv(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return pacsv.read_csv(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return pacsv.read_csv(f, **read_kwargs)
+
+
+def read_feather(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return paf.read_feather(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return paf.read_feather(f, **read_kwargs)
+
+
+def read_json(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return pajson.read_json(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return pajson.read_json(f, **read_kwargs)
+
+
+def read_orc(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return paorc.read_table(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return paorc.read_table(f, **read_kwargs)
+
+
+def read_parquet(
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **read_kwargs,
+) -> pa.Table:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_input_stream(path, **fs_open_kwargs) as f:
+            return papq.read_table(f, **read_kwargs)
+    with filesystem.open(path, "rb", **fs_open_kwargs) as f:
+        return papq.read_table(f, **read_kwargs)
+
+
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
@@ -118,24 +199,78 @@ CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {


 def write_feather(
-    table: pa.Table,
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
 ) -> None:
-
-
-
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            paf.write_feather(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            paf.write_feather(table, f, **write_kwargs)


 def write_csv(
-    table: pa.Table,
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            pacsv.write_csv(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            # TODO (pdames): Add support for client-specified compression types.
+            with pa.CompressedOutputStream(f, ContentEncoding.GZIP.value) as out:
+                if write_kwargs.get("write_options") is None:
+                    # column names are kept in table metadata, so omit header
+                    write_kwargs["write_options"] = pacsv.WriteOptions(
+                        include_header=False
+                    )
+                pacsv.write_csv(table, out, **write_kwargs)
+
+
+def write_orc(
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
 ) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            paorc.write_table(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            paorc.write_table(table, f, **write_kwargs)
+

-
-
-
-
-
-
+def write_parquet(
+    table: pa.Table,
+    path: str,
+    *,
+    filesystem: Optional[Union[AbstractFileSystem, pafs.FileSystem]] = None,
+    fs_open_kwargs: Dict[str, any] = {},
+    **write_kwargs,
+) -> None:
+    if not filesystem or isinstance(filesystem, pafs.FileSystem):
+        path, filesystem = resolve_path_and_filesystem(path)
+        with filesystem.open_output_stream(path, **fs_open_kwargs) as f:
+            papq.write_table(table, f, **write_kwargs)
+    else:
+        with filesystem.open(path, "wb", **fs_open_kwargs) as f:
+            papq.write_table(table, f, **write_kwargs)


 CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
@@ -143,7 +278,8 @@ CONTENT_TYPE_TO_PA_WRITE_FUNC: Dict[str, Callable] = {
     # pyarrow adds support for custom delimiters, escaping, and None value
     # representations to pyarrow.csv.WriteOptions.
     ContentType.CSV.value: write_csv,
-    ContentType.
+    ContentType.ORC.value: write_orc,
+    ContentType.PARQUET.value: write_parquet,
     ContentType.FEATHER.value: write_feather,
 }

@@ -180,7 +316,7 @@ def content_type_to_reader_kwargs(content_type: str) -> Dict[str, Any]:
 ENCODING_TO_FILE_INIT: Dict[str, Callable] = {
     ContentEncoding.GZIP.value: partial(gzip.open, mode="rb"),
     ContentEncoding.BZIP2.value: partial(bz2.open, mode="rb"),
-    ContentEncoding.IDENTITY.value: lambda
+    ContentEncoding.IDENTITY.value: lambda file_path: file_path,
 }


@@ -522,7 +658,7 @@ def parquet_file_size(table: papq.ParquetFile) -> int:
 def table_to_file(
     table: pa.Table,
     base_path: str,
-    file_system: AbstractFileSystem,
+    file_system: Optional[AbstractFileSystem],
     block_path_provider: Union[Callable, FilenameProvider],
     content_type: str = ContentType.PARQUET.value,
     **kwargs,
deltacat/utils/ray_utils/concurrency.py
CHANGED
@@ -88,7 +88,7 @@ def round_robin_options_provider(
     **kwargs,
 ) -> Dict[str, Any]:
     """Returns a resource dictionary that can be included with ray remote
-    options to round
+    options to round-robin indexed tasks or actors across a list of resource
     keys. For example, the following code round-robins 100 tasks across all
     live cluster nodes:
     ```
deltacat/utils/ray_utils/runtime.py
CHANGED
@@ -21,7 +21,7 @@ def node_resource_keys(
     keys = []
     node_dict = ray.nodes()
     if node_dict:
-        for node in
+        for node in node_dict:
             if filter_fn(node):
                 for key in node["Resources"].keys():
                     if key.startswith("node:"):
@@ -37,7 +37,7 @@ def current_node_resource_key() -> str:
     actors on that node via:
     `foo.options(resources={get_current_node_resource_key(): 0.01}).remote()`
     """
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     keys = node_resource_keys(lambda n: n["NodeID"] == current_node_id)
     assert (
         len(keys) <= 1
@@ -45,6 +45,47 @@ def current_node_resource_key() -> str:
     return keys[0] if len(keys) == 1 else None


+def current_node_resources() -> Dict[str, float]:
+    """Gets Ray's resources for the current node as a dictionary.
+
+    Example Return Value:
+    >>> {
+    >>>     'memory': 17611605607.0,
+    >>>     'node:127.0.0.1': 1.0,
+    >>>     'node:__internal_head__': 1.0,
+    >>>     'object_store_memory': 2147483648.0,
+    >>>     'CPU': 10.0,
+    >>> }
+    """
+    current_node_id = ray.get_runtime_context().get_node_id()
+    node_dict = ray.nodes()
+    if node_dict:
+        for node in node_dict:
+            if node["NodeID"] == current_node_id:
+                return node["Resources"]
+    else:
+        raise ValueError("No node dictionary found on current node.")
+    return {}
+
+
+def find_max_single_node_resource_type(resource_type: str) -> float:
+    """Finds the max resource amount available on any single cluster node
+    for the given resource type. Returns the max resource amount as a float."""
+    node_dict = ray.nodes()
+    max_single_node_resource_amount = 0
+    if node_dict:
+        for node in node_dict:
+            node_resource_amount = node["Resources"].get(resource_type)
+            if node_resource_amount is not None:
+                max_single_node_resource_amount = max(
+                    max_single_node_resource_amount,
+                    node_resource_amount,
+                )
+    else:
+        raise ValueError("No node dictionary found on current node.")
+    return max_single_node_resource_amount
+
+
 def is_node_alive(node: Dict[str, Any]) -> bool:
     """Takes a node from `ray.nodes()` as input. Returns True if the node is
     alive, and False otherwise."""
@@ -67,6 +108,17 @@ def live_node_waiter(min_live_nodes: int, poll_interval_seconds: float = 0.5) ->
         time.sleep(poll_interval_seconds)


+def live_cpu_waiter(min_live_cpus: int, poll_interval_seconds: float = 0.5) -> None:
+    """Waits until the given minimum number of live CPUs are present in the
+    cluster. Checks the current number of live CPUs every
+    `poll_interval_seconds`."""
+    live_cpus = cluster_cpus()
+    while live_cpus < min_live_cpus:
+        live_cpus = cluster_cpus()
+        logger.info(f"Waiting for Live CPUs: {live_cpus}/{min_live_cpus}")
+        time.sleep(poll_interval_seconds)
+
+
 def live_node_resource_keys() -> List[str]:
     """Get Ray resource keys for all live cluster nodes as a list of strings of
     the form: "node:{node_resource_name}". The returned keys can be used to
@@ -83,7 +135,7 @@ def other_live_node_resource_keys() -> List[str]:

     For example, invoking this function from your Ray application driver on the
     head node returns the resource keys of all live worker nodes."""
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     return node_resource_keys(
         lambda n: n["NodeID"] != current_node_id and is_node_alive(n)
     )
@@ -97,7 +149,7 @@ def other_node_resource_keys() -> List[str]:

     For example, invoking this function from your Ray application driver on the
     head node returns the resource keys of all worker nodes."""
-    current_node_id = ray.get_runtime_context().get_node_id()
+    current_node_id = ray.get_runtime_context().get_node_id()
     return node_resource_keys(lambda n: n["NodeID"] != current_node_id)
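Finally, a brief illustrative sketch (not part of the diff) of the new Ray runtime helpers; the single-node ray.init() call and the CPU threshold are example assumptions, and live_cpu_waiter blocks until the threshold is met.

import ray

from deltacat.utils.ray_utils.runtime import (
    current_node_resources,
    live_cpu_waiter,
)

ray.init()  # or ray.init(address="auto") against an existing cluster

# Block until the cluster reports at least 2 live CPUs (arbitrary example
# threshold), polling every half second, then inspect this node's resources.
live_cpu_waiter(min_live_cpus=2)
resources = current_node_resources()
print(resources.get("CPU"), resources.get("memory"))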