deltacat 1.1.36__py3-none-any.whl → 2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deltacat/__init__.py +42 -3
- deltacat/annotations.py +36 -0
- deltacat/api.py +168 -0
- deltacat/aws/s3u.py +4 -4
- deltacat/benchmarking/benchmark_engine.py +82 -0
- deltacat/benchmarking/benchmark_report.py +86 -0
- deltacat/benchmarking/benchmark_suite.py +11 -0
- deltacat/benchmarking/conftest.py +21 -0
- deltacat/benchmarking/data/random_row_generator.py +94 -0
- deltacat/benchmarking/data/row_generator.py +10 -0
- deltacat/benchmarking/test_benchmark_pipeline.py +106 -0
- deltacat/catalog/__init__.py +14 -0
- deltacat/catalog/delegate.py +199 -106
- deltacat/catalog/iceberg/__init__.py +4 -0
- deltacat/catalog/iceberg/iceberg_catalog_config.py +26 -0
- deltacat/catalog/iceberg/impl.py +368 -0
- deltacat/catalog/iceberg/overrides.py +74 -0
- deltacat/catalog/interface.py +273 -76
- deltacat/catalog/main/impl.py +720 -0
- deltacat/catalog/model/catalog.py +227 -20
- deltacat/catalog/model/properties.py +116 -0
- deltacat/catalog/model/table_definition.py +32 -1
- deltacat/compute/compactor/model/compaction_session_audit_info.py +7 -3
- deltacat/compute/compactor/model/delta_annotated.py +3 -3
- deltacat/compute/compactor/model/delta_file_envelope.py +3 -1
- deltacat/compute/compactor/model/delta_file_locator.py +3 -1
- deltacat/compute/compactor/model/round_completion_info.py +5 -5
- deltacat/compute/compactor/model/table_object_store.py +3 -2
- deltacat/compute/compactor/repartition_session.py +1 -1
- deltacat/compute/compactor/steps/dedupe.py +11 -4
- deltacat/compute/compactor/steps/hash_bucket.py +1 -1
- deltacat/compute/compactor/steps/materialize.py +6 -2
- deltacat/compute/compactor/utils/io.py +1 -1
- deltacat/compute/compactor/utils/sort_key.py +9 -2
- deltacat/compute/compactor_v2/compaction_session.py +5 -9
- deltacat/compute/compactor_v2/constants.py +1 -30
- deltacat/compute/compactor_v2/deletes/utils.py +3 -3
- deltacat/compute/compactor_v2/model/merge_input.py +1 -7
- deltacat/compute/compactor_v2/private/compaction_utils.py +5 -6
- deltacat/compute/compactor_v2/steps/merge.py +17 -126
- deltacat/compute/compactor_v2/utils/content_type_params.py +0 -17
- deltacat/compute/compactor_v2/utils/dedupe.py +1 -1
- deltacat/compute/compactor_v2/utils/io.py +1 -1
- deltacat/compute/compactor_v2/utils/merge.py +0 -1
- deltacat/compute/compactor_v2/utils/primary_key_index.py +3 -15
- deltacat/compute/compactor_v2/utils/task_options.py +23 -43
- deltacat/compute/converter/constants.py +4 -0
- deltacat/compute/converter/converter_session.py +143 -0
- deltacat/compute/converter/model/convert_input.py +69 -0
- deltacat/compute/converter/model/convert_input_files.py +61 -0
- deltacat/compute/converter/model/converter_session_params.py +99 -0
- deltacat/compute/converter/pyiceberg/__init__.py +0 -0
- deltacat/compute/converter/pyiceberg/catalog.py +75 -0
- deltacat/compute/converter/pyiceberg/overrides.py +135 -0
- deltacat/compute/converter/pyiceberg/update_snapshot_overrides.py +251 -0
- deltacat/compute/converter/steps/__init__.py +0 -0
- deltacat/compute/converter/steps/convert.py +211 -0
- deltacat/compute/converter/steps/dedupe.py +60 -0
- deltacat/compute/converter/utils/__init__.py +0 -0
- deltacat/compute/converter/utils/convert_task_options.py +88 -0
- deltacat/compute/converter/utils/converter_session_utils.py +109 -0
- deltacat/compute/converter/utils/iceberg_columns.py +82 -0
- deltacat/compute/converter/utils/io.py +43 -0
- deltacat/compute/converter/utils/s3u.py +133 -0
- deltacat/compute/resource_estimation/delta.py +1 -19
- deltacat/constants.py +47 -1
- deltacat/env.py +51 -0
- deltacat/examples/__init__.py +0 -0
- deltacat/examples/basic_logging.py +101 -0
- deltacat/examples/common/__init__.py +0 -0
- deltacat/examples/common/fixtures.py +15 -0
- deltacat/examples/hello_world.py +27 -0
- deltacat/examples/iceberg/__init__.py +0 -0
- deltacat/examples/iceberg/iceberg_bucket_writer.py +139 -0
- deltacat/examples/iceberg/iceberg_reader.py +149 -0
- deltacat/exceptions.py +51 -9
- deltacat/logs.py +4 -1
- deltacat/storage/__init__.py +118 -28
- deltacat/storage/iceberg/__init__.py +0 -0
- deltacat/storage/iceberg/iceberg_scan_planner.py +28 -0
- deltacat/storage/iceberg/impl.py +737 -0
- deltacat/storage/iceberg/model.py +709 -0
- deltacat/storage/interface.py +217 -134
- deltacat/storage/main/__init__.py +0 -0
- deltacat/storage/main/impl.py +2077 -0
- deltacat/storage/model/delta.py +118 -71
- deltacat/storage/model/interop.py +24 -0
- deltacat/storage/model/list_result.py +8 -0
- deltacat/storage/model/locator.py +93 -3
- deltacat/{aws/redshift → storage}/model/manifest.py +122 -98
- deltacat/storage/model/metafile.py +1316 -0
- deltacat/storage/model/namespace.py +34 -18
- deltacat/storage/model/partition.py +362 -37
- deltacat/storage/model/scan/__init__.py +0 -0
- deltacat/storage/model/scan/push_down.py +19 -0
- deltacat/storage/model/scan/scan_plan.py +10 -0
- deltacat/storage/model/scan/scan_task.py +34 -0
- deltacat/storage/model/schema.py +892 -0
- deltacat/storage/model/shard.py +47 -0
- deltacat/storage/model/sort_key.py +170 -13
- deltacat/storage/model/stream.py +208 -80
- deltacat/storage/model/table.py +123 -29
- deltacat/storage/model/table_version.py +322 -46
- deltacat/storage/model/transaction.py +757 -0
- deltacat/storage/model/transform.py +198 -61
- deltacat/storage/model/types.py +111 -13
- deltacat/storage/rivulet/__init__.py +11 -0
- deltacat/storage/rivulet/arrow/__init__.py +0 -0
- deltacat/storage/rivulet/arrow/serializer.py +75 -0
- deltacat/storage/rivulet/dataset.py +744 -0
- deltacat/storage/rivulet/dataset_executor.py +87 -0
- deltacat/storage/rivulet/feather/__init__.py +5 -0
- deltacat/storage/rivulet/feather/file_reader.py +136 -0
- deltacat/storage/rivulet/feather/serializer.py +35 -0
- deltacat/storage/rivulet/fs/__init__.py +0 -0
- deltacat/storage/rivulet/fs/file_provider.py +105 -0
- deltacat/storage/rivulet/fs/file_store.py +130 -0
- deltacat/storage/rivulet/fs/input_file.py +76 -0
- deltacat/storage/rivulet/fs/output_file.py +86 -0
- deltacat/storage/rivulet/logical_plan.py +105 -0
- deltacat/storage/rivulet/metastore/__init__.py +0 -0
- deltacat/storage/rivulet/metastore/delta.py +190 -0
- deltacat/storage/rivulet/metastore/json_sst.py +105 -0
- deltacat/storage/rivulet/metastore/sst.py +82 -0
- deltacat/storage/rivulet/metastore/sst_interval_tree.py +260 -0
- deltacat/storage/rivulet/mvp/Table.py +101 -0
- deltacat/storage/rivulet/mvp/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/__init__.py +5 -0
- deltacat/storage/rivulet/parquet/data_reader.py +0 -0
- deltacat/storage/rivulet/parquet/file_reader.py +127 -0
- deltacat/storage/rivulet/parquet/serializer.py +37 -0
- deltacat/storage/rivulet/reader/__init__.py +0 -0
- deltacat/storage/rivulet/reader/block_scanner.py +378 -0
- deltacat/storage/rivulet/reader/data_reader.py +136 -0
- deltacat/storage/rivulet/reader/data_scan.py +63 -0
- deltacat/storage/rivulet/reader/dataset_metastore.py +178 -0
- deltacat/storage/rivulet/reader/dataset_reader.py +156 -0
- deltacat/storage/rivulet/reader/pyarrow_data_reader.py +121 -0
- deltacat/storage/rivulet/reader/query_expression.py +99 -0
- deltacat/storage/rivulet/reader/reader_type_registrar.py +84 -0
- deltacat/storage/rivulet/schema/__init__.py +0 -0
- deltacat/storage/rivulet/schema/datatype.py +128 -0
- deltacat/storage/rivulet/schema/schema.py +251 -0
- deltacat/storage/rivulet/serializer.py +40 -0
- deltacat/storage/rivulet/serializer_factory.py +42 -0
- deltacat/storage/rivulet/writer/__init__.py +0 -0
- deltacat/storage/rivulet/writer/dataset_writer.py +29 -0
- deltacat/storage/rivulet/writer/memtable_dataset_writer.py +294 -0
- deltacat/tests/_io/__init__.py +1 -0
- deltacat/tests/catalog/test_catalogs.py +324 -0
- deltacat/tests/catalog/test_default_catalog_impl.py +16 -8
- deltacat/tests/compute/compact_partition_multiple_rounds_test_cases.py +21 -21
- deltacat/tests/compute/compact_partition_rebase_test_cases.py +6 -6
- deltacat/tests/compute/compact_partition_rebase_then_incremental_test_cases.py +56 -56
- deltacat/tests/compute/compact_partition_test_cases.py +19 -53
- deltacat/tests/compute/compactor/steps/test_repartition.py +2 -2
- deltacat/tests/compute/compactor/utils/test_io.py +6 -8
- deltacat/tests/compute/compactor_v2/test_compaction_session.py +0 -466
- deltacat/tests/compute/compactor_v2/utils/test_task_options.py +1 -273
- deltacat/tests/compute/conftest.py +75 -0
- deltacat/tests/compute/converter/__init__.py +0 -0
- deltacat/tests/compute/converter/conftest.py +80 -0
- deltacat/tests/compute/converter/test_convert_session.py +478 -0
- deltacat/tests/compute/converter/utils.py +123 -0
- deltacat/tests/compute/resource_estimation/test_delta.py +0 -16
- deltacat/tests/compute/test_compact_partition_incremental.py +2 -42
- deltacat/tests/compute/test_compact_partition_multiple_rounds.py +5 -46
- deltacat/tests/compute/test_compact_partition_params.py +3 -3
- deltacat/tests/compute/test_compact_partition_rebase.py +1 -46
- deltacat/tests/compute/test_compact_partition_rebase_then_incremental.py +5 -46
- deltacat/tests/compute/test_util_common.py +19 -12
- deltacat/tests/compute/test_util_create_table_deltas_repo.py +13 -22
- deltacat/tests/local_deltacat_storage/__init__.py +76 -103
- deltacat/tests/storage/__init__.py +0 -0
- deltacat/tests/storage/conftest.py +25 -0
- deltacat/tests/storage/main/__init__.py +0 -0
- deltacat/tests/storage/main/test_main_storage.py +1399 -0
- deltacat/tests/storage/model/__init__.py +0 -0
- deltacat/tests/storage/model/test_delete_parameters.py +21 -0
- deltacat/tests/storage/model/test_metafile_io.py +2535 -0
- deltacat/tests/storage/model/test_schema.py +308 -0
- deltacat/tests/storage/model/test_shard.py +22 -0
- deltacat/tests/storage/model/test_table_version.py +110 -0
- deltacat/tests/storage/model/test_transaction.py +308 -0
- deltacat/tests/storage/rivulet/__init__.py +0 -0
- deltacat/tests/storage/rivulet/conftest.py +149 -0
- deltacat/tests/storage/rivulet/fs/__init__.py +0 -0
- deltacat/tests/storage/rivulet/fs/test_file_location_provider.py +93 -0
- deltacat/tests/storage/rivulet/schema/__init__.py +0 -0
- deltacat/tests/storage/rivulet/schema/test_schema.py +241 -0
- deltacat/tests/storage/rivulet/test_dataset.py +406 -0
- deltacat/tests/storage/rivulet/test_manifest.py +67 -0
- deltacat/tests/storage/rivulet/test_sst_interval_tree.py +232 -0
- deltacat/tests/storage/rivulet/test_utils.py +122 -0
- deltacat/tests/storage/rivulet/writer/__init__.py +0 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_write_then_read.py +341 -0
- deltacat/tests/storage/rivulet/writer/test_dataset_writer.py +79 -0
- deltacat/tests/storage/rivulet/writer/test_memtable_dataset_writer.py +75 -0
- deltacat/tests/test_deltacat_api.py +39 -0
- deltacat/tests/test_utils/filesystem.py +14 -0
- deltacat/tests/test_utils/message_pack_utils.py +54 -0
- deltacat/tests/test_utils/pyarrow.py +8 -15
- deltacat/tests/test_utils/storage.py +266 -3
- deltacat/tests/utils/test_daft.py +3 -3
- deltacat/tests/utils/test_pyarrow.py +0 -432
- deltacat/types/partial_download.py +1 -1
- deltacat/types/tables.py +1 -1
- deltacat/utils/export.py +59 -0
- deltacat/utils/filesystem.py +320 -0
- deltacat/utils/metafile_locator.py +73 -0
- deltacat/utils/pyarrow.py +36 -183
- deltacat-2.0.dist-info/METADATA +65 -0
- deltacat-2.0.dist-info/RECORD +347 -0
- deltacat/aws/redshift/__init__.py +0 -19
- deltacat/catalog/default_catalog_impl/__init__.py +0 -369
- deltacat/io/dataset.py +0 -73
- deltacat/io/read_api.py +0 -143
- deltacat/storage/model/delete_parameters.py +0 -40
- deltacat/storage/model/partition_spec.py +0 -71
- deltacat/tests/compute/compactor_v2/utils/test_content_type_params.py +0 -253
- deltacat/tests/compute/compactor_v2/utils/test_primary_key_index.py +0 -45
- deltacat-1.1.36.dist-info/METADATA +0 -64
- deltacat-1.1.36.dist-info/RECORD +0 -219
- /deltacat/{aws/redshift/model → benchmarking/data}/__init__.py +0 -0
- /deltacat/{io/aws → catalog/main}/__init__.py +0 -0
- /deltacat/{io/aws/redshift → compute/converter}/__init__.py +0 -0
- /deltacat/{tests/io → compute/converter/model}/__init__.py +0 -0
- /deltacat/tests/{io → _io}/test_cloudpickle_bug_fix.py +0 -0
- /deltacat/tests/{io → _io}/test_file_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_memcached_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_ray_plasma_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_redis_object_store.py +0 -0
- /deltacat/tests/{io → _io}/test_s3_object_store.py +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/LICENSE +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/WHEEL +0 -0
- {deltacat-1.1.36.dist-info → deltacat-2.0.dist-info}/top_level.txt +0 -0
deltacat/utils/filesystem.py ADDED
@@ -0,0 +1,320 @@
+from __future__ import annotations
+
+import re
+from typing import Optional, Tuple, Union, List
+
+import sys
+import urllib
+import pathlib
+
+import pyarrow
+import pyarrow as pa
+from pyarrow.fs import (
+    _resolve_filesystem_and_path,
+    FileSelector,
+    FileInfo,
+    FileType,
+    FileSystem,
+    FSSpecHandler,
+    PyFileSystem,
+)
+
+_LOCAL_SCHEME = "local"
+
+
+def resolve_paths_and_filesystem(
+    paths: Union[str, List[str]],
+    filesystem: pyarrow.fs.FileSystem = None,
+) -> Tuple[List[str], pyarrow.fs.FileSystem]:
+    """
+    Resolves and normalizes all provided paths, infers a filesystem from the
+    paths or validates the provided filesystem against the paths and ensures
+    that all paths use the same filesystem.
+
+    Args:
+        paths: A single file/directory path or a list of file/directory paths.
+            A list of paths can contain both files and directories.
+        filesystem: The filesystem implementation that should be used for
+            reading these files. If None, a filesystem will be inferred. If not
+            None, the provided filesystem will still be validated against all
+            filesystems inferred from the provided paths to ensure
+            compatibility.
+    """
+    if isinstance(paths, str):
+        paths = [paths]
+    if isinstance(paths, pathlib.Path):
+        paths = [str(paths)]
+    elif not isinstance(paths, list) or any(not isinstance(p, str) for p in paths):
+        raise ValueError(
+            "Expected `paths` to be a `str`, `pathlib.Path`, or `list[str]`, but got "
+            f"`{paths}`."
+        )
+    elif len(paths) == 0:
+        raise ValueError("Must provide at least one path.")
+
+    need_unwrap_path_protocol = True
+    if filesystem and not isinstance(filesystem, FileSystem):
+        err_msg = (
+            f"The filesystem passed must either conform to "
+            f"pyarrow.fs.FileSystem, or "
+            f"fsspec.spec.AbstractFileSystem. The provided "
+            f"filesystem was: {filesystem}"
+        )
+        try:
+            import fsspec
+            from fsspec.implementations.http import HTTPFileSystem
+        except ModuleNotFoundError:
+            # If filesystem is not a pyarrow filesystem and fsspec isn't
+            # installed, then filesystem is neither a pyarrow filesystem nor
+            # an fsspec filesystem, so we raise a TypeError.
+            raise TypeError(err_msg) from None
+        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
+            raise TypeError(err_msg) from None
+        if isinstance(filesystem, HTTPFileSystem):
+            # If filesystem is fsspec HTTPFileSystem, the protocol/scheme of paths
+            # should not be unwrapped/removed, because HTTPFileSystem expects full file
+            # paths including protocol/scheme. This is different behavior compared to
+            # file systems implementation in pyarrow.fs.FileSystem.
+            need_unwrap_path_protocol = False
+
+        filesystem = PyFileSystem(FSSpecHandler(filesystem))
+
+    resolved_paths = []
+    for path in paths:
+        path = _resolve_custom_scheme(path)
+        try:
+            resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
+                path, filesystem
+            )
+        except pa.lib.ArrowInvalid as e:
+            if "Cannot parse URI" in str(e):
+                resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
+                    _encode_url(path), filesystem
+                )
+                resolved_path = _decode_url(resolved_path)
+            elif "Unrecognized filesystem type in URI" in str(e):
+                scheme = urllib.parse.urlparse(path, allow_fragments=False).scheme
+                if scheme in ["http", "https"]:
+                    # If scheme of path is HTTP and filesystem is not resolved,
+                    # try to use fsspec HTTPFileSystem. This expects fsspec is
+                    # installed.
+                    try:
+                        from fsspec.implementations.http import HTTPFileSystem
+                    except ModuleNotFoundError:
+                        raise ImportError(
+                            "Please install fsspec to read files from HTTP."
+                        ) from None
+
+                    resolved_filesystem = PyFileSystem(FSSpecHandler(HTTPFileSystem()))
+                    resolved_path = path
+                    need_unwrap_path_protocol = False
+                else:
+                    raise
+            else:
+                raise
+        if filesystem is None:
+            filesystem = resolved_filesystem
+        elif need_unwrap_path_protocol:
+            resolved_path = _unwrap_protocol(resolved_path)
+        resolved_path = filesystem.normalize_path(resolved_path)
+        resolved_paths.append(resolved_path)
+
+    return resolved_paths, filesystem
+
+
+def resolve_path_and_filesystem(
+    path: str,
+    filesystem: Optional[pyarrow.fs.FileSystem] = None,
+) -> Tuple[str, pyarrow.fs.FileSystem]:
+    """
+    Resolves and normalizes the provided path, infers a filesystem from the
+    path or validates the provided filesystem against the path.
+
+    Args:
+        path: A single file/directory path.
+        filesystem: The filesystem implementation that should be used for
+            reading these files. If None, a filesystem will be inferred. If not
+            None, the provided filesystem will still be validated against all
+            filesystems inferred from the provided paths to ensure
+            compatibility.
+    """
+    paths, filesystem = resolve_paths_and_filesystem(
+        paths=path,
+        filesystem=filesystem,
+    )
+    assert len(paths) == 1, len(paths)
+    return paths[0], filesystem
+
+
+def list_directory(
+    path: str,
+    filesystem: pyarrow.fs.FileSystem,
+    exclude_prefixes: Optional[List[str]] = None,
+    ignore_missing_path: bool = False,
+    recursive: bool = False,
+) -> List[Tuple[str, int]]:
+    """
+    Expand the provided directory path to a list of file paths.
+
+    Args:
+        path: The directory path to expand.
+        filesystem: The filesystem implementation that should be used for
+            reading these files.
+        exclude_prefixes: The file relative path prefixes that should be
+            excluded from the returned file set. Default excluded prefixes are
+            "." and "_".
+        recursive: Whether to expand subdirectories or not.
+
+    Returns:
+        An iterator of (file_path, file_size) tuples.
+    """
+    if exclude_prefixes is None:
+        exclude_prefixes = [".", "_"]
+
+    selector = FileSelector(
+        base_dir=path,
+        recursive=recursive,
+        allow_not_found=ignore_missing_path,
+    )
+    try:
+        files = filesystem.get_file_info(selector)
+    except OSError as e:
+        if isinstance(e, FileNotFoundError):
+            files = []
+        else:
+            _handle_read_os_error(e, path)
+    base_path = selector.base_dir
+    out = []
+    for file_ in files:
+        file_path = file_.path
+        if not file_path.startswith(base_path):
+            continue
+        relative = file_path[len(base_path) :]
+        if any(relative.startswith(prefix) for prefix in exclude_prefixes):
+            continue
+        out.append((file_path, file_.size))
+    # We sort the paths to guarantee a stable order.
+    return sorted(out)
+
+
+def get_file_info(
+    path: str,
+    filesystem: pyarrow.fs.FileSystem,
+    ignore_missing_path: bool = False,
+) -> FileInfo:
+    """Get the file info for the provided path."""
+    try:
+        file_info = filesystem.get_file_info(path)
+    except OSError as e:
+        _handle_read_os_error(e, path)
+    if file_info.type == FileType.NotFound and not ignore_missing_path:
+        raise FileNotFoundError(path)
+
+    return file_info
+
+
+def _handle_read_os_error(
+    error: OSError,
+    paths: Union[str, List[str]],
+) -> str:
+    # NOTE: this is not comprehensive yet, and should be extended as more errors arise.
+    # NOTE: The latter patterns are raised in Arrow 10+, while the former is raised in
+    # Arrow < 10.
+    aws_error_pattern = (
+        r"^(?:(.*)AWS Error \[code \d+\]: No response body\.(.*))|"
+        r"(?:(.*)AWS Error UNKNOWN \(HTTP status 400\) during HeadObject operation: "
+        r"No response body\.(.*))|"
+        r"(?:(.*)AWS Error ACCESS_DENIED during HeadObject operation: No response "
+        r"body\.(.*))$"
+    )
+    if re.match(aws_error_pattern, str(error)):
+        # Specially handle AWS error when reading files, to give a clearer error
+        # message to avoid confusing users. The real issue is most likely that the AWS
+        # S3 file credentials have not been properly configured yet.
+        if isinstance(paths, str):
+            # Quote to highlight single file path in error message for better
+            # readability. List of file paths will be shown up as ['foo', 'boo'],
+            # so only quote single file path here.
+            paths = f'"{paths}"'
+        raise OSError(
+            (
+                f"Failing to read AWS S3 file(s): {paths}. "
+                "Please check that file exists and has properly configured access. "
+                "You can also run AWS CLI command to get more detailed error message "
+                "(e.g., aws s3 ls <file-name>). "
+                "See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/s3/index.html "  # noqa
+                "for more information."
+            )
+        )
+    else:
+        raise error
+
+
+def _is_local_windows_path(path: str) -> bool:
+    """Determines if path is a Windows file-system location."""
+    if sys.platform != "win32":
+        return False
+
+    if len(path) >= 1 and path[0] == "\\":
+        return True
+    if (
+        len(path) >= 3
+        and path[1] == ":"
+        and (path[2] == "/" or path[2] == "\\")
+        and path[0].isalpha()
+    ):
+        return True
+    return False
+
+
+def _unwrap_protocol(path):
+    """
+    Slice off any protocol prefixes on path.
+    """
+    if sys.platform == "win32" and _is_local_windows_path(path):
+        # Represent as posix path such that downstream functions properly handle it.
+        # This is executed when 'file://' is NOT included in the path.
+        return pathlib.Path(path).as_posix()
+
+    parsed = urllib.parse.urlparse(path, allow_fragments=False)  # support '#' in path
+    query = "?" + parsed.query if parsed.query else ""  # support '?' in path
+    netloc = parsed.netloc
+    if parsed.scheme == "s3" and "@" in parsed.netloc:
+        # If the path contains an @, it is assumed to be an anonymous
+        # credentialed path, and we need to strip off the credentials.
+        netloc = parsed.netloc.split("@")[-1]
+
+    parsed_path = parsed.path
+    # urlparse prepends the path with a '/'. This does not work on Windows
+    # so if this is the case strip the leading slash.
+    if (
+        sys.platform == "win32"
+        and not netloc
+        and len(parsed_path) >= 3
+        and parsed_path[0] == "/"  # The problematic leading slash
+        and parsed_path[1].isalpha()  # Ensure it is a drive letter.
+        and parsed_path[2:4] in (":", ":/")
+    ):
+        parsed_path = parsed_path[1:]
+
+    return netloc + parsed_path + query
+
+
+def _encode_url(path):
+    return urllib.parse.quote(path, safe="/:")
+
+
+def _decode_url(path):
+    return urllib.parse.unquote(path)
+
+
+def _resolve_custom_scheme(path: str) -> str:
+    """Returns the resolved path if the given path follows a Ray-specific custom
+    scheme. Othewise, returns the path unchanged.
+
+    The supported custom schemes are: "local", "example".
+    """
+    parsed_uri = urllib.parse.urlparse(path)
+    if parsed_uri.scheme == _LOCAL_SCHEME:
+        path = parsed_uri.netloc + parsed_uri.path
+    return path
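For orientation, a minimal usage sketch of the path-resolution helpers added above. The local directory path is an illustrative assumption, not part of the diff; any URI scheme pyarrow can resolve (for example s3://) should work the same way:

# Sketch only: assumes a readable local directory at /tmp/deltacat-data.
from deltacat.utils.filesystem import list_directory, resolve_path_and_filesystem

path, filesystem = resolve_path_and_filesystem("/tmp/deltacat-data")
for file_path, file_size in list_directory(path, filesystem, recursive=True):
    print(file_path, file_size)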
deltacat/utils/metafile_locator.py ADDED
@@ -0,0 +1,73 @@
+import posixpath
+import pyarrow.fs
+
+from deltacat.storage.model.partition import PartitionLocator
+from deltacat.utils.filesystem import resolve_path_and_filesystem
+
+"""
+Helper functions to work with deltacat metadata paths.
+TODO: Replace with direct calls to Deltacat storage interface.
+"""
+
+
+def _find_first_child_with_rev(
+    parent_path: str, filesystem: pyarrow.fs.FileSystem
+) -> str:
+    """
+    Walks the filesystem to find the first child directory with a `rev/` folder.
+
+    This is a temporary solution to locate the first Namespace and Table directories.
+    The Deltacat Storage interface will provide a more robust way to locate these directories.
+
+    param: parent_path: The parent directory to search for a child with a `rev/` folder.
+    param: filesystem: The filesystem to search for the child directory.
+    returns: The name of the first child directory with a `rev/` folder.
+    """
+    children = filesystem.get_file_info(
+        pyarrow.fs.FileSelector(parent_path, allow_not_found=True)
+    )
+    for child in children:
+        if child.type == pyarrow.fs.FileType.Directory:
+            rev_path = posixpath.join(child.path, "rev")
+            if filesystem.get_file_info(rev_path).type == pyarrow.fs.FileType.Directory:
+                return child.base_name
+    raise ValueError(f"No directory with 'rev/' found under {parent_path}")
+
+
+def _find_table_path(root_path: str, filesystem: pyarrow.fs.FileSystem):
+    """
+    Finds a path with structure: root/namespace_id/table_id
+    Uses _find_first_child_with_rev to determine the namespace and table ids.
+
+    param: root_path: The root directory to search for the namespace and table directories.
+    param: filesystem: The filesystem to search for the namespace and table directories.
+    returns: The path to the table directory.
+    raises: ValueError if the namespace or table directories are not found.
+    """
+    try:
+        # Find Namespace (first directory under root with rev/)
+        namespace_id = _find_first_child_with_rev(root_path, filesystem)
+        namespace_path = posixpath.join(root_path, namespace_id)
+
+        # Find Table (first directory under namespace with rev/)
+        table_id = _find_first_child_with_rev(namespace_path, filesystem)
+        return posixpath.join(namespace_path, table_id)
+
+    except ValueError as e:
+        raise ValueError(f"Failed to locate Namespace or Table: {e}") from e
+
+
+def _find_partition_path(root_path: str, locator: PartitionLocator) -> str:
+    """
+    Finds the path to the partition directory for the specified locator.
+
+    param: root_uri: The root URI of the dataset.
+    param: locator: The DeltaLocator for the delta.
+    returns: The path to the delta directory.
+    """
+    root_path, filesystem = resolve_path_and_filesystem(root_path)
+    return posixpath.join(
+        _find_table_path(root_path, filesystem),
+        locator.table_version,
+        locator.stream_id,
+    )
deltacat/utils/pyarrow.py CHANGED
@@ -1,7 +1,6 @@
 # Allow classes to use self-referencing Type hints in Python 3.7.
 from __future__ import annotations

-import copy
 import bz2
 import gzip
 import io
@@ -47,19 +46,6 @@ logger = logs.configure_deltacat_logger(logging.getLogger(__name__))

 RAISE_ON_EMPTY_CSV_KWARG = "raise_on_empty_csv"
 READER_TYPE_KWARG = "reader_type"
-OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG = "override_content_encoding_for_parquet"
-
-"""
-By default, round decimal values using half_to_even round mode when
-rescaling a decimal to the given scale and precision in the schema would cause
-data loss. Setting any non null value of this argument will result
-in an error instead.
-"""
-RAISE_ON_DECIMAL_OVERFLOW = "raise_on_decimal_overflow"
-# Note the maximum from https://arrow.apache.org/docs/python/generated/pyarrow.Decimal256Type.html#pyarrow.Decimal256Type
-DECIMAL256_DEFAULT_SCALE = 38
-DECIMAL256_MAX_PRECISION = 76
-MAX_INT_BYTES = 2147483646


 def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Schema:
@@ -78,164 +64,45 @@ def _filter_schema_for_columns(schema: pa.Schema, columns: List[str]) -> pa.Sche
     return target_schema


-def _extract_arrow_schema_from_read_csv_kwargs(kwargs):
-    schema = None
-    if (
-        "convert_options" in kwargs
-        and kwargs["convert_options"].column_types is not None
-    ):
-        schema = kwargs["convert_options"].column_types
-        if not isinstance(schema, pa.Schema):
-            schema = pa.schema(schema)
-        if kwargs["convert_options"].include_columns:
-            schema = _filter_schema_for_columns(
-                schema, kwargs["convert_options"].include_columns
-            )
-    elif (
-        kwargs.get("read_options") is not None
-        and kwargs["read_options"].column_names
-    ):
-        schema = _filter_schema_for_columns(
-            schema, kwargs["read_options"].column_names
-        )
-    else:
-        logger.debug(
-            "Schema not specified in the kwargs."
-            " Hence, schema could not be inferred from the empty CSV."
-        )
-
-    return schema
-
-
-def _new_schema_with_replaced_fields(
-    schema: pa.Schema, field_to_replace: Callable[[pa.Field], Optional[pa.Field]]
-) -> pa.Schema:
-    if schema is None:
-        return None
-
-    new_schema_fields = []
-    for field in schema:
-        new_field = field_to_replace(field)
-        if new_field is not None:
-            new_schema_fields.append(new_field)
-        else:
-            new_schema_fields.append(field)
-
-    return pa.schema(new_schema_fields, metadata=schema.metadata)
-
-
-def _read_csv_rounding_decimal_columns_to_fit_scale(
-    schema: pa.Schema, reader_args: List[Any], reader_kwargs: Dict[str, Any]
-) -> pa.Table:
-    # Note: We read decimals as strings first because CSV
-    # conversion to decimal256 isn't implemented as of pyarrow==12.0.1
-    new_schema = _new_schema_with_replaced_fields(
-        schema,
-        lambda fld: (
-            pa.field(fld.name, pa.string(), metadata=fld.metadata)
-            if pa.types.is_decimal128(fld.type) or pa.types.is_decimal256(fld.type)
-            else None
-        ),
-    )
-    new_kwargs = sanitize_kwargs_by_supported_kwargs(
-        ["read_options", "parse_options", "convert_options", "memory_pool"],
-        reader_kwargs,
-    )
-    # Creating a shallow copy for efficiency
-    new_convert_options = copy.copy(new_kwargs["convert_options"])
-    new_convert_options.column_types = new_schema
-    new_reader_kwargs = {**new_kwargs, "convert_options": new_convert_options}
-    arrow_table = pacsv.read_csv(*reader_args, **new_reader_kwargs)
-
-    for column_index, field in enumerate(schema):
-        if pa.types.is_decimal128(field.type) or pa.types.is_decimal256(field.type):
-            column_array = arrow_table[field.name]
-            # We always cast to decimal256 to accomodate fixed scale of 38
-            cast_to_type = pa.decimal256(
-                DECIMAL256_MAX_PRECISION, DECIMAL256_DEFAULT_SCALE
-            )
-            casted_decimal_array = pc.cast(column_array, cast_to_type)
-            # Note that scale can be negative
-            rounded_column_array = pc.round(
-                casted_decimal_array, ndigits=field.type.scale
-            )
-            final_decimal_array = pc.cast(rounded_column_array, field.type)
-            arrow_table = arrow_table.set_column(
-                column_index,
-                field,
-                final_decimal_array,
-            )
-            logger.debug(
-                f"Rounded decimal column: {field.name} to {field.type.scale} scale and"
-                f" {field.type.precision} precision"
-            )
-
-    return arrow_table
-
-
-def pyarrow_read_csv_default(*args, **kwargs):
-    new_kwargs = sanitize_kwargs_by_supported_kwargs(
-        ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
-    )
-
+def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
     try:
+        new_kwargs = sanitize_kwargs_by_supported_kwargs(
+            ["read_options", "parse_options", "convert_options", "memory_pool"], kwargs
+        )
         return pacsv.read_csv(*args, **new_kwargs)
     except pa.lib.ArrowInvalid as e:
[16 deleted lines (old 184-199) were not captured in this rendering]
-            "Rescaling Decimal" in error_str
-            and "value would cause data loss" in error_str
+        if e.__str__() == "Empty CSV file" and not kwargs.get(RAISE_ON_EMPTY_CSV_KWARG):
+            schema = None
+            if (
+                "convert_options" in kwargs
+                and kwargs["convert_options"].column_types is not None
+            ):
+                schema = kwargs["convert_options"].column_types
+                if not isinstance(schema, pa.Schema):
+                    schema = pa.schema(schema)
+                if kwargs["convert_options"].include_columns:
+                    schema = _filter_schema_for_columns(
+                        schema, kwargs["convert_options"].include_columns
+                    )
+            elif (
+                kwargs.get("read_options") is not None
+                and kwargs["read_options"].column_names
             ):
-
-
-            if isinstance(args[0], io.IOBase) and args[0].seekable():
-                logger.debug(f"Seeking to the beginning of the file {args[0]}")
-                args[0].seek(0)
-            return _read_csv_rounding_decimal_columns_to_fit_scale(
-                schema=schema, reader_args=args, reader_kwargs=kwargs
+                schema = _filter_schema_for_columns(
+                    schema, kwargs["read_options"].column_names
                )
+
             else:
                 logger.debug(
-                "Schema
-                "Hence,
+                    "Schema not specified in the kwargs."
+                    " Hence, schema could not be inferred from the empty CSV."
                 )

+            logger.debug(f"Read CSV empty schema being used: {schema}")
+            return pa.Table.from_pylist([], schema=schema)
         raise e


-def pyarrow_read_csv(*args, **kwargs) -> pa.Table:
-    schema = _extract_arrow_schema_from_read_csv_kwargs(kwargs)
-
-    # CSV conversion to decimal256 isn't supported as of pyarrow=12.0.1
-    # Below ensures decimal256 is casted properly.
-    schema_includes_decimal256 = (
-        (True if any([pa.types.is_decimal256(x.type) for x in schema]) else False)
-        if schema is not None
-        else None
-    )
-    if schema_includes_decimal256 and not kwargs.get(RAISE_ON_DECIMAL_OVERFLOW):
-        # falling back to expensive method of reading CSV
-        return _read_csv_rounding_decimal_columns_to_fit_scale(
-            schema, reader_args=args, reader_kwargs=kwargs
-        )
-    else:
-        return pyarrow_read_csv_default(*args, **kwargs)
-
-
 CONTENT_TYPE_TO_PA_READ_FUNC: Dict[str, Callable] = {
     ContentType.UNESCAPED_TSV.value: pyarrow_read_csv,
     ContentType.TSV.value: pyarrow_read_csv,
@@ -544,15 +411,6 @@ def s3_file_to_table(
     if pa_read_func_kwargs_provider is not None:
         kwargs = pa_read_func_kwargs_provider(content_type, kwargs)

-    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
-        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
-        if content_type == ContentType.PARQUET.value:
-            logger.debug(
-                f"Overriding {s3_url} content encoding from {content_encoding} "
-                f"to {new_content_encoding}"
-            )
-            content_encoding = new_content_encoding
-
     if (
         content_type == ContentType.PARQUET.value
         and content_encoding == ContentEncoding.IDENTITY.value
@@ -582,8 +440,8 @@ def s3_file_to_table(
         **s3_client_kwargs,
     )

[2 deleted lines (old 585-586) were not captured in this rendering]
+    if READER_TYPE_KWARG in kwargs:
+        kwargs.pop(READER_TYPE_KWARG)

     filesystem = io
     if s3_url.startswith("s3://"):
@@ -617,18 +475,7 @@ def s3_file_to_parquet(
         f"Reading {s3_url} to PyArrow ParquetFile. "
         f"Content type: {content_type}. Encoding: {content_encoding}"
     )
-    kwargs = {}
-    if pa_read_func_kwargs_provider:
-        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)

-    if OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG in kwargs:
-        new_content_encoding = kwargs.pop(OVERRIDE_CONTENT_ENCODING_FOR_PARQUET_KWARG)
-        if content_type == ContentType.PARQUET.value:
-            logger.debug(
-                f"Overriding {s3_url} content encoding from {content_encoding} "
-                f"to {new_content_encoding}"
-            )
-            content_encoding = new_content_encoding
     if (
         content_type != ContentType.PARQUET.value
         or content_encoding != ContentEncoding.IDENTITY
@@ -641,10 +488,15 @@ def s3_file_to_parquet(
     if s3_client_kwargs is None:
         s3_client_kwargs = {}

+    kwargs = {}
+
     if s3_url.startswith("s3://"):
         s3_file_system = create_s3_file_system(s3_client_kwargs)
         kwargs["filesystem"] = s3_file_system

+    if pa_read_func_kwargs_provider:
+        kwargs = pa_read_func_kwargs_provider(content_type, kwargs)
+
     logger.debug(f"Pre-sanitize kwargs for {s3_url}: {kwargs}")

     kwargs = sanitize_kwargs_to_callable(ParquetFile.__init__, kwargs)
@@ -931,6 +783,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
     TODO: deprecate this function when pyarrow performs proper ChunkedArray -> ChunkedArray casting
     """
     dtype = array.type
+    MAX_BYTES = 2147483646
     max_str_len = None
     if pa.types.is_integer(dtype):
         max_str_len = _int_max_string_len()
@@ -942,7 +795,7 @@ def sliced_string_cast(array: pa.ChunkedArray) -> pa.ChunkedArray:
         max_str_len = _max_decimal256_string_len()

     if max_str_len is not None:
-        max_elems_per_chunk =
+        max_elems_per_chunk = MAX_BYTES // (2 * max_str_len)  # safety factor of 2
         all_chunks = []
         for chunk in array.chunks:
             if len(chunk) < max_elems_per_chunk: