daplapath 2.0.3__tar.gz → 2.0.6__tar.gz

This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: daplapath
-Version: 2.0.3
+Version: 2.0.6
 Summary: A pathlib.Path class for dapla
 License: MIT
 Author: ort
@@ -1,2 +1,3 @@
 from .path import Path
 from .path import PathSeries
+from .path import LocalFileSystem
@@ -14,8 +14,8 @@ import shutil
 from typing import Callable, Any
 import inspect
 import itertools
-import warnings
 
+from fsspec.spec import AbstractFileSystem
 import datetime
 import numpy as np
 import pandas as pd
@@ -23,6 +23,8 @@ import pandas.io.formats.format as fmt
 from pandas.api.types import is_dict_like
 import pyarrow
 import pyarrow.parquet as pq
+import pyarrow.dataset as ds
+
 
 try:
     import gcsfs
@@ -47,15 +49,15 @@ class Config:
     file_system: Callable
 
 
-class LocalFileSystem:
+class LocalFileSystem(AbstractFileSystem):
     """Mimicks GCS's FileSystem but using standard library (os, glob, shutil)."""
 
     @staticmethod
     def glob(
         path: str,
-        recursive: bool = True,
         detail: bool = False,
-        include_hidden: bool = False,
+        recursive: bool = True,
+        include_hidden: bool = True,
         **kwargs,
     ) -> list[dict] | list[str]:
         relevant_paths = glob.iglob(
@@ -67,14 +69,16 @@ class LocalFileSystem:
         with ThreadPoolExecutor() as executor:
             return list(executor.map(get_file_info, relevant_paths))
 
+    @classmethod
+    def ls(cls, path: str, detail: bool = False, **kwargs):
+        return cls().glob(
+            str(pathlib.Path(path) / "**"), detail=detail, recursive=False, **kwargs
+        )
+
     @staticmethod
     def info(path) -> dict[str, Any]:
         return get_file_info(path)
 
-    @staticmethod
-    def isdir(path: str) -> bool:
-        return os.path.isdir(path)
-
     @staticmethod
     def open(path: str, *args, **kwargs) -> io.TextIOWrapper:
         return open(path, *args, **kwargs)
@@ -87,8 +91,12 @@ class LocalFileSystem:
     def mv(source: str, destination, **kwargs) -> str:
         return shutil.move(source, destination, **kwargs)
 
+    @classmethod
+    def cp(cls, source: str, destination, **kwargs) -> str:
+        return cls.cp_file(source, destination, **kwargs)
+
     @staticmethod
-    def cp(source: str, destination, **kwargs) -> str:
+    def cp_file(self, path1, path2, **kwargs):
         os.makedirs(pathlib.Path(destination).parent, exist_ok=True)
         return shutil.copy2(source, destination, **kwargs)
 
@@ -96,6 +104,14 @@ class LocalFileSystem:
     def rm_file(path: str, *args, **kwargs) -> None:
         return os.remove(path, *args, **kwargs)
 
+    @staticmethod
+    def rmdir(path: str, *args, **kwargs) -> None:
+        return shutil.rmtree(path, *args, **kwargs)
+
+    @staticmethod
+    def makedirs(path: str, exist_ok: bool = False) -> None:
+        return os.makedirs(path, exist_ok=exist_ok)
+
 
 class GCSFileSystem(gcsfs.GCSFileSystem):
     def isdir(self, path: str) -> bool:
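
For orientation, a minimal sketch (not part of the diff) of how the reworked LocalFileSystem could be exercised after this release; the paths are invented for illustration:

    # Illustrative sketch only: LocalFileSystem is now exported from the package,
    # subclasses fsspec's AbstractFileSystem, and gains ls/cp_file/rmdir/makedirs.
    from daplapath import LocalFileSystem

    fs = LocalFileSystem()
    fs.makedirs("/tmp/daplapath_demo/sub", exist_ok=True)     # new in 2.0.6, wraps os.makedirs
    listing = fs.ls("/tmp/daplapath_demo")                    # new ls() calls glob() non-recursively
    files = fs.glob("/tmp/daplapath_demo/**", detail=False)   # include_hidden now defaults to True
    fs.rmdir("/tmp/daplapath_demo/sub")                       # new rmdir() delegates to shutil.rmtree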
@@ -110,9 +126,6 @@ else:
     _config = Config(LocalFileSystem)
 
 
-gcsfs.GCSFileSystem.isdir
-
-
 class Tree:
     """Stores text to be printed/displayed in directory tree format.
 
@@ -166,6 +179,17 @@ class _PathBase:
 class Path(str, _PathBase):
     """Path object that works like a string, with methods for working with the GCS file system."""
 
+    _file_system_attrs: set[str] = {
+        "info",
+        "isdir",
+        "open",
+        "exists",
+        "mv",
+        "cp",
+        "rm_file",
+        "rmdir",
+    }
+
     @property
     def _iterable_type(self) -> type | Callable:
         """Can be overridden in subclass."""
@@ -182,14 +206,26 @@ class Path(str, _PathBase):
             .rstrip("/")
         )
 
-    def __new__(cls, gcs_path: str | PurePath | None = None):
+    def __new__(cls, gcs_path: str | PurePath | None = None, file_system=None):
         """Construct Path with '/' as delimiter."""
         gcs_path = cls._standardize_path(gcs_path or "")
         obj = super().__new__(cls, gcs_path)
         obj._path = PurePosixPath(obj)
-        obj._file_system = None
+        obj._file_system = file_system
         return obj
 
+    def buckets_path(self) -> "Path":
+        if self.startswith("/buckets"):
+            return self
+
+        root = self.parts[0]
+        bucket = root.split("-data-")[-1].split("-prod")[0]
+
+        try:
+            return self._new(f"/buckets/{bucket}/{'/'.join(self.parts[1:])}")
+        except IndexError:
+            return self._new(f"/buckets/{bucket}")
+
     def tree(
         self,
         max_rows: int | None = 3,
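
A hypothetical illustration (not part of the diff) of what the new buckets_path() appears to do; the bucket name is invented, and it assumes Path.parts splits the path on "/":

    from daplapath import Path

    # Invented bucket-style root, purely to trace the string logic above:
    # root.split("-data-")[-1].split("-prod")[0] -> "produkt"
    p = Path("ssb-demo-team-data-produkt-prod/inndata/persons_v1.parquet")
    p.buckets_path()
    # would give something like '/buckets/produkt/inndata/persons_v1.parquet'

    Path("/buckets/produkt/inndata/persons_v1.parquet").buckets_path()
    # returned unchanged, since the path already starts with '/buckets'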
@@ -365,7 +401,7 @@ class Path(str, _PathBase):
         'file_v201.parquet'
         """
         version_text = f"{self._version_prefix}{version}" if version is not None else ""
-        return self.__class__(
+        return self._new(
             f"{self.parent}/{self.versionless_stem}{version_text}{self.suffix}"
         )
 
@@ -468,7 +504,7 @@ class Path(str, _PathBase):
 
         parent = f"{self.parent}/" if self.parent != "." else ""
 
-        return self.__class__(
+        return self._new(
             f"{parent}{stem}{period_string}{version_string}{self.suffix}".replace(
                 "".join(self.periods), period_string.strip(self._period_prefix)
             )
@@ -509,12 +545,17 @@ class Path(str, _PathBase):
     @property
     def versionless_stem(self) -> str:
         """Return the file stem before the version pattern."""
-        return self.__class__(re.split(self._version_pattern, self._path.name)[0]).stem
+        return self._new(re.split(self._version_pattern, self._path.name)[0]).stem
 
     @property
     def parent(self) -> "Path":
         """Parent path."""
-        return self.__class__(self._path.parent)
+        return self._new(self._path.parent)
+
+    @property
+    def parents(self) -> "list[Path]":
+        """Parent path."""
+        return [self._new(parent) for parent in self._path.parents]
 
     @property
     def name(self) -> str:
@@ -542,52 +583,48 @@ class Path(str, _PathBase):
 
     @property
     def index_column_names(self) -> list[str]:
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                return _get_index_cols(schema)
-            except KeyError:
-                return read_nrows(file, 1).index.names
+        return _get_index_cols(self.schema)
 
     @property
     def columns(self) -> pd.Index:
         """Columns of the file."""
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                index_cols = _get_index_cols(schema)
-                return pd.Index(schema.names).difference(index_cols)
-            except KeyError:
-                return read_nrows(file, 1).columns
+        schema = self.schema
+        try:
+            names = [
+                x["field_name"]
+                for x in json.loads(schema.metadata[b"pandas"].decode())["columns"]
+            ]
+        except (KeyError, TypeError):
+            names = schema.names
+        index_cols = _get_index_cols(schema)
+        return pd.Index(names).difference(index_cols)
 
     @property
     def schema(self) -> pyarrow.Schema:
         """Date types of the file's columns."""
-        with self.open("rb") as file:
-            return pq.read_schema(file)
+        try:
+            with self.open("rb") as file:
+                return get_schema(file)
+        except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+            return get_schema(self)
 
     @property
     def dtypes(self) -> pd.Series:
         """Date types of the file's columns."""
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                index_cols = _get_index_cols(schema)
-                return pd.Series(schema.types, index=schema.names).loc[
-                    lambda x: ~x.index.isin(index_cols)
-                ]
-            except KeyError:
-                return read_nrows(file, 1).dtypes
+        schema = self.schema
+        index_cols = _get_index_cols(schema)
+        return pd.Series(schema.types, index=schema.names).loc[
+            lambda x: ~x.index.isin(index_cols)
+        ]
 
     @property
     def shape(self) -> tuple[int, int]:
         """Number of rows and columns."""
-        with self.open("rb") as file:
-            try:
-                meta = pq.read_metadata(file)
-                return meta.num_rows, meta.num_columns
-            except KeyError:
-                return read_nrows(file, 1).shape
+        try:
+            with self.open("rb") as file:
+                return get_shape(file)
+        except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+            return get_shape(self)
 
     @property
     def nrow(self) -> int:
@@ -643,25 +680,27 @@ class Path(str, _PathBase):
 
     @property
     def partition_root(self) -> "Path":
+        if ".parquet" not in self:
+            return self
         return self.split(".parquet")[0] + ".parquet"
 
-    def is_dir(self) -> bool:
-        try:
-            return self.file_system.isdir(self)
-        except AttributeError:
-            return self.file_system.is_dir(self)
+    def isfile(self) -> bool:
+        return not self.isdir()
 
     def is_file(self) -> bool:
-        return not self.is_dir()
+        return self.isfile()
+
+    def is_dir(self) -> bool:
+        return self.isdir()
 
     def with_suffix(self, suffix: str):
-        return self.__class__(self._path.with_suffix(suffix))
+        return self._new(self._path.with_suffix(suffix))
 
     def with_name(self, new_name: str):
-        return self.__class__(self._path.with_name(new_name))
+        return self._new(self._path.with_name(new_name))
 
     def with_stem(self, new_with_stem: str):
-        return self.__class__(self._path.with_stem(new_with_stem))
+        return self._new(self._path.with_stem(new_with_stem))
 
     @property
     def file_system(self):
@@ -689,7 +728,7 @@ class Path(str, _PathBase):
             "unsupported operand type(s) for /: "
             f"{self.__class__.__name__} and {other.__class__.__name__}"
         )
-        return self.__class__(f"{self}/{as_str(other)}")
+        return self._new(f"{self}/{as_str(other)}")
 
     def __getattribute__(self, name):
         """stackoverflow hack to ensure we return Path when using string methods.
@@ -721,21 +760,15 @@ class Path(str, _PathBase):
         error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
         if attr.startswith("_"):
             raise AttributeError(error_message)
-        try:
-            return functools.partial(getattr(self.file_system, attr), self)
-        except AttributeError as e:
-            raise AttributeError(error_message) from e
+        if attr not in self._file_system_attrs:
+            raise AttributeError(error_message)
+        return functools.partial(getattr(self.file_system, attr), self)
 
     def __fspath__(self) -> str:
         return str(self)
 
     def __dir__(self) -> list[str]:
-        return list(
-            sorted(
-                {x for x in dir(Path)}
-                | {x for x in dir(self._file_system) if not x.startswith("_")}
-            )
-        )
+        return list(sorted({x for x in dir(Path)} | self._file_system_attrs))
 
     def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
         series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
@@ -743,6 +776,9 @@ class Path(str, _PathBase):
             path._file_system = self._file_system
         return self._iterable_type(series, **kwargs)
 
+    def _new(self, new_path: str | Path) -> "Path":
+        return self.__class__(new_path, self.file_system)
+
 
 class PathSeries(pd.Series, _PathBase):
     """A pandas Series for working with GCS (Google Cloud Storage) paths.
@@ -863,6 +899,12 @@ class PathSeries(pd.Series, _PathBase):
     def partition_root(self) -> "PathSeries":
         return self.files.apply(lambda x: x.partition_root).drop_duplicates()
 
+    @property
+    def partitioned_files(self) -> "PathSeries":
+        return self.files.loc[
+            lambda x: x.str.count(r"\.parquet") == 2
+        ].partition_root.drop_duplicates()
+
     @property
     def dirs(self) -> "PathSeries":
         """Select only the directories in the Series."""
@@ -1218,18 +1260,18 @@ def split_path_and_make_copyable_html(
     split: str | None = "/",
     display_prefix: str | None = ".../",
 ) -> str:
-    """Get html text that displays the last part, but makes the full path copyable to clipboard.
+    """Get HTML text that displays the last part, but makes the full path copyable to clipboard.
 
-    Splits the path on a delimiter and creates an html string that displays only the
+    Splits the path on a delimiter and creates an HTML string that displays only the
     last part, but adds a hyperlink which copies the full path to clipboard when clicked.
 
     Parameters
     ----------
     path: File or directory path
-    max_parts: Maximum number of path paths to display. Defaults to 2,
+    max_parts: Maximum number of path parts to display. Defaults to 2,
         meaning the two last parts. Set to None to show full paths.
     split: Text pattern to split the path on. Defaults to "/".
-    display_prefix: The text to display instead of the parent directory. Defaults to ".../"
+    display_prefix: The text to display instead of the parent directory. Defaults to ".../".
 
     Returns
     -------
@@ -1237,7 +1279,8 @@ def split_path_and_make_copyable_html(
     """
 
     copy_to_clipboard_js = f"""<script>
-function copyToClipboard(text) {{
+function copyToClipboard(text, event) {{
+    event.preventDefault();
     navigator.clipboard.writeText(text)
     .then(() => {{
         const alertBox = document.createElement('div');
@@ -1271,7 +1314,7 @@ function copyToClipboard(text) {{
     else:
         displayed_text = path
 
-    return f'{copy_to_clipboard_js}<a href="{displayed_text}" title="{path}" onclick="copyToClipboard(\'{path}\')">{displayed_text}</a>'
+    return f'{copy_to_clipboard_js}<a href="#" title="{path}" onclick="copyToClipboard(\'{path}\', event)">{displayed_text}</a>'
 
 
 def _get_default_multi_index() -> pd.MultiIndex:
@@ -1458,6 +1501,72 @@ def get_arguments(func: Callable | object) -> list[str]:
     )
 
 
+def get_schema(file) -> pyarrow.Schema:
+    try:
+        return pq.read_schema(file)
+    except (
+        PermissionError,
+        pyarrow.ArrowInvalid,
+        FileNotFoundError,
+        IsADirectoryError,
+        OSError,
+    ):
+        # try:
+        #     return ds.dataset(file).schema
+        # except (TypeError, FileNotFoundError) as e:
+        if not hasattr(file, "file_system"):
+            raise e
+
+        file_system = file.file_system
+
+        def _get_schema(path):
+            try:
+                return pq.read_schema(path)
+            except FileNotFoundError:
+                with file_system.open(path, "rb") as f:
+                    return pq.read_schema(f)
+
+        with ThreadPoolExecutor() as executor:
+            return pyarrow.unify_schemas(
+                list(
+                    executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
+                ),
+                promote_options="permissive",
+            )
+
+
+def get_num_rows(file):
+    try:
+        return pq.read_metadata(file).num_rows
+    except (
+        PermissionError,
+        pyarrow.ArrowInvalid,
+        FileNotFoundError,
+        TypeError,
+        OSError,
+    ) as e:
+        try:
+            return ds.dataset(file).count_rows()
+        except Exception as e2:
+            if not hasattr(file, "glob"):
+                raise e2 from 2
+
+            def _get_num_rows(path):
+                with path.open("rb") as file:
+                    return pq.read_metadata(file).num_rows
+
+            with ThreadPoolExecutor() as executor:
+                return sum(executor.map(_get_num_rows, file.glob("**").files))
+
+
+def get_shape(file) -> tuple[int, int]:
+    schema = get_schema(file)
+    index_cols = _get_index_cols(schema)
+    ncol: int = sum(name not in index_cols for name in schema.names)
+    nrow: int = get_num_rows(file)
+    return nrow, ncol
+
+
 def read_nrows(file, nrow: int) -> pd.DataFrame:
     """Read first n rows of a parquet file."""
     rows = next(pq.ParquetFile(file).iter_batches(nrow))
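
A short usage sketch (not part of the diff) of the new module-level helpers; the path is invented, and the directory fallback is the behaviour described by the code above:

    from daplapath import LocalFileSystem, Path

    # Illustrative only: .schema and .shape now delegate to get_schema()/get_shape(),
    # which fall back to reading the individual .parquet files when the path is a
    # partitioned directory rather than a single file.
    p = Path("/tmp/data/partitioned_table.parquet", file_system=LocalFileSystem())
    schema = p.schema       # pyarrow.Schema, unified across partition files if needed
    nrow, ncol = p.shape    # (row count, non-index column count) via get_shape()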
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "daplapath"
-version = "2.0.3"
+version = "2.0.6"
 description = "A pathlib.Path class for dapla"
 authors = ["ort <ort@ssb.no>"]
 license = "MIT"