PyPI - daplapath - Versions diffs - 2.0.8__tar.gz → 2.0.10__tar.gz - Mend

daplapath 2.0.8__tar.gz → 2.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

{daplapath-2.0.8 → daplapath-2.0.10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: daplapath
-Version: 2.0.8
+Version: 2.0.10
 Summary: A pathlib.Path class for dapla
 License: MIT
 Author: ort

{daplapath-2.0.8 → daplapath-2.0.10}/daplapath/path.py RENAMED Viewed

@@ -31,6 +31,11 @@ try:
 except ImportError:
     pass
+try:
+    from google.cloud import storage
+except ImportError:
+    pass
 # regex with the prefix '_v' followed by an integer of any length
 VERSION_PATTERN = r"_v(\d+)\."
 VERSION_PREFIX = "_v"
@@ -113,15 +118,26 @@ class LocalFileSystem(AbstractFileSystem):
         return os.makedirs(path, exist_ok=exist_ok)
-class GCSFileSystem(gcsfs.GCSFileSystem):
+class MyGCSFileSystem(gcsfs.GCSFileSystem):
     def isdir(self, path: str) -> bool:
         """Check if path is a directory."""
         info = super(gcsfs.GCSFileSystem, self).info(path)
         return info["type"] == "directory"
+    def rmdir(self, path: str) -> None:
+        """Remove contents of a directory in GCS. It might take some time before files are actually deleted."""
+        path = pathlib.Path(path)
+        remaining = self.glob(str(path / "**"))
+        assert all(self.isdir(x) for x in remaining), remaining
+        storage_client = storage.Client()
+        bucket = storage_client.get_bucket(path.parts[0])
+        blobs = bucket.list_blobs(prefix="/".join(path.parts) + "/")
+        for blob in blobs:
+            blob.delete()
 if any("dapla" in key.lower() for key in os.environ) and "gcsfs" in locals():
-    _config = Config(GCSFileSystem)
+    _config = Config(MyGCSFileSystem)
 else:
     _config = Config(LocalFileSystem)
@@ -314,7 +330,14 @@ class Path(str, _PathBase):
         return self._cp_or_mv(destination, "cp")
     def mv(self, destination: "Path | str") -> "Path":
-        return self._cp_or_mv(destination, "mv")
+        was_dir = self.isdir()
+        out_path = self._cp_or_mv(destination, "mv")
+        if was_dir:
+            try:
+                self.file_system.rmdir(str(self))
+            except (FileNotFoundError, NotADirectoryError):
+                pass
+        return out_path
     def versions(self, include_versionless: bool = False) -> "PathSeries":
         """Returns a PathSeries of all versions of the file."""
@@ -599,7 +622,7 @@ class Path(str, _PathBase):
     @property
     def index_column_names(self) -> list[str]:
-        return _get_index_cols(self.schema)
+        return _get_index_cols(self.schema, self)
     @property
     def columns(self) -> pd.Index:
@@ -612,7 +635,7 @@ class Path(str, _PathBase):
             ]
         except (KeyError, TypeError):
             names = schema.names
-        index_cols = _get_index_cols(schema)
+        index_cols = _get_index_cols(schema, self)
         return pd.Index(names).difference(index_cols)
     @property
@@ -621,16 +644,14 @@ class Path(str, _PathBase):
         try:
             with self.open("rb") as file:
                 return get_schema(file)
-        except (
-            Exception
-        ):  # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+        except Exception:
             return get_schema(self)
     @property
     def dtypes(self) -> pd.Series:
         """Date types of the file's columns."""
         schema = self.schema
-        index_cols = _get_index_cols(schema)
+        index_cols = _get_index_cols(schema, self)
         return pd.Series(schema.types, index=schema.names).loc[
             lambda x: ~x.index.isin(index_cols)
         ]
@@ -641,9 +662,7 @@ class Path(str, _PathBase):
         try:
             with self.open("rb") as file:
                 return get_shape(file)
-        except (
-            Exception
-        ):  # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+        except Exception:
             return get_shape(self)
     @property
@@ -702,7 +721,12 @@ class Path(str, _PathBase):
     def partition_root(self) -> "Path":
         if ".parquet" not in self:
             return self
-        return self.split(".parquet")[0] + ".parquet"
+        return self._new(self.split(".parquet")[0] + ".parquet")
+    def is_partitioned(self) -> bool:
+        if ".parquet" not in self or self.isfile() and self.count(".parquet") != 2:
+            return False
+        return bool(len(self.glob("**/*.parquet")))
     def isfile(self) -> bool:
         return not self.isdir()
@@ -1517,8 +1541,11 @@ def get_path_tree(
     return tree
-def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
-    cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
+def _get_index_cols(schema: pyarrow.Schema, path_or_file: str | Path) -> list[str]:
+    try:
+        cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
+    except KeyError as e:
+        raise KeyError(f"{e}. For {type(path_or_file)}: {path_or_file}")
     return [x for x in cols if not isinstance(x, dict)]
@@ -1570,13 +1597,21 @@ def get_schema(file) -> pyarrow.Schema:
                 except Exception as e2:
                     raise e2.__class__(f"{e2}. {path}") from e
+        child_paths = file_system.glob(file + "/**/*.parquet")
+        if not len(child_paths):
+            raise e.__class__(f"{e}: {file}") from e
         with ThreadPoolExecutor() as executor:
-            return pyarrow.unify_schemas(
-                list(
-                    executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
-                ),
-                promote_options="permissive",
+            schemas: list[pyarrow.Schema] = list(
+                executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
             )
+        if not schemas:
+            raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
+        return pyarrow.unify_schemas(
+            schemas,
+            promote_options="permissive",
+        )
 def get_num_rows(file):
@@ -1599,7 +1634,7 @@ def get_num_rows(file):
 def get_shape(file) -> tuple[int, int]:
     schema = get_schema(file)
-    index_cols = _get_index_cols(schema)
+    index_cols = _get_index_cols(schema, file)
     ncol: int = sum(name not in index_cols for name in schema.names)
     nrow: int = get_num_rows(file)
     return nrow, ncol

{daplapath-2.0.8 → daplapath-2.0.10}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "daplapath"
-version = "2.0.8"
+version = "2.0.10"
 description = "A pathlib.Path class for dapla"
 authors = ["ort <ort@ssb.no>"]
 license = "MIT"

{daplapath-2.0.8 → daplapath-2.0.10}/LICENSE.md RENAMED Viewed

File without changes

{daplapath-2.0.8 → daplapath-2.0.10}/README.md RENAMED Viewed

File without changes

{daplapath-2.0.8 → daplapath-2.0.10}/daplapath/__init__.py RENAMED Viewed

File without changes

daplapath 2.0.8__tar.gz → 2.0.10__tar.gz

daplapath 2.0.8__tar.gz → 2.0.10__tar.gz