daplapath 2.0.8__tar.gz → 2.0.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {daplapath-2.0.8 → daplapath-2.0.10}/PKG-INFO +1 -1
- {daplapath-2.0.8 → daplapath-2.0.10}/daplapath/path.py +56 -21
- {daplapath-2.0.8 → daplapath-2.0.10}/pyproject.toml +1 -1
- {daplapath-2.0.8 → daplapath-2.0.10}/LICENSE.md +0 -0
- {daplapath-2.0.8 → daplapath-2.0.10}/README.md +0 -0
- {daplapath-2.0.8 → daplapath-2.0.10}/daplapath/__init__.py +0 -0
|
@@ -31,6 +31,11 @@ try:
|
|
|
31
31
|
except ImportError:
|
|
32
32
|
pass
|
|
33
33
|
|
|
34
|
+
try:
|
|
35
|
+
from google.cloud import storage
|
|
36
|
+
except ImportError:
|
|
37
|
+
pass
|
|
38
|
+
|
|
34
39
|
# regex with the prefix '_v' followed by an integer of any length
|
|
35
40
|
VERSION_PATTERN = r"_v(\d+)\."
|
|
36
41
|
VERSION_PREFIX = "_v"
|
|
@@ -113,15 +118,26 @@ class LocalFileSystem(AbstractFileSystem):
|
|
|
113
118
|
return os.makedirs(path, exist_ok=exist_ok)
|
|
114
119
|
|
|
115
120
|
|
|
116
|
-
class
|
|
121
|
+
class MyGCSFileSystem(gcsfs.GCSFileSystem):
|
|
117
122
|
def isdir(self, path: str) -> bool:
|
|
118
123
|
"""Check if path is a directory."""
|
|
119
124
|
info = super(gcsfs.GCSFileSystem, self).info(path)
|
|
120
125
|
return info["type"] == "directory"
|
|
121
126
|
|
|
127
|
+
def rmdir(self, path: str) -> None:
|
|
128
|
+
"""Remove contents of a directory in GCS. It might take some time before files are actually deleted."""
|
|
129
|
+
path = pathlib.Path(path)
|
|
130
|
+
remaining = self.glob(str(path / "**"))
|
|
131
|
+
assert all(self.isdir(x) for x in remaining), remaining
|
|
132
|
+
storage_client = storage.Client()
|
|
133
|
+
bucket = storage_client.get_bucket(path.parts[0])
|
|
134
|
+
blobs = bucket.list_blobs(prefix="/".join(path.parts) + "/")
|
|
135
|
+
for blob in blobs:
|
|
136
|
+
blob.delete()
|
|
137
|
+
|
|
122
138
|
|
|
123
139
|
if any("dapla" in key.lower() for key in os.environ) and "gcsfs" in locals():
|
|
124
|
-
_config = Config(
|
|
140
|
+
_config = Config(MyGCSFileSystem)
|
|
125
141
|
else:
|
|
126
142
|
_config = Config(LocalFileSystem)
|
|
127
143
|
|
|
@@ -314,7 +330,14 @@ class Path(str, _PathBase):
|
|
|
314
330
|
return self._cp_or_mv(destination, "cp")
|
|
315
331
|
|
|
316
332
|
def mv(self, destination: "Path | str") -> "Path":
|
|
317
|
-
|
|
333
|
+
was_dir = self.isdir()
|
|
334
|
+
out_path = self._cp_or_mv(destination, "mv")
|
|
335
|
+
if was_dir:
|
|
336
|
+
try:
|
|
337
|
+
self.file_system.rmdir(str(self))
|
|
338
|
+
except (FileNotFoundError, NotADirectoryError):
|
|
339
|
+
pass
|
|
340
|
+
return out_path
|
|
318
341
|
|
|
319
342
|
def versions(self, include_versionless: bool = False) -> "PathSeries":
|
|
320
343
|
"""Returns a PathSeries of all versions of the file."""
|
|
@@ -599,7 +622,7 @@ class Path(str, _PathBase):
|
|
|
599
622
|
|
|
600
623
|
@property
|
|
601
624
|
def index_column_names(self) -> list[str]:
|
|
602
|
-
return _get_index_cols(self.schema)
|
|
625
|
+
return _get_index_cols(self.schema, self)
|
|
603
626
|
|
|
604
627
|
@property
|
|
605
628
|
def columns(self) -> pd.Index:
|
|
@@ -612,7 +635,7 @@ class Path(str, _PathBase):
|
|
|
612
635
|
]
|
|
613
636
|
except (KeyError, TypeError):
|
|
614
637
|
names = schema.names
|
|
615
|
-
index_cols = _get_index_cols(schema)
|
|
638
|
+
index_cols = _get_index_cols(schema, self)
|
|
616
639
|
return pd.Index(names).difference(index_cols)
|
|
617
640
|
|
|
618
641
|
@property
|
|
@@ -621,16 +644,14 @@ class Path(str, _PathBase):
|
|
|
621
644
|
try:
|
|
622
645
|
with self.open("rb") as file:
|
|
623
646
|
return get_schema(file)
|
|
624
|
-
except
|
|
625
|
-
Exception
|
|
626
|
-
): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
647
|
+
except Exception:
|
|
627
648
|
return get_schema(self)
|
|
628
649
|
|
|
629
650
|
@property
|
|
630
651
|
def dtypes(self) -> pd.Series:
|
|
631
652
|
"""Date types of the file's columns."""
|
|
632
653
|
schema = self.schema
|
|
633
|
-
index_cols = _get_index_cols(schema)
|
|
654
|
+
index_cols = _get_index_cols(schema, self)
|
|
634
655
|
return pd.Series(schema.types, index=schema.names).loc[
|
|
635
656
|
lambda x: ~x.index.isin(index_cols)
|
|
636
657
|
]
|
|
@@ -641,9 +662,7 @@ class Path(str, _PathBase):
|
|
|
641
662
|
try:
|
|
642
663
|
with self.open("rb") as file:
|
|
643
664
|
return get_shape(file)
|
|
644
|
-
except
|
|
645
|
-
Exception
|
|
646
|
-
): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
665
|
+
except Exception:
|
|
647
666
|
return get_shape(self)
|
|
648
667
|
|
|
649
668
|
@property
|
|
@@ -702,7 +721,12 @@ class Path(str, _PathBase):
|
|
|
702
721
|
def partition_root(self) -> "Path":
|
|
703
722
|
if ".parquet" not in self:
|
|
704
723
|
return self
|
|
705
|
-
return self.split(".parquet")[0] + ".parquet"
|
|
724
|
+
return self._new(self.split(".parquet")[0] + ".parquet")
|
|
725
|
+
|
|
726
|
+
def is_partitioned(self) -> bool:
|
|
727
|
+
if ".parquet" not in self or self.isfile() and self.count(".parquet") != 2:
|
|
728
|
+
return False
|
|
729
|
+
return bool(len(self.glob("**/*.parquet")))
|
|
706
730
|
|
|
707
731
|
def isfile(self) -> bool:
|
|
708
732
|
return not self.isdir()
|
|
@@ -1517,8 +1541,11 @@ def get_path_tree(
|
|
|
1517
1541
|
return tree
|
|
1518
1542
|
|
|
1519
1543
|
|
|
1520
|
-
def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
|
|
1521
|
-
|
|
1544
|
+
def _get_index_cols(schema: pyarrow.Schema, path_or_file: str | Path) -> list[str]:
|
|
1545
|
+
try:
|
|
1546
|
+
cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
|
|
1547
|
+
except KeyError as e:
|
|
1548
|
+
raise KeyError(f"{e}. For {type(path_or_file)}: {path_or_file}")
|
|
1522
1549
|
return [x for x in cols if not isinstance(x, dict)]
|
|
1523
1550
|
|
|
1524
1551
|
|
|
@@ -1570,13 +1597,21 @@ def get_schema(file) -> pyarrow.Schema:
|
|
|
1570
1597
|
except Exception as e2:
|
|
1571
1598
|
raise e2.__class__(f"{e2}. {path}") from e
|
|
1572
1599
|
|
|
1600
|
+
child_paths = file_system.glob(file + "/**/*.parquet")
|
|
1601
|
+
if not len(child_paths):
|
|
1602
|
+
raise e.__class__(f"{e}: {file}") from e
|
|
1603
|
+
|
|
1573
1604
|
with ThreadPoolExecutor() as executor:
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
|
|
1577
|
-
),
|
|
1578
|
-
promote_options="permissive",
|
|
1605
|
+
schemas: list[pyarrow.Schema] = list(
|
|
1606
|
+
executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
|
|
1579
1607
|
)
|
|
1608
|
+
if not schemas:
|
|
1609
|
+
raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
|
|
1610
|
+
|
|
1611
|
+
return pyarrow.unify_schemas(
|
|
1612
|
+
schemas,
|
|
1613
|
+
promote_options="permissive",
|
|
1614
|
+
)
|
|
1580
1615
|
|
|
1581
1616
|
|
|
1582
1617
|
def get_num_rows(file):
|
|
@@ -1599,7 +1634,7 @@ def get_num_rows(file):
|
|
|
1599
1634
|
|
|
1600
1635
|
def get_shape(file) -> tuple[int, int]:
|
|
1601
1636
|
schema = get_schema(file)
|
|
1602
|
-
index_cols = _get_index_cols(schema)
|
|
1637
|
+
index_cols = _get_index_cols(schema, file)
|
|
1603
1638
|
ncol: int = sum(name not in index_cols for name in schema.names)
|
|
1604
1639
|
nrow: int = get_num_rows(file)
|
|
1605
1640
|
return nrow, ncol
|
|
File without changes
|
|
File without changes
|
|
File without changes
|