daplapath 2.0.3__tar.gz → 2.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {daplapath-2.0.3 → daplapath-2.0.5}/PKG-INFO +1 -1
- {daplapath-2.0.3 → daplapath-2.0.5}/daplapath/path.py +171 -64
- {daplapath-2.0.3 → daplapath-2.0.5}/pyproject.toml +1 -1
- {daplapath-2.0.3 → daplapath-2.0.5}/LICENSE.md +0 -0
- {daplapath-2.0.3 → daplapath-2.0.5}/README.md +0 -0
- {daplapath-2.0.3 → daplapath-2.0.5}/daplapath/__init__.py +0 -0
|
@@ -14,8 +14,8 @@ import shutil
|
|
|
14
14
|
from typing import Callable, Any
|
|
15
15
|
import inspect
|
|
16
16
|
import itertools
|
|
17
|
-
import warnings
|
|
18
17
|
|
|
18
|
+
from fsspec.spec import AbstractFileSystem
|
|
19
19
|
import datetime
|
|
20
20
|
import numpy as np
|
|
21
21
|
import pandas as pd
|
|
@@ -23,6 +23,8 @@ import pandas.io.formats.format as fmt
|
|
|
23
23
|
from pandas.api.types import is_dict_like
|
|
24
24
|
import pyarrow
|
|
25
25
|
import pyarrow.parquet as pq
|
|
26
|
+
import pyarrow.dataset as ds
|
|
27
|
+
|
|
26
28
|
|
|
27
29
|
try:
|
|
28
30
|
import gcsfs
|
|
@@ -47,15 +49,15 @@ class Config:
|
|
|
47
49
|
file_system: Callable
|
|
48
50
|
|
|
49
51
|
|
|
50
|
-
class LocalFileSystem:
|
|
52
|
+
class LocalFileSystem(AbstractFileSystem):
|
|
51
53
|
"""Mimics GCS's FileSystem, but uses the standard library (os, glob, shutil)."""
|
|
52
54
|
|
|
53
55
|
@staticmethod
|
|
54
56
|
def glob(
|
|
55
57
|
path: str,
|
|
56
|
-
recursive: bool = True,
|
|
57
58
|
detail: bool = False,
|
|
58
|
-
|
|
59
|
+
recursive: bool = True,
|
|
60
|
+
include_hidden: bool = True,
|
|
59
61
|
**kwargs,
|
|
60
62
|
) -> list[dict] | list[str]:
|
|
61
63
|
relevant_paths = glob.iglob(
|
|
@@ -67,14 +69,16 @@ class LocalFileSystem:
|
|
|
67
69
|
with ThreadPoolExecutor() as executor:
|
|
68
70
|
return list(executor.map(get_file_info, relevant_paths))
|
|
69
71
|
|
|
72
|
+
@classmethod
|
|
73
|
+
def ls(cls, path: str, detail: bool = False, **kwargs):
|
|
74
|
+
return cls().glob(
|
|
75
|
+
str(pathlib.Path(path) / "**"), detail=detail, recursive=False, **kwargs
|
|
76
|
+
)
|
|
77
|
+
|
|
70
78
|
@staticmethod
|
|
71
79
|
def info(path) -> dict[str, Any]:
|
|
72
80
|
return get_file_info(path)
|
|
73
81
|
|
|
74
|
-
@staticmethod
|
|
75
|
-
def isdir(path: str) -> bool:
|
|
76
|
-
return os.path.isdir(path)
|
|
77
|
-
|
|
78
82
|
@staticmethod
|
|
79
83
|
def open(path: str, *args, **kwargs) -> io.TextIOWrapper:
|
|
80
84
|
return open(path, *args, **kwargs)
|
|
@@ -87,8 +91,12 @@ class LocalFileSystem:
|
|
|
87
91
|
def mv(source: str, destination, **kwargs) -> str:
|
|
88
92
|
return shutil.move(source, destination, **kwargs)
|
|
89
93
|
|
|
94
|
+
@classmethod
|
|
95
|
+
def cp(cls, source: str, destination, **kwargs) -> str:
|
|
96
|
+
return cls.cp_file(source, destination, **kwargs)
|
|
97
|
+
|
|
90
98
|
@staticmethod
|
|
91
|
-
def
|
|
99
|
+
def cp_file(self, path1, path2, **kwargs):
|
|
92
100
|
os.makedirs(pathlib.Path(destination).parent, exist_ok=True)
|
|
93
101
|
return shutil.copy2(source, destination, **kwargs)
|
|
94
102
|
|
|
@@ -96,6 +104,14 @@ class LocalFileSystem:
|
|
|
96
104
|
def rm_file(path: str, *args, **kwargs) -> None:
|
|
97
105
|
return os.remove(path, *args, **kwargs)
|
|
98
106
|
|
|
107
|
+
@staticmethod
|
|
108
|
+
def rmdir(path: str, *args, **kwargs) -> None:
|
|
109
|
+
return shutil.rmtree(path, *args, **kwargs)
|
|
110
|
+
|
|
111
|
+
@staticmethod
|
|
112
|
+
def makedirs(path: str, exist_ok: bool = False) -> None:
|
|
113
|
+
return os.makedirs(path, exist_ok=exist_ok)
|
|
114
|
+
|
|
99
115
|
|
|
100
116
|
class GCSFileSystem(gcsfs.GCSFileSystem):
|
|
101
117
|
def isdir(self, path: str) -> bool:
|
|
@@ -110,9 +126,6 @@ else:
|
|
|
110
126
|
_config = Config(LocalFileSystem)
|
|
111
127
|
|
|
112
128
|
|
|
113
|
-
gcsfs.GCSFileSystem.isdir
|
|
114
|
-
|
|
115
|
-
|
|
116
129
|
class Tree:
|
|
117
130
|
"""Stores text to be printed/displayed in directory tree format.
|
|
118
131
|
|
|
@@ -166,6 +179,17 @@ class _PathBase:
|
|
|
166
179
|
class Path(str, _PathBase):
|
|
167
180
|
"""Path object that works like a string, with methods for working with the GCS file system."""
|
|
168
181
|
|
|
182
|
+
_file_system_attrs: set[str] = {
|
|
183
|
+
"info",
|
|
184
|
+
"isdir",
|
|
185
|
+
"open",
|
|
186
|
+
"exists",
|
|
187
|
+
"mv",
|
|
188
|
+
"cp",
|
|
189
|
+
"rm_file",
|
|
190
|
+
"rmdir",
|
|
191
|
+
}
|
|
192
|
+
|
|
169
193
|
@property
|
|
170
194
|
def _iterable_type(self) -> type | Callable:
|
|
171
195
|
"""Can be overridden in subclass."""
|
|
@@ -190,6 +214,19 @@ class Path(str, _PathBase):
|
|
|
190
214
|
obj._file_system = None
|
|
191
215
|
return obj
|
|
192
216
|
|
|
217
|
+
@property
|
|
218
|
+
def local_path(self) -> "Path":
|
|
219
|
+
if self.startswith("/buckets"):
|
|
220
|
+
return self
|
|
221
|
+
|
|
222
|
+
root = self.parts[0]
|
|
223
|
+
bucket = root.split("-data-")[-1].split("-prod")[0]
|
|
224
|
+
|
|
225
|
+
try:
|
|
226
|
+
return self.__class__(f"/buckets/{bucket}/{'/'.join(self.parts[1:])}")
|
|
227
|
+
except IndexError:
|
|
228
|
+
return self.__class__(f"/buckets/{bucket}")
|
|
229
|
+
|
|
193
230
|
def tree(
|
|
194
231
|
self,
|
|
195
232
|
max_rows: int | None = 3,
|
|
@@ -516,6 +553,11 @@ class Path(str, _PathBase):
|
|
|
516
553
|
"""Parent path."""
|
|
517
554
|
return self.__class__(self._path.parent)
|
|
518
555
|
|
|
556
|
+
@property
|
|
557
|
+
def parents(self) -> "list[Path]":
|
|
558
|
+
"""All parent (ancestor) paths."""
|
|
559
|
+
return [self.__class__(parent) for parent in self._path.parents]
|
|
560
|
+
|
|
519
561
|
@property
|
|
520
562
|
def name(self) -> str:
|
|
521
563
|
"""Final part of the path."""
|
|
@@ -542,52 +584,48 @@ class Path(str, _PathBase):
|
|
|
542
584
|
|
|
543
585
|
@property
|
|
544
586
|
def index_column_names(self) -> list[str]:
|
|
545
|
-
|
|
546
|
-
try:
|
|
547
|
-
schema = pq.read_schema(file)
|
|
548
|
-
return _get_index_cols(schema)
|
|
549
|
-
except KeyError:
|
|
550
|
-
return read_nrows(file, 1).index.names
|
|
587
|
+
return _get_index_cols(self.schema)
|
|
551
588
|
|
|
552
589
|
@property
|
|
553
590
|
def columns(self) -> pd.Index:
|
|
554
591
|
"""Columns of the file."""
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
592
|
+
schema = self.schema
|
|
593
|
+
try:
|
|
594
|
+
names = [
|
|
595
|
+
x["field_name"]
|
|
596
|
+
for x in json.loads(schema.metadata[b"pandas"].decode())["columns"]
|
|
597
|
+
]
|
|
598
|
+
except (KeyError, TypeError):
|
|
599
|
+
names = schema.names
|
|
600
|
+
index_cols = _get_index_cols(schema)
|
|
601
|
+
return pd.Index(names).difference(index_cols)
|
|
562
602
|
|
|
563
603
|
@property
|
|
564
604
|
def schema(self) -> pyarrow.Schema:
|
|
565
605
|
"""Schema (column names and data types) of the file."""
|
|
566
|
-
|
|
567
|
-
|
|
606
|
+
try:
|
|
607
|
+
with self.open("rb") as file:
|
|
608
|
+
return get_schema(file)
|
|
609
|
+
except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
610
|
+
return get_schema(self)
|
|
568
611
|
|
|
569
612
|
@property
|
|
570
613
|
def dtypes(self) -> pd.Series:
|
|
571
614
|
"""Data types of the file's columns."""
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
lambda x: ~x.index.isin(index_cols)
|
|
578
|
-
]
|
|
579
|
-
except KeyError:
|
|
580
|
-
return read_nrows(file, 1).dtypes
|
|
615
|
+
schema = self.schema
|
|
616
|
+
index_cols = _get_index_cols(schema)
|
|
617
|
+
return pd.Series(schema.types, index=schema.names).loc[
|
|
618
|
+
lambda x: ~x.index.isin(index_cols)
|
|
619
|
+
]
|
|
581
620
|
|
|
582
621
|
@property
|
|
583
622
|
def shape(self) -> tuple[int, int]:
|
|
584
623
|
"""Number of rows and columns."""
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
return read_nrows(file, 1).shape
|
|
624
|
+
try:
|
|
625
|
+
with self.open("rb") as file:
|
|
626
|
+
return get_shape(file)
|
|
627
|
+
except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
628
|
+
return get_shape(self)
|
|
591
629
|
|
|
592
630
|
@property
|
|
593
631
|
def nrow(self) -> int:
|
|
@@ -643,16 +681,18 @@ class Path(str, _PathBase):
|
|
|
643
681
|
|
|
644
682
|
@property
|
|
645
683
|
def partition_root(self) -> "Path":
|
|
684
|
+
if ".parquet" not in self:
|
|
685
|
+
return self
|
|
646
686
|
return self.split(".parquet")[0] + ".parquet"
|
|
647
687
|
|
|
648
|
-
def
|
|
649
|
-
|
|
650
|
-
return self.file_system.isdir(self)
|
|
651
|
-
except AttributeError:
|
|
652
|
-
return self.file_system.is_dir(self)
|
|
688
|
+
def isfile(self) -> bool:
|
|
689
|
+
return not self.isdir()
|
|
653
690
|
|
|
654
691
|
def is_file(self) -> bool:
|
|
655
|
-
return
|
|
692
|
+
return self.isfile()
|
|
693
|
+
|
|
694
|
+
def is_dir(self) -> bool:
|
|
695
|
+
return self.isdir()
|
|
656
696
|
|
|
657
697
|
def with_suffix(self, suffix: str):
|
|
658
698
|
return self.__class__(self._path.with_suffix(suffix))
|
|
@@ -721,21 +761,15 @@ class Path(str, _PathBase):
|
|
|
721
761
|
error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
|
|
722
762
|
if attr.startswith("_"):
|
|
723
763
|
raise AttributeError(error_message)
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
raise AttributeError(error_message) from e
|
|
764
|
+
if attr not in self._file_system_attrs:
|
|
765
|
+
raise AttributeError(error_message)
|
|
766
|
+
return functools.partial(getattr(self.file_system, attr), self)
|
|
728
767
|
|
|
729
768
|
def __fspath__(self) -> str:
|
|
730
769
|
return str(self)
|
|
731
770
|
|
|
732
771
|
def __dir__(self) -> list[str]:
|
|
733
|
-
return list(
|
|
734
|
-
sorted(
|
|
735
|
-
{x for x in dir(Path)}
|
|
736
|
-
| {x for x in dir(self._file_system) if not x.startswith("_")}
|
|
737
|
-
)
|
|
738
|
-
)
|
|
772
|
+
return list(sorted({x for x in dir(Path)} | self._file_system_attrs))
|
|
739
773
|
|
|
740
774
|
def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
|
|
741
775
|
series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
|
|
@@ -863,6 +897,12 @@ class PathSeries(pd.Series, _PathBase):
|
|
|
863
897
|
def partition_root(self) -> "PathSeries":
|
|
864
898
|
return self.files.apply(lambda x: x.partition_root).drop_duplicates()
|
|
865
899
|
|
|
900
|
+
@property
|
|
901
|
+
def partitioned_files(self) -> "PathSeries":
|
|
902
|
+
return self.files.loc[
|
|
903
|
+
lambda x: x.str.count(r"\.parquet") == 2
|
|
904
|
+
].partition_root.drop_duplicates()
|
|
905
|
+
|
|
866
906
|
@property
|
|
867
907
|
def dirs(self) -> "PathSeries":
|
|
868
908
|
"""Select only the directories in the Series."""
|
|
@@ -1218,18 +1258,18 @@ def split_path_and_make_copyable_html(
|
|
|
1218
1258
|
split: str | None = "/",
|
|
1219
1259
|
display_prefix: str | None = ".../",
|
|
1220
1260
|
) -> str:
|
|
1221
|
-
"""Get
|
|
1261
|
+
"""Get HTML text that displays the last part, but makes the full path copyable to clipboard.
|
|
1222
1262
|
|
|
1223
|
-
Splits the path on a delimiter and creates an
|
|
1263
|
+
Splits the path on a delimiter and creates an HTML string that displays only the
|
|
1224
1264
|
last part, but adds a hyperlink which copies the full path to clipboard when clicked.
|
|
1225
1265
|
|
|
1226
1266
|
Parameters
|
|
1227
1267
|
----------
|
|
1228
1268
|
path: File or directory path
|
|
1229
|
-
max_parts: Maximum number of path
|
|
1269
|
+
max_parts: Maximum number of path parts to display. Defaults to 2,
|
|
1230
1270
|
meaning the two last parts. Set to None to show full paths.
|
|
1231
1271
|
split: Text pattern to split the path on. Defaults to "/".
|
|
1232
|
-
display_prefix: The text to display instead of the parent directory. Defaults to ".../"
|
|
1272
|
+
display_prefix: The text to display instead of the parent directory. Defaults to ".../".
|
|
1233
1273
|
|
|
1234
1274
|
Returns
|
|
1235
1275
|
-------
|
|
@@ -1237,7 +1277,8 @@ def split_path_and_make_copyable_html(
|
|
|
1237
1277
|
"""
|
|
1238
1278
|
|
|
1239
1279
|
copy_to_clipboard_js = f"""<script>
|
|
1240
|
-
function copyToClipboard(text) {{
|
|
1280
|
+
function copyToClipboard(text, event) {{
|
|
1281
|
+
event.preventDefault();
|
|
1241
1282
|
navigator.clipboard.writeText(text)
|
|
1242
1283
|
.then(() => {{
|
|
1243
1284
|
const alertBox = document.createElement('div');
|
|
@@ -1271,7 +1312,7 @@ function copyToClipboard(text) {{
|
|
|
1271
1312
|
else:
|
|
1272
1313
|
displayed_text = path
|
|
1273
1314
|
|
|
1274
|
-
return f'{copy_to_clipboard_js}<a href="
|
|
1315
|
+
return f'{copy_to_clipboard_js}<a href="#" title="{path}" onclick="copyToClipboard(\'{path}\', event)">{displayed_text}</a>'
|
|
1275
1316
|
|
|
1276
1317
|
|
|
1277
1318
|
def _get_default_multi_index() -> pd.MultiIndex:
|
|
@@ -1458,6 +1499,72 @@ def get_arguments(func: Callable | object) -> list[str]:
|
|
|
1458
1499
|
)
|
|
1459
1500
|
|
|
1460
1501
|
|
|
1502
|
+
def get_schema(file) -> pyarrow.Schema:
    """Read the pyarrow schema of a parquet file or a directory of parquet files.

    The schema is first read directly with ``pq.read_schema``. If that fails
    and ``file`` has a ``file_system`` attribute, the schema of every
    ``*.parquet`` file matched by ``file + "/**/*.parquet"`` is read with a
    thread pool, and the schemas are combined with ``pyarrow.unify_schemas``
    using ``promote_options="permissive"``.

    Parameters
    ----------
    file: Path or open file object to pass to ``pq.read_schema``. The
        directory fallback is only used if the object has a ``file_system``
        attribute (providing ``glob`` and ``open``) and supports string
        concatenation.

    Returns
    -------
    The schema of the file, or the unified schema of the matched parquet files.

    Raises
    ------
    pyarrow.ArrowInvalid, OSError
        The error from the direct read, re-raised unchanged, when ``file``
        has no ``file_system`` attribute to fall back on.
    """
    try:
        return pq.read_schema(file)
    except (pyarrow.ArrowInvalid, OSError):
        # OSError also covers PermissionError, FileNotFoundError and
        # IsADirectoryError, which are all subclasses of it.
        if not hasattr(file, "file_system"):
            # Bare ``raise`` re-raises the active exception. The handler binds
            # no name, so ``raise e`` would fail with a NameError instead.
            raise

        file_system = file.file_system

        def _read_one_schema(path):
            # Try the plain path first; if it cannot be found that way,
            # open it through the file system and read from the handle.
            try:
                return pq.read_schema(path)
            except FileNotFoundError:
                with file_system.open(path, "rb") as f:
                    return pq.read_schema(f)

        with ThreadPoolExecutor() as executor:
            schemas = list(
                executor.map(_read_one_schema, file_system.glob(file + "/**/*.parquet"))
            )
        return pyarrow.unify_schemas(schemas, promote_options="permissive")
|
|
1534
|
+
|
|
1535
|
+
|
|
1536
|
+
def get_num_rows(file) -> int:
    """Count the rows of a parquet file or a directory of parquet files.

    Three strategies are tried in order:

    1. Read ``num_rows`` from the metadata returned by ``pq.read_metadata``.
    2. Open ``file`` as a dataset with ``ds.dataset`` and call ``count_rows``.
    3. If ``file`` has a ``glob`` method, sum the metadata row counts of every
       path in ``file.glob("**").files``, read with a thread pool.

    Parameters
    ----------
    file: Path or open file object. The last fallback requires a ``glob``
        method whose result has a ``files`` attribute of objects with an
        ``open`` method.

    Returns
    -------
    The number of rows.

    Raises
    ------
    Exception
        The error from the dataset attempt, chained to the error from the
        metadata attempt, when ``file`` has no ``glob`` method.
    """
    try:
        return pq.read_metadata(file).num_rows
    except (pyarrow.ArrowInvalid, TypeError, OSError) as e:
        # OSError also covers PermissionError and FileNotFoundError.
        try:
            return ds.dataset(file).count_rows()
        except Exception as e2:
            if not hasattr(file, "glob"):
                # The cause must be an exception instance (or None); chaining
                # to the first failure keeps both errors in the traceback.
                raise e2 from e

            def _read_one_count(path):
                with path.open("rb") as handle:
                    return pq.read_metadata(handle).num_rows

            with ThreadPoolExecutor() as executor:
                return sum(executor.map(_read_one_count, file.glob("**").files))
|
|
1558
|
+
|
|
1559
|
+
|
|
1560
|
+
def get_shape(file) -> tuple[int, int]:
    """Get the number of rows and the number of non-index columns.

    Parameters
    ----------
    file: Object accepted by both ``get_schema`` and ``get_num_rows``.

    Returns
    -------
    Tuple of (number of rows, number of columns), where schema fields
    reported by ``_get_index_cols`` are not counted as columns.
    """
    schema = get_schema(file)
    index_names = _get_index_cols(schema)
    data_columns = [field for field in schema.names if field not in index_names]
    n_rows: int = get_num_rows(file)
    return n_rows, len(data_columns)
|
|
1566
|
+
|
|
1567
|
+
|
|
1461
1568
|
def read_nrows(file, nrow: int) -> pd.DataFrame:
|
|
1462
1569
|
"""Read first n rows of a parquet file."""
|
|
1463
1570
|
rows = next(pq.ParquetFile(file).iter_batches(nrow))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|