daplapath 2.0.2__tar.gz → 2.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {daplapath-2.0.2 → daplapath-2.0.5}/PKG-INFO +1 -1
- {daplapath-2.0.2 → daplapath-2.0.5}/daplapath/path.py +181 -70
- {daplapath-2.0.2 → daplapath-2.0.5}/pyproject.toml +1 -1
- {daplapath-2.0.2 → daplapath-2.0.5}/LICENSE.md +0 -0
- {daplapath-2.0.2 → daplapath-2.0.5}/README.md +0 -0
- {daplapath-2.0.2 → daplapath-2.0.5}/daplapath/__init__.py +0 -0
|
@@ -14,8 +14,8 @@ import shutil
|
|
|
14
14
|
from typing import Callable, Any
|
|
15
15
|
import inspect
|
|
16
16
|
import itertools
|
|
17
|
-
import warnings
|
|
18
17
|
|
|
18
|
+
from fsspec.spec import AbstractFileSystem
|
|
19
19
|
import datetime
|
|
20
20
|
import numpy as np
|
|
21
21
|
import pandas as pd
|
|
@@ -23,15 +23,16 @@ import pandas.io.formats.format as fmt
|
|
|
23
23
|
from pandas.api.types import is_dict_like
|
|
24
24
|
import pyarrow
|
|
25
25
|
import pyarrow.parquet as pq
|
|
26
|
+
import pyarrow.dataset as ds
|
|
27
|
+
|
|
26
28
|
|
|
27
29
|
try:
|
|
28
30
|
import gcsfs
|
|
29
31
|
except ImportError:
|
|
30
32
|
pass
|
|
31
33
|
|
|
32
|
-
|
|
33
34
|
# regex with the prefix '_v' followed by an integer of any length
|
|
34
|
-
VERSION_PATTERN = r"_v(\d+)"
|
|
35
|
+
VERSION_PATTERN = r"_v(\d+)\."
|
|
35
36
|
VERSION_PREFIX = "_v"
|
|
36
37
|
|
|
37
38
|
# regex with the prefix '_p' followed by four length integer (year) and OPTIONALLY month and date, separated by '-'
|
|
@@ -48,15 +49,15 @@ class Config:
|
|
|
48
49
|
file_system: Callable
|
|
49
50
|
|
|
50
51
|
|
|
51
|
-
class LocalFileSystem:
|
|
52
|
+
class LocalFileSystem(AbstractFileSystem):
|
|
52
53
|
"""Mimicks GCS's FileSystem but using standard library (os, glob, shutil)."""
|
|
53
54
|
|
|
54
55
|
@staticmethod
|
|
55
56
|
def glob(
|
|
56
57
|
path: str,
|
|
57
|
-
recursive: bool = True,
|
|
58
58
|
detail: bool = False,
|
|
59
|
-
|
|
59
|
+
recursive: bool = True,
|
|
60
|
+
include_hidden: bool = True,
|
|
60
61
|
**kwargs,
|
|
61
62
|
) -> list[dict] | list[str]:
|
|
62
63
|
relevant_paths = glob.iglob(
|
|
@@ -68,14 +69,16 @@ class LocalFileSystem:
|
|
|
68
69
|
with ThreadPoolExecutor() as executor:
|
|
69
70
|
return list(executor.map(get_file_info, relevant_paths))
|
|
70
71
|
|
|
72
|
+
@classmethod
def ls(cls, path: str, detail: bool = False, **kwargs):
    """List the contents of a directory.

    Builds the pattern ``<path>/**`` and passes it to ``glob`` on a fresh
    instance of the class, with ``recursive=False``.

    Parameters
    ----------
    path: Directory path.
    detail: Passed on to ``glob``.
    **kwargs: Additional keyword arguments passed on to ``glob``.

    Returns
    -------
    Whatever ``glob`` returns for the pattern.
    """
    # NOTE(review): with recursive=False, '**' presumably matches like '*',
    # i.e. only the direct children of 'path' - confirm against 'glob'.
    pattern = str(pathlib.Path(path) / "**")
    file_system = cls()
    return file_system.glob(pattern, detail=detail, recursive=False, **kwargs)
|
|
77
|
+
|
|
71
78
|
@staticmethod
|
|
72
79
|
def info(path) -> dict[str, Any]:
|
|
73
80
|
return get_file_info(path)
|
|
74
81
|
|
|
75
|
-
@staticmethod
|
|
76
|
-
def isdir(path: str) -> bool:
|
|
77
|
-
return os.path.isdir(path)
|
|
78
|
-
|
|
79
82
|
@staticmethod
|
|
80
83
|
def open(path: str, *args, **kwargs) -> io.TextIOWrapper:
|
|
81
84
|
return open(path, *args, **kwargs)
|
|
@@ -88,8 +91,12 @@ class LocalFileSystem:
|
|
|
88
91
|
def mv(source: str, destination, **kwargs) -> str:
|
|
89
92
|
return shutil.move(source, destination, **kwargs)
|
|
90
93
|
|
|
94
|
+
@classmethod
def cp(cls, source: str, destination, **kwargs) -> str:
    """Copy ``source`` to ``destination``.

    Thin wrapper that forwards all arguments to ``cp_file`` on the class.

    Parameters
    ----------
    source: Path of the file to copy.
    destination: Path to copy the file to.
    **kwargs: Additional keyword arguments passed on to ``cp_file``.

    Returns
    -------
    Whatever ``cp_file`` returns.
    """
    copy_single_file = cls.cp_file
    return copy_single_file(source, destination, **kwargs)
|
|
97
|
+
|
|
91
98
|
@staticmethod
def cp_file(path1, path2, **kwargs):
    """Copy a single file, creating the destination's parent directories.

    The parameters are named ``path1``/``path2`` and the body uses those same
    names. As a static method it takes no ``self``, so it can be called both
    as ``cls.cp_file(src, dst)`` and as ``instance.cp_file(src, dst)``.

    Parameters
    ----------
    path1: Path of the file to copy.
    path2: Path to copy the file to.
    **kwargs: Additional keyword arguments passed on to ``shutil.copy2``.

    Returns
    -------
    The return value of ``shutil.copy2`` (the destination path).
    """
    # Make sure the target directory exists before copying.
    os.makedirs(pathlib.Path(path2).parent, exist_ok=True)
    return shutil.copy2(path1, path2, **kwargs)
|
|
95
102
|
|
|
@@ -97,6 +104,14 @@ class LocalFileSystem:
|
|
|
97
104
|
def rm_file(path: str, *args, **kwargs) -> None:
|
|
98
105
|
return os.remove(path, *args, **kwargs)
|
|
99
106
|
|
|
107
|
+
@staticmethod
def rmdir(path: str, *args, **kwargs) -> None:
    """Remove a directory tree.

    Delegates to ``shutil.rmtree``, so the directory is removed together
    with everything inside it.

    Parameters
    ----------
    path: Directory path.
    *args: Additional positional arguments passed on to ``shutil.rmtree``.
    **kwargs: Additional keyword arguments passed on to ``shutil.rmtree``.
    """
    outcome = shutil.rmtree(path, *args, **kwargs)
    return outcome
|
|
110
|
+
|
|
111
|
+
@staticmethod
def makedirs(path: str, exist_ok: bool = False) -> None:
    """Create a directory and any missing parent directories.

    Delegates to ``os.makedirs``.

    Parameters
    ----------
    path: Directory path to create.
    exist_ok: Passed on to ``os.makedirs``. Defaults to False.
    """
    outcome = os.makedirs(path, exist_ok=exist_ok)
    return outcome
|
|
114
|
+
|
|
100
115
|
|
|
101
116
|
class GCSFileSystem(gcsfs.GCSFileSystem):
|
|
102
117
|
def isdir(self, path: str) -> bool:
|
|
@@ -111,9 +126,6 @@ else:
|
|
|
111
126
|
_config = Config(LocalFileSystem)
|
|
112
127
|
|
|
113
128
|
|
|
114
|
-
gcsfs.GCSFileSystem.isdir
|
|
115
|
-
|
|
116
|
-
|
|
117
129
|
class Tree:
|
|
118
130
|
"""Stores text to be printed/displayed in directory tree format.
|
|
119
131
|
|
|
@@ -167,6 +179,17 @@ class _PathBase:
|
|
|
167
179
|
class Path(str, _PathBase):
|
|
168
180
|
"""Path object that works like a string, with methods for working with the GCS file system."""
|
|
169
181
|
|
|
182
|
+
_file_system_attrs: set[str] = {
|
|
183
|
+
"info",
|
|
184
|
+
"isdir",
|
|
185
|
+
"open",
|
|
186
|
+
"exists",
|
|
187
|
+
"mv",
|
|
188
|
+
"cp",
|
|
189
|
+
"rm_file",
|
|
190
|
+
"rmdir",
|
|
191
|
+
}
|
|
192
|
+
|
|
170
193
|
@property
|
|
171
194
|
def _iterable_type(self) -> type | Callable:
|
|
172
195
|
"""Can be overridden in subclass."""
|
|
@@ -191,6 +214,19 @@ class Path(str, _PathBase):
|
|
|
191
214
|
obj._file_system = None
|
|
192
215
|
return obj
|
|
193
216
|
|
|
217
|
+
@property
def local_path(self) -> "Path":
    """The path translated to its location under the '/buckets' mount.

    Paths that already start with '/buckets' are returned unchanged.
    Otherwise the bucket name is derived from the first path part: the
    text after the last '-data-' and before the first '-prod'. The
    remaining parts are appended below '/buckets/<bucket>'.

    A path consisting of the bucket part only gives '/buckets/<bucket>'
    without a trailing slash.

    Returns
    -------
    A new path object of the same class, or the path itself if it is
    already a '/buckets' path.

    Raises
    ------
    IndexError: If the path has no parts.
    """
    if self.startswith("/buckets"):
        return self

    parts = self.parts
    root = parts[0]
    bucket = root.split("-data-")[-1].split("-prod")[0]

    # Joining a list avoids the trailing '/' when there are no further parts.
    return self.__class__("/".join(["/buckets", bucket, *parts[1:]]))
|
|
229
|
+
|
|
194
230
|
def tree(
|
|
195
231
|
self,
|
|
196
232
|
max_rows: int | None = 3,
|
|
@@ -269,11 +305,16 @@ class Path(str, _PathBase):
|
|
|
269
305
|
"""Returns a PathSeries of all versions of the file."""
|
|
270
306
|
files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
|
|
271
307
|
|
|
308
|
+
if self.version_number:
|
|
309
|
+
start, _, end = re.split(self._version_pattern, self)
|
|
310
|
+
else:
|
|
311
|
+
start, end = self.stem, self.suffix
|
|
312
|
+
|
|
272
313
|
# create boolean mask. With numpy to make it work with both pandas and list
|
|
273
314
|
arr = np.array(files_in_folder)
|
|
274
|
-
is_version_of_this_file = (
|
|
275
|
-
|
|
276
|
-
)
|
|
315
|
+
is_version_of_this_file = (np_str_contains(arr, start)) & (
|
|
316
|
+
np_str_endswith(arr, end)
|
|
317
|
+
)
|
|
277
318
|
if not include_versionless:
|
|
278
319
|
is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
|
|
279
320
|
|
|
@@ -505,13 +546,18 @@ class Path(str, _PathBase):
|
|
|
505
546
|
@property
|
|
506
547
|
def versionless_stem(self) -> str:
|
|
507
548
|
"""Return the file stem before the version pattern."""
|
|
508
|
-
return
|
|
549
|
+
return self.__class__(re.split(self._version_pattern, self._path.name)[0]).stem
|
|
509
550
|
|
|
510
551
|
@property
|
|
511
552
|
def parent(self) -> "Path":
|
|
512
553
|
"""Parent path."""
|
|
513
554
|
return self.__class__(self._path.parent)
|
|
514
555
|
|
|
556
|
+
@property
def parents(self) -> "list[Path]":
    """All ancestor paths, as a list of path objects of the same class.

    The order is the one given by the ``parents`` attribute of the
    underlying ``_path`` object.
    """
    return list(map(self.__class__, self._path.parents))
|
|
560
|
+
|
|
515
561
|
@property
|
|
516
562
|
def name(self) -> str:
|
|
517
563
|
"""Final part of the path."""
|
|
@@ -538,52 +584,48 @@ class Path(str, _PathBase):
|
|
|
538
584
|
|
|
539
585
|
@property
|
|
540
586
|
def index_column_names(self) -> list[str]:
|
|
541
|
-
|
|
542
|
-
try:
|
|
543
|
-
schema = pq.read_schema(file)
|
|
544
|
-
return _get_index_cols(schema)
|
|
545
|
-
except KeyError:
|
|
546
|
-
return read_nrows(file, 1).index.names
|
|
587
|
+
return _get_index_cols(self.schema)
|
|
547
588
|
|
|
548
589
|
@property
|
|
549
590
|
def columns(self) -> pd.Index:
|
|
550
591
|
"""Columns of the file."""
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
592
|
+
schema = self.schema
|
|
593
|
+
try:
|
|
594
|
+
names = [
|
|
595
|
+
x["field_name"]
|
|
596
|
+
for x in json.loads(schema.metadata[b"pandas"].decode())["columns"]
|
|
597
|
+
]
|
|
598
|
+
except (KeyError, TypeError):
|
|
599
|
+
names = schema.names
|
|
600
|
+
index_cols = _get_index_cols(schema)
|
|
601
|
+
return pd.Index(names).difference(index_cols)
|
|
558
602
|
|
|
559
603
|
@property
|
|
560
604
|
def schema(self) -> pyarrow.Schema:
|
|
561
605
|
"""Date types of the file's columns."""
|
|
562
|
-
|
|
563
|
-
|
|
606
|
+
try:
|
|
607
|
+
with self.open("rb") as file:
|
|
608
|
+
return get_schema(file)
|
|
609
|
+
except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
610
|
+
return get_schema(self)
|
|
564
611
|
|
|
565
612
|
@property
|
|
566
613
|
def dtypes(self) -> pd.Series:
|
|
567
614
|
"""Date types of the file's columns."""
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
lambda x: ~x.index.isin(index_cols)
|
|
574
|
-
]
|
|
575
|
-
except KeyError:
|
|
576
|
-
return read_nrows(file, 1).dtypes
|
|
615
|
+
schema = self.schema
|
|
616
|
+
index_cols = _get_index_cols(schema)
|
|
617
|
+
return pd.Series(schema.types, index=schema.names).loc[
|
|
618
|
+
lambda x: ~x.index.isin(index_cols)
|
|
619
|
+
]
|
|
577
620
|
|
|
578
621
|
@property
|
|
579
622
|
def shape(self) -> tuple[int, int]:
|
|
580
623
|
"""Number of rows and columns."""
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
return read_nrows(file, 1).shape
|
|
624
|
+
try:
|
|
625
|
+
with self.open("rb") as file:
|
|
626
|
+
return get_shape(file)
|
|
627
|
+
except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
628
|
+
return get_shape(self)
|
|
587
629
|
|
|
588
630
|
@property
|
|
589
631
|
def nrow(self) -> int:
|
|
@@ -639,16 +681,18 @@ class Path(str, _PathBase):
|
|
|
639
681
|
|
|
640
682
|
@property
|
|
641
683
|
def partition_root(self) -> "Path":
|
|
684
|
+
if ".parquet" not in self:
|
|
685
|
+
return self
|
|
642
686
|
return self.split(".parquet")[0] + ".parquet"
|
|
643
687
|
|
|
644
|
-
def
|
|
645
|
-
|
|
646
|
-
return self.file_system.isdir(self)
|
|
647
|
-
except AttributeError:
|
|
648
|
-
return self.file_system.is_dir(self)
|
|
688
|
+
def isfile(self) -> bool:
    """Whether the path is a file, defined as not being a directory.

    NOTE(review): anything ``isdir`` reports as not a directory counts as a
    file here - presumably including paths that do not exist; confirm.
    """
    if self.isdir():
        return False
    return True
|
|
649
690
|
|
|
650
691
|
def is_file(self) -> bool:
|
|
651
|
-
return
|
|
692
|
+
return self.isfile()
|
|
693
|
+
|
|
694
|
+
def is_dir(self) -> bool:
    """Alias of ``isdir``: whether the path is a directory."""
    is_directory = self.isdir()
    return is_directory
|
|
652
696
|
|
|
653
697
|
def with_suffix(self, suffix: str):
|
|
654
698
|
return self.__class__(self._path.with_suffix(suffix))
|
|
@@ -717,21 +761,15 @@ class Path(str, _PathBase):
|
|
|
717
761
|
error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
|
|
718
762
|
if attr.startswith("_"):
|
|
719
763
|
raise AttributeError(error_message)
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
raise AttributeError(error_message) from e
|
|
764
|
+
if attr not in self._file_system_attrs:
|
|
765
|
+
raise AttributeError(error_message)
|
|
766
|
+
return functools.partial(getattr(self.file_system, attr), self)
|
|
724
767
|
|
|
725
768
|
def __fspath__(self) -> str:
|
|
726
769
|
return str(self)
|
|
727
770
|
|
|
728
771
|
def __dir__(self) -> list[str]:
|
|
729
|
-
return list(
|
|
730
|
-
sorted(
|
|
731
|
-
{x for x in dir(Path)}
|
|
732
|
-
| {x for x in dir(self._file_system) if not x.startswith("_")}
|
|
733
|
-
)
|
|
734
|
-
)
|
|
772
|
+
return list(sorted({x for x in dir(Path)} | self._file_system_attrs))
|
|
735
773
|
|
|
736
774
|
def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
|
|
737
775
|
series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
|
|
@@ -859,6 +897,12 @@ class PathSeries(pd.Series, _PathBase):
|
|
|
859
897
|
def partition_root(self) -> "PathSeries":
|
|
860
898
|
return self.files.apply(lambda x: x.partition_root).drop_duplicates()
|
|
861
899
|
|
|
900
|
+
@property
def partitioned_files(self) -> "PathSeries":
    """Partition roots of the files whose path contains '.parquet' exactly twice.

    Selects from ``self.files`` the paths where the pattern ``\\.parquet``
    occurs two times, then returns their ``partition_root`` values with
    duplicates dropped.
    """
    files = self.files
    has_two_parquet_parts = files.str.count(r"\.parquet") == 2
    return files.loc[has_two_parquet_parts].partition_root.drop_duplicates()
|
|
905
|
+
|
|
862
906
|
@property
|
|
863
907
|
def dirs(self) -> "PathSeries":
|
|
864
908
|
"""Select only the directories in the Series."""
|
|
@@ -1214,18 +1258,18 @@ def split_path_and_make_copyable_html(
|
|
|
1214
1258
|
split: str | None = "/",
|
|
1215
1259
|
display_prefix: str | None = ".../",
|
|
1216
1260
|
) -> str:
|
|
1217
|
-
"""Get
|
|
1261
|
+
"""Get HTML text that displays the last part, but makes the full path copyable to clipboard.
|
|
1218
1262
|
|
|
1219
|
-
Splits the path on a delimiter and creates an
|
|
1263
|
+
Splits the path on a delimiter and creates an HTML string that displays only the
|
|
1220
1264
|
last part, but adds a hyperlink which copies the full path to clipboard when clicked.
|
|
1221
1265
|
|
|
1222
1266
|
Parameters
|
|
1223
1267
|
----------
|
|
1224
1268
|
path: File or directory path
|
|
1225
|
-
max_parts: Maximum number of path
|
|
1269
|
+
max_parts: Maximum number of path parts to display. Defaults to 2,
|
|
1226
1270
|
meaning the two last parts. Set to None to show full paths.
|
|
1227
1271
|
split: Text pattern to split the path on. Defaults to "/".
|
|
1228
|
-
display_prefix: The text to display instead of the parent directory. Defaults to ".../"
|
|
1272
|
+
display_prefix: The text to display instead of the parent directory. Defaults to ".../".
|
|
1229
1273
|
|
|
1230
1274
|
Returns
|
|
1231
1275
|
-------
|
|
@@ -1233,7 +1277,8 @@ def split_path_and_make_copyable_html(
|
|
|
1233
1277
|
"""
|
|
1234
1278
|
|
|
1235
1279
|
copy_to_clipboard_js = f"""<script>
|
|
1236
|
-
function copyToClipboard(text) {{
|
|
1280
|
+
function copyToClipboard(text, event) {{
|
|
1281
|
+
event.preventDefault();
|
|
1237
1282
|
navigator.clipboard.writeText(text)
|
|
1238
1283
|
.then(() => {{
|
|
1239
1284
|
const alertBox = document.createElement('div');
|
|
@@ -1267,7 +1312,7 @@ function copyToClipboard(text) {{
|
|
|
1267
1312
|
else:
|
|
1268
1313
|
displayed_text = path
|
|
1269
1314
|
|
|
1270
|
-
return f'{copy_to_clipboard_js}<a href="
|
|
1315
|
+
return f'{copy_to_clipboard_js}<a href="#" title="{path}" onclick="copyToClipboard(\'{path}\', event)">{displayed_text}</a>'
|
|
1271
1316
|
|
|
1272
1317
|
|
|
1273
1318
|
def _get_default_multi_index() -> pd.MultiIndex:
|
|
@@ -1454,6 +1499,72 @@ def get_arguments(func: Callable | object) -> list[str]:
|
|
|
1454
1499
|
)
|
|
1455
1500
|
|
|
1456
1501
|
|
|
1502
|
+
def get_schema(file) -> pyarrow.Schema:
    """Read the pyarrow schema of a parquet file or a directory of parquet files.

    First tries ``pq.read_schema(file)``. If that fails and 'file' has a
    'file_system' attribute, the schema of every file matching
    ``<file>/**/*.parquet`` is read in parallel threads and the schemas are
    merged with ``pyarrow.unify_schemas(..., promote_options="permissive")``.

    Parameters
    ----------
    file: Path or opened file object to read the schema from.

    Returns
    -------
    The schema of the file, or the unified schema of the parquet files
    found below 'file'.

    Raises
    ------
    The original read error if reading fails and 'file' has no
    'file_system' attribute to glob with.
    """
    try:
        return pq.read_schema(file)
    except (pyarrow.ArrowInvalid, OSError):
        # OSError also covers PermissionError, FileNotFoundError and
        # IsADirectoryError.
        if not hasattr(file, "file_system"):
            # Re-raise the error currently being handled, traceback intact.
            raise

        file_system = file.file_system

        def _get_schema(path):
            """Read one file's schema, opening it via the file system if needed."""
            try:
                return pq.read_schema(path)
            except FileNotFoundError:
                with file_system.open(path, "rb") as f:
                    return pq.read_schema(f)

        with ThreadPoolExecutor() as executor:
            schemas = list(
                executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
            )
        return pyarrow.unify_schemas(schemas, promote_options="permissive")
|
|
1534
|
+
|
|
1535
|
+
|
|
1536
|
+
def get_num_rows(file) -> int:
    """Get the number of rows of a parquet file or a directory of parquet files.

    Tries, in order:

    1. ``pq.read_metadata(file).num_rows``
    2. ``ds.dataset(file).count_rows()``
    3. If 'file' has a 'glob' method: the sum of the row counts of the
       files in ``file.glob("**").files``, read in parallel threads.

    Parameters
    ----------
    file: Path or opened file object.

    Returns
    -------
    The number of rows.

    Raises
    ------
    The error from step 2, chained to the error from step 1, if both fail
    and 'file' has no 'glob' method.
    """
    try:
        return pq.read_metadata(file).num_rows
    except (pyarrow.ArrowInvalid, TypeError, OSError) as metadata_error:
        # OSError also covers PermissionError and FileNotFoundError.
        try:
            return ds.dataset(file).count_rows()
        except Exception as dataset_error:
            if not hasattr(file, "glob"):
                # Chain to the first failure so both errors show in the traceback.
                raise dataset_error from metadata_error

        def _get_num_rows(path):
            """Read the row count from the metadata of a single file."""
            with path.open("rb") as opened_file:
                return pq.read_metadata(opened_file).num_rows

        with ThreadPoolExecutor() as executor:
            return sum(executor.map(_get_num_rows, file.glob("**").files))
|
|
1558
|
+
|
|
1559
|
+
|
|
1560
|
+
def get_shape(file) -> tuple[int, int]:
    """Get the number of rows and columns of a parquet file.

    Columns reported by ``_get_index_cols`` for the schema are not counted.

    Parameters
    ----------
    file: Path or opened file object, passed to ``get_schema`` and
        ``get_num_rows``.

    Returns
    -------
    Tuple of (number of rows, number of non-index columns).
    """
    schema = get_schema(file)
    index_names = _get_index_cols(schema)
    data_columns = [name for name in schema.names if name not in index_names]
    return get_num_rows(file), len(data_columns)
|
|
1566
|
+
|
|
1567
|
+
|
|
1457
1568
|
def read_nrows(file, nrow: int) -> pd.DataFrame:
|
|
1458
1569
|
"""Read first n rows of a parquet file."""
|
|
1459
1570
|
rows = next(pq.ParquetFile(file).iter_batches(nrow))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|