daplapath 2.0.1__tar.gz → 2.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {daplapath-2.0.1 → daplapath-2.0.3}/PKG-INFO +1 -1
- {daplapath-2.0.1 → daplapath-2.0.3}/daplapath/path.py +36 -11
- {daplapath-2.0.1 → daplapath-2.0.3}/pyproject.toml +1 -1
- {daplapath-2.0.1 → daplapath-2.0.3}/LICENSE.md +0 -0
- {daplapath-2.0.1 → daplapath-2.0.3}/README.md +0 -0
- {daplapath-2.0.1 → daplapath-2.0.3}/daplapath/__init__.py +0 -0
|
@@ -29,9 +29,8 @@ try:
|
|
|
29
29
|
except ImportError:
|
|
30
30
|
pass
|
|
31
31
|
|
|
32
|
-
|
|
33
32
|
# regex with the prefix '_v' followed by an integer of any length
|
|
34
|
-
VERSION_PATTERN = r"_v(\d+)"
|
|
33
|
+
VERSION_PATTERN = r"_v(\d+)\."
|
|
35
34
|
VERSION_PREFIX = "_v"
|
|
36
35
|
|
|
37
36
|
# regex with the prefix '_p' followed by four length integer (year) and OPTIONALLY month and date, separated by '-'
|
|
@@ -51,12 +50,11 @@ class Config:
|
|
|
51
50
|
class LocalFileSystem:
|
|
52
51
|
"""Mimicks GCS's FileSystem but using standard library (os, glob, shutil)."""
|
|
53
52
|
|
|
54
|
-
@
|
|
53
|
+
@staticmethod
|
|
55
54
|
def glob(
|
|
56
|
-
cls,
|
|
57
55
|
path: str,
|
|
58
56
|
recursive: bool = True,
|
|
59
|
-
detail: bool =
|
|
57
|
+
detail: bool = False,
|
|
60
58
|
include_hidden: bool = False,
|
|
61
59
|
**kwargs,
|
|
62
60
|
) -> list[dict] | list[str]:
|
|
@@ -65,7 +63,7 @@ class LocalFileSystem:
|
|
|
65
63
|
)
|
|
66
64
|
|
|
67
65
|
if not detail:
|
|
68
|
-
return relevant_paths
|
|
66
|
+
return list(relevant_paths)
|
|
69
67
|
with ThreadPoolExecutor() as executor:
|
|
70
68
|
return list(executor.map(get_file_info, relevant_paths))
|
|
71
69
|
|
|
@@ -270,11 +268,16 @@ class Path(str, _PathBase):
|
|
|
270
268
|
"""Returns a PathSeries of all versions of the file."""
|
|
271
269
|
files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
|
|
272
270
|
|
|
271
|
+
if self.version_number:
|
|
272
|
+
start, _, end = re.split(self._version_pattern, self)
|
|
273
|
+
else:
|
|
274
|
+
start, end = self.stem, self.suffix
|
|
275
|
+
|
|
273
276
|
# create boolean mask. With numpy to make it work with both pandas and list
|
|
274
277
|
arr = np.array(files_in_folder)
|
|
275
|
-
is_version_of_this_file = (
|
|
276
|
-
|
|
277
|
-
)
|
|
278
|
+
is_version_of_this_file = (np_str_contains(arr, start)) & (
|
|
279
|
+
np_str_endswith(arr, end)
|
|
280
|
+
)
|
|
278
281
|
if not include_versionless:
|
|
279
282
|
is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
|
|
280
283
|
|
|
@@ -428,6 +431,8 @@ class Path(str, _PathBase):
|
|
|
428
431
|
"""
|
|
429
432
|
if not isinstance(period, (str, int)):
|
|
430
433
|
raise TypeError(f"'period' should be string or int. Got {type(period)}")
|
|
434
|
+
if str(period) == self.period:
|
|
435
|
+
return self
|
|
431
436
|
return self.with_periods(period)
|
|
432
437
|
|
|
433
438
|
def with_periods(self, from_period: str, to_period: str | None = None) -> "Path":
|
|
@@ -464,7 +469,9 @@ class Path(str, _PathBase):
|
|
|
464
469
|
parent = f"{self.parent}/" if self.parent != "." else ""
|
|
465
470
|
|
|
466
471
|
return self.__class__(
|
|
467
|
-
f"{parent}{stem}{period_string}{version_string}{self.suffix}"
|
|
472
|
+
f"{parent}{stem}{period_string}{version_string}{self.suffix}".replace(
|
|
473
|
+
"".join(self.periods), period_string.strip(self._period_prefix)
|
|
474
|
+
)
|
|
468
475
|
)
|
|
469
476
|
|
|
470
477
|
@property
|
|
@@ -502,7 +509,7 @@ class Path(str, _PathBase):
|
|
|
502
509
|
@property
|
|
503
510
|
def versionless_stem(self) -> str:
|
|
504
511
|
"""Return the file stem before the version pattern."""
|
|
505
|
-
return
|
|
512
|
+
return self.__class__(re.split(self._version_pattern, self._path.name)[0]).stem
|
|
506
513
|
|
|
507
514
|
@property
|
|
508
515
|
def parent(self) -> "Path":
|
|
@@ -553,6 +560,12 @@ class Path(str, _PathBase):
|
|
|
553
560
|
except KeyError:
|
|
554
561
|
return read_nrows(file, 1).columns
|
|
555
562
|
|
|
563
|
+
@property
|
|
564
|
+
def schema(self) -> pyarrow.Schema:
|
|
565
|
+
"""Date types of the file's columns."""
|
|
566
|
+
with self.open("rb") as file:
|
|
567
|
+
return pq.read_schema(file)
|
|
568
|
+
|
|
556
569
|
@property
|
|
557
570
|
def dtypes(self) -> pd.Series:
|
|
558
571
|
"""Date types of the file's columns."""
|
|
@@ -628,6 +641,10 @@ class Path(str, _PathBase):
|
|
|
628
641
|
"""File size in terrabytes."""
|
|
629
642
|
return self.kb / 1_000_000_000
|
|
630
643
|
|
|
644
|
+
@property
|
|
645
|
+
def partition_root(self) -> "Path":
|
|
646
|
+
return self.split(".parquet")[0] + ".parquet"
|
|
647
|
+
|
|
631
648
|
def is_dir(self) -> bool:
|
|
632
649
|
try:
|
|
633
650
|
return self.file_system.isdir(self)
|
|
@@ -842,6 +859,10 @@ class PathSeries(pd.Series, _PathBase):
|
|
|
842
859
|
"""Select only the files in the Series."""
|
|
843
860
|
return self[self.is_file()]
|
|
844
861
|
|
|
862
|
+
@property
|
|
863
|
+
def partition_root(self) -> "PathSeries":
|
|
864
|
+
return self.files.apply(lambda x: x.partition_root).drop_duplicates()
|
|
865
|
+
|
|
845
866
|
@property
|
|
846
867
|
def dirs(self) -> "PathSeries":
|
|
847
868
|
"""Select only the directories in the Series."""
|
|
@@ -1469,6 +1490,10 @@ def as_str(obj) -> str:
|
|
|
1469
1490
|
raise TypeError(type(obj))
|
|
1470
1491
|
|
|
1471
1492
|
|
|
1493
|
+
def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
|
|
1494
|
+
return Path(path1).parts == Path(path2).parts
|
|
1495
|
+
|
|
1496
|
+
|
|
1472
1497
|
def sort_by_period(paths: Iterable[str]) -> Iterable[str]:
|
|
1473
1498
|
try:
|
|
1474
1499
|
periods = [pd.Timestamp(path.period) for path in paths]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|