daplapath 2.0.1__tar.gz → 2.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: daplapath
3
- Version: 2.0.1
3
+ Version: 2.0.3
4
4
  Summary: A pathlib.Path class for dapla
5
5
  License: MIT
6
6
  Author: ort
@@ -29,9 +29,8 @@ try:
29
29
  except ImportError:
30
30
  pass
31
31
 
32
-
33
32
  # regex with the prefix '_v' followed by an integer of any length
34
- VERSION_PATTERN = r"_v(\d+)"
33
+ VERSION_PATTERN = r"_v(\d+)\."
35
34
  VERSION_PREFIX = "_v"
36
35
 
37
36
  # regex with the prefix '_p' followed by a four-digit integer (year) and OPTIONALLY month and day, separated by '-'
@@ -51,12 +50,11 @@ class Config:
51
50
  class LocalFileSystem:
52
51
  """Mimicks GCS's FileSystem but using standard library (os, glob, shutil)."""
53
52
 
54
- @classmethod
53
+ @staticmethod
55
54
  def glob(
56
- cls,
57
55
  path: str,
58
56
  recursive: bool = True,
59
- detail: bool = True,
57
+ detail: bool = False,
60
58
  include_hidden: bool = False,
61
59
  **kwargs,
62
60
  ) -> list[dict] | list[str]:
@@ -65,7 +63,7 @@ class LocalFileSystem:
65
63
  )
66
64
 
67
65
  if not detail:
68
- return relevant_paths
66
+ return list(relevant_paths)
69
67
  with ThreadPoolExecutor() as executor:
70
68
  return list(executor.map(get_file_info, relevant_paths))
71
69
 
@@ -270,11 +268,16 @@ class Path(str, _PathBase):
270
268
  """Returns a PathSeries of all versions of the file."""
271
269
  files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
272
270
 
271
+ if self.version_number:
272
+ start, _, end = re.split(self._version_pattern, self)
273
+ else:
274
+ start, end = self.stem, self.suffix
275
+
273
276
  # create boolean mask. With numpy to make it work with both pandas and list
274
277
  arr = np.array(files_in_folder)
275
- is_version_of_this_file = (
276
- np_str_contains(arr, self.versionless_stem)
277
- ) & np_str_endswith(arr, self.suffix)
278
+ is_version_of_this_file = (np_str_contains(arr, start)) & (
279
+ np_str_endswith(arr, end)
280
+ )
278
281
  if not include_versionless:
279
282
  is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
280
283
 
@@ -428,6 +431,8 @@ class Path(str, _PathBase):
428
431
  """
429
432
  if not isinstance(period, (str, int)):
430
433
  raise TypeError(f"'period' should be string or int. Got {type(period)}")
434
+ if str(period) == self.period:
435
+ return self
431
436
  return self.with_periods(period)
432
437
 
433
438
  def with_periods(self, from_period: str, to_period: str | None = None) -> "Path":
@@ -464,7 +469,9 @@ class Path(str, _PathBase):
464
469
  parent = f"{self.parent}/" if self.parent != "." else ""
465
470
 
466
471
  return self.__class__(
467
- f"{parent}{stem}{period_string}{version_string}{self.suffix}"
472
+ f"{parent}{stem}{period_string}{version_string}{self.suffix}".replace(
473
+ "".join(self.periods), period_string.strip(self._period_prefix)
474
+ )
468
475
  )
469
476
 
470
477
  @property
@@ -502,7 +509,7 @@ class Path(str, _PathBase):
502
509
  @property
503
510
  def versionless_stem(self) -> str:
504
511
  """Return the file stem before the version pattern."""
505
- return str(re.sub(self._version_pattern, "", self._path.stem))
512
+ return self.__class__(re.split(self._version_pattern, self._path.name)[0]).stem
506
513
 
507
514
  @property
508
515
  def parent(self) -> "Path":
@@ -553,6 +560,12 @@ class Path(str, _PathBase):
553
560
  except KeyError:
554
561
  return read_nrows(file, 1).columns
555
562
 
563
+ @property
564
+ def schema(self) -> pyarrow.Schema:
565
+ """Date types of the file's columns."""
566
+ with self.open("rb") as file:
567
+ return pq.read_schema(file)
568
+
556
569
  @property
557
570
  def dtypes(self) -> pd.Series:
558
571
  """Date types of the file's columns."""
@@ -628,6 +641,10 @@ class Path(str, _PathBase):
628
641
  """File size in terrabytes."""
629
642
  return self.kb / 1_000_000_000
630
643
 
644
+ @property
645
+ def partition_root(self) -> "Path":
646
+ return self.split(".parquet")[0] + ".parquet"
647
+
631
648
  def is_dir(self) -> bool:
632
649
  try:
633
650
  return self.file_system.isdir(self)
@@ -842,6 +859,10 @@ class PathSeries(pd.Series, _PathBase):
842
859
  """Select only the files in the Series."""
843
860
  return self[self.is_file()]
844
861
 
862
+ @property
863
+ def partition_root(self) -> "PathSeries":
864
+ return self.files.apply(lambda x: x.partition_root).drop_duplicates()
865
+
845
866
  @property
846
867
  def dirs(self) -> "PathSeries":
847
868
  """Select only the directories in the Series."""
@@ -1469,6 +1490,10 @@ def as_str(obj) -> str:
1469
1490
  raise TypeError(type(obj))
1470
1491
 
1471
1492
 
1493
+ def paths_are_equal(path1: Path | str, path2: Path | str) -> bool:
1494
+ return Path(path1).parts == Path(path2).parts
1495
+
1496
+
1472
1497
  def sort_by_period(paths: Iterable[str]) -> Iterable[str]:
1473
1498
  try:
1474
1499
  periods = [pd.Timestamp(path.period) for path in paths]
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "daplapath"
3
- version = "2.0.1"
3
+ version = "2.0.3"
4
4
  description = "A pathlib.Path class for dapla"
5
5
  authors = ["ort <ort@ssb.no>"]
6
6
  license = "MIT"
File without changes
File without changes