daplapath 2.0.2.tar.gz → 2.0.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: daplapath
-Version: 2.0.2
+Version: 2.0.5
 Summary: A pathlib.Path class for dapla
 License: MIT
 Author: ort
@@ -14,8 +14,8 @@ import shutil
 from typing import Callable, Any
 import inspect
 import itertools
-import warnings
 
+from fsspec.spec import AbstractFileSystem
 import datetime
 import numpy as np
 import pandas as pd
@@ -23,15 +23,16 @@ import pandas.io.formats.format as fmt
 from pandas.api.types import is_dict_like
 import pyarrow
 import pyarrow.parquet as pq
+import pyarrow.dataset as ds
+
 
 try:
     import gcsfs
 except ImportError:
     pass
 
-
 # regex with the prefix '_v' followed by an integer of any length
-VERSION_PATTERN = r"_v(\d+)"
+VERSION_PATTERN = r"_v(\d+)\."
 VERSION_PREFIX = "_v"
 
 # regex with the prefix '_p' followed by four length integer (year) and OPTIONALLY month and date, separated by '-'
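
A minimal sketch (not part of the package) of what the tightened pattern changes: the added "\." requires the version number to be immediately followed by the file suffix, so version-like substrings elsewhere in the name no longer match.

    import re

    VERSION_PATTERN = r"_v(\d+)\."
    assert re.search(VERSION_PATTERN, "table_v12.parquet").group(1) == "12"
    assert re.search(VERSION_PATTERN, "table_v12_backup") is None  # no dot right after the number
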
@@ -48,15 +49,15 @@ class Config:
     file_system: Callable
 
 
-class LocalFileSystem:
+class LocalFileSystem(AbstractFileSystem):
     """Mimicks GCS's FileSystem but using standard library (os, glob, shutil)."""
 
     @staticmethod
     def glob(
         path: str,
-        recursive: bool = True,
         detail: bool = False,
-        include_hidden: bool = False,
+        recursive: bool = True,
+        include_hidden: bool = True,
         **kwargs,
     ) -> list[dict] | list[str]:
         relevant_paths = glob.iglob(
@@ -68,14 +69,16 @@ class LocalFileSystem:
         with ThreadPoolExecutor() as executor:
             return list(executor.map(get_file_info, relevant_paths))
 
+    @classmethod
+    def ls(cls, path: str, detail: bool = False, **kwargs):
+        return cls().glob(
+            str(pathlib.Path(path) / "**"), detail=detail, recursive=False, **kwargs
+        )
+
     @staticmethod
     def info(path) -> dict[str, Any]:
         return get_file_info(path)
 
-    @staticmethod
-    def isdir(path: str) -> bool:
-        return os.path.isdir(path)
-
     @staticmethod
     def open(path: str, *args, **kwargs) -> io.TextIOWrapper:
         return open(path, *args, **kwargs)
@@ -88,8 +91,12 @@ class LocalFileSystem:
     def mv(source: str, destination, **kwargs) -> str:
         return shutil.move(source, destination, **kwargs)
 
+    @classmethod
+    def cp(cls, source: str, destination, **kwargs) -> str:
+        return cls.cp_file(source, destination, **kwargs)
+
     @staticmethod
-    def cp(source: str, destination, **kwargs) -> str:
+    def cp_file(self, path1, path2, **kwargs):
         os.makedirs(pathlib.Path(destination).parent, exist_ok=True)
         return shutil.copy2(source, destination, **kwargs)
 
@@ -97,6 +104,14 @@ class LocalFileSystem:
     def rm_file(path: str, *args, **kwargs) -> None:
         return os.remove(path, *args, **kwargs)
 
+    @staticmethod
+    def rmdir(path: str, *args, **kwargs) -> None:
+        return shutil.rmtree(path, *args, **kwargs)
+
+    @staticmethod
+    def makedirs(path: str, exist_ok: bool = False) -> None:
+        return os.makedirs(path, exist_ok=exist_ok)
+
 
 class GCSFileSystem(gcsfs.GCSFileSystem):
     def isdir(self, path: str) -> bool:
@@ -111,9 +126,6 @@ else:
     _config = Config(LocalFileSystem)
 
 
-gcsfs.GCSFileSystem.isdir
-
-
 class Tree:
     """Stores text to be printed/displayed in directory tree format.
 
@@ -167,6 +179,17 @@ class _PathBase:
 class Path(str, _PathBase):
     """Path object that works like a string, with methods for working with the GCS file system."""
 
+    _file_system_attrs: set[str] = {
+        "info",
+        "isdir",
+        "open",
+        "exists",
+        "mv",
+        "cp",
+        "rm_file",
+        "rmdir",
+    }
+
     @property
     def _iterable_type(self) -> type | Callable:
         """Can be overridden in subclass."""
@@ -191,6 +214,19 @@ class Path(str, _PathBase):
         obj._file_system = None
         return obj
 
+    @property
+    def local_path(self) -> "Path":
+        if self.startswith("/buckets"):
+            return self
+
+        root = self.parts[0]
+        bucket = root.split("-data-")[-1].split("-prod")[0]
+
+        try:
+            return self.__class__(f"/buckets/{bucket}/{'/'.join(self.parts[1:])}")
+        except IndexError:
+            return self.__class__(f"/buckets/{bucket}")
+
     def tree(
         self,
         max_rows: int | None = 3,
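
Hypothetical illustration of the local_path mapping added above (the bucket root below is invented, not taken from the package): a GCS-style root of the form "<team>-data-<bucket>-prod..." is reduced to its bucket name and re-rooted under /buckets.

    root = "ssb-demo-data-produkt-prod"  # hypothetical bucket root
    bucket = root.split("-data-")[-1].split("-prod")[0]
    assert bucket == "produkt"
    # Path("ssb-demo-data-produkt-prod/inndata/persons_v1.parquet").local_path
    # would then presumably resolve to "/buckets/produkt/inndata/persons_v1.parquet"
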
@@ -269,11 +305,16 @@ class Path(str, _PathBase):
         """Returns a PathSeries of all versions of the file."""
         files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
 
+        if self.version_number:
+            start, _, end = re.split(self._version_pattern, self)
+        else:
+            start, end = self.stem, self.suffix
+
         # create boolean mask. With numpy to make it work with both pandas and list
         arr = np.array(files_in_folder)
-        is_version_of_this_file = (
-            np_str_contains(arr, self.versionless_stem)
-        ) & np_str_endswith(arr, self.suffix)
+        is_version_of_this_file = (np_str_contains(arr, start)) & (
+            np_str_endswith(arr, end)
+        )
         if not include_versionless:
             is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
 
@@ -505,13 +546,18 @@ class Path(str, _PathBase):
     @property
     def versionless_stem(self) -> str:
         """Return the file stem before the version pattern."""
-        return str(re.sub(self._version_pattern, "", self._path.stem))
+        return self.__class__(re.split(self._version_pattern, self._path.name)[0]).stem
 
     @property
     def parent(self) -> "Path":
         """Parent path."""
         return self.__class__(self._path.parent)
 
+    @property
+    def parents(self) -> "list[Path]":
+        """Parent path."""
+        return [self.__class__(parent) for parent in self._path.parents]
+
     @property
     def name(self) -> str:
         """Final part of the path."""
@@ -538,52 +584,48 @@ class Path(str, _PathBase):
 
     @property
     def index_column_names(self) -> list[str]:
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                return _get_index_cols(schema)
-            except KeyError:
-                return read_nrows(file, 1).index.names
+        return _get_index_cols(self.schema)
 
     @property
     def columns(self) -> pd.Index:
         """Columns of the file."""
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                index_cols = _get_index_cols(schema)
-                return pd.Index(schema.names).difference(index_cols)
-            except KeyError:
-                return read_nrows(file, 1).columns
+        schema = self.schema
+        try:
+            names = [
+                x["field_name"]
+                for x in json.loads(schema.metadata[b"pandas"].decode())["columns"]
+            ]
+        except (KeyError, TypeError):
+            names = schema.names
+        index_cols = _get_index_cols(schema)
+        return pd.Index(names).difference(index_cols)
 
     @property
     def schema(self) -> pyarrow.Schema:
         """Date types of the file's columns."""
-        with self.open("rb") as file:
-            return pq.read_schema(file)
+        try:
+            with self.open("rb") as file:
+                return get_schema(file)
+        except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+            return get_schema(self)
 
     @property
     def dtypes(self) -> pd.Series:
         """Date types of the file's columns."""
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                index_cols = _get_index_cols(schema)
-                return pd.Series(schema.types, index=schema.names).loc[
-                    lambda x: ~x.index.isin(index_cols)
-                ]
-            except KeyError:
-                return read_nrows(file, 1).dtypes
+        schema = self.schema
+        index_cols = _get_index_cols(schema)
+        return pd.Series(schema.types, index=schema.names).loc[
+            lambda x: ~x.index.isin(index_cols)
+        ]
 
     @property
     def shape(self) -> tuple[int, int]:
         """Number of rows and columns."""
-        with self.open("rb") as file:
-            try:
-                meta = pq.read_metadata(file)
-                return meta.num_rows, meta.num_columns
-            except KeyError:
-                return read_nrows(file, 1).shape
+        try:
+            with self.open("rb") as file:
+                return get_shape(file)
+        except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+            return get_shape(self)
 
     @property
     def nrow(self) -> int:
@@ -639,16 +681,18 @@ class Path(str, _PathBase):
 
     @property
     def partition_root(self) -> "Path":
+        if ".parquet" not in self:
+            return self
         return self.split(".parquet")[0] + ".parquet"
 
-    def is_dir(self) -> bool:
-        try:
-            return self.file_system.isdir(self)
-        except AttributeError:
-            return self.file_system.is_dir(self)
+    def isfile(self) -> bool:
+        return not self.isdir()
 
     def is_file(self) -> bool:
-        return not self.is_dir()
+        return self.isfile()
+
+    def is_dir(self) -> bool:
+        return self.isdir()
 
     def with_suffix(self, suffix: str):
         return self.__class__(self._path.with_suffix(suffix))
@@ -717,21 +761,15 @@ class Path(str, _PathBase):
         error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
         if attr.startswith("_"):
             raise AttributeError(error_message)
-        try:
-            return functools.partial(getattr(self.file_system, attr), self)
-        except AttributeError as e:
-            raise AttributeError(error_message) from e
+        if attr not in self._file_system_attrs:
+            raise AttributeError(error_message)
+        return functools.partial(getattr(self.file_system, attr), self)
 
     def __fspath__(self) -> str:
         return str(self)
 
     def __dir__(self) -> list[str]:
-        return list(
-            sorted(
-                {x for x in dir(Path)}
-                | {x for x in dir(self._file_system) if not x.startswith("_")}
-            )
-        )
+        return list(sorted({x for x in dir(Path)} | self._file_system_attrs))
 
     def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
         series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
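
A simplified sketch of the delegation pattern this hunk switches to (not the package's exact class; the names _FS and _MiniPath are invented): only attribute names listed in _file_system_attrs are forwarded to the file system object, pre-bound to the path with functools.partial, so arbitrary attribute access no longer falls through to the file system.

    import functools
    import os

    class _FS:
        @staticmethod
        def exists(path) -> bool:
            return os.path.exists(path)

    class _MiniPath(str):
        _file_system_attrs = {"exists"}
        file_system = _FS()

        def __getattr__(self, attr):
            # only whitelisted names are delegated; everything else raises
            if attr.startswith("_") or attr not in self._file_system_attrs:
                raise AttributeError(attr)
            return functools.partial(getattr(self.file_system, attr), self)

    print(_MiniPath("/tmp").exists())  # delegates to _FS.exists("/tmp")
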
@@ -859,6 +897,12 @@ class PathSeries(pd.Series, _PathBase):
     def partition_root(self) -> "PathSeries":
         return self.files.apply(lambda x: x.partition_root).drop_duplicates()
 
+    @property
+    def partitioned_files(self) -> "PathSeries":
+        return self.files.loc[
+            lambda x: x.str.count(r"\.parquet") == 2
+        ].partition_root.drop_duplicates()
+
     @property
     def dirs(self) -> "PathSeries":
         """Select only the directories in the Series."""
@@ -1214,18 +1258,18 @@ def split_path_and_make_copyable_html(
     split: str | None = "/",
     display_prefix: str | None = ".../",
 ) -> str:
-    """Get html text that displays the last part, but makes the full path copyable to clipboard.
+    """Get HTML text that displays the last part, but makes the full path copyable to clipboard.
 
-    Splits the path on a delimiter and creates an html string that displays only the
+    Splits the path on a delimiter and creates an HTML string that displays only the
     last part, but adds a hyperlink which copies the full path to clipboard when clicked.
 
     Parameters
     ----------
     path: File or directory path
-    max_parts: Maximum number of path paths to display. Defaults to 2,
+    max_parts: Maximum number of path parts to display. Defaults to 2,
         meaning the two last parts. Set to None to show full paths.
     split: Text pattern to split the path on. Defaults to "/".
-    display_prefix: The text to display instead of the parent directory. Defaults to ".../"
+    display_prefix: The text to display instead of the parent directory. Defaults to ".../".
 
     Returns
     -------
@@ -1233,7 +1277,8 @@ def split_path_and_make_copyable_html(
     """
 
     copy_to_clipboard_js = f"""<script>
-function copyToClipboard(text) {{
+function copyToClipboard(text, event) {{
+    event.preventDefault();
     navigator.clipboard.writeText(text)
     .then(() => {{
         const alertBox = document.createElement('div');
@@ -1267,7 +1312,7 @@ function copyToClipboard(text) {{
     else:
         displayed_text = path
 
-    return f'{copy_to_clipboard_js}<a href="{displayed_text}" title="{path}" onclick="copyToClipboard(\'{path}\')">{displayed_text}</a>'
+    return f'{copy_to_clipboard_js}<a href="#" title="{path}" onclick="copyToClipboard(\'{path}\', event)">{displayed_text}</a>'
 
 
 def _get_default_multi_index() -> pd.MultiIndex:
@@ -1454,6 +1499,72 @@ def get_arguments(func: Callable | object) -> list[str]:
     )
 
 
+def get_schema(file) -> pyarrow.Schema:
+    try:
+        return pq.read_schema(file)
+    except (
+        PermissionError,
+        pyarrow.ArrowInvalid,
+        FileNotFoundError,
+        IsADirectoryError,
+        OSError,
+    ):
+        # try:
+        #     return ds.dataset(file).schema
+        # except (TypeError, FileNotFoundError) as e:
+        if not hasattr(file, "file_system"):
+            raise e
+
+        file_system = file.file_system
+
+        def _get_schema(path):
+            try:
+                return pq.read_schema(path)
+            except FileNotFoundError:
+                with file_system.open(path, "rb") as f:
+                    return pq.read_schema(f)
+
+        with ThreadPoolExecutor() as executor:
+            return pyarrow.unify_schemas(
+                list(
+                    executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
+                ),
+                promote_options="permissive",
+            )
+
+
+def get_num_rows(file):
+    try:
+        return pq.read_metadata(file).num_rows
+    except (
+        PermissionError,
+        pyarrow.ArrowInvalid,
+        FileNotFoundError,
+        TypeError,
+        OSError,
+    ) as e:
+        try:
+            return ds.dataset(file).count_rows()
+        except Exception as e2:
+            if not hasattr(file, "glob"):
+                raise e2 from 2
+
+            def _get_num_rows(path):
+                with path.open("rb") as file:
+                    return pq.read_metadata(file).num_rows
+
+            with ThreadPoolExecutor() as executor:
+                return sum(executor.map(_get_num_rows, file.glob("**").files))
+
+
+def get_shape(file) -> tuple[int, int]:
+    schema = get_schema(file)
+    index_cols = _get_index_cols(schema)
+    ncol: int = sum(name not in index_cols for name in schema.names)
+    nrow: int = get_num_rows(file)
+    return nrow, ncol
+
+
 def read_nrows(file, nrow: int) -> pd.DataFrame:
     """Read first n rows of a parquet file."""
     rows = next(pq.ParquetFile(file).iter_batches(nrow))
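
A rough, self-contained sketch (not the package's code) of the schema-unification fallback used in the new get_schema above: pyarrow.unify_schemas merges the per-file schemas of a partitioned dataset, and promote_options="permissive" widens compatible but unequal types instead of raising. The schemas below are invented for illustration.

    import pyarrow

    s1 = pyarrow.schema([("id", pyarrow.int32()), ("value", pyarrow.float64())])
    s2 = pyarrow.schema([("id", pyarrow.int64())])

    merged = pyarrow.unify_schemas([s1, s2], promote_options="permissive")
    print(merged)  # "id" is promoted to int64, "value" is kept as float64
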
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "daplapath"
-version = "2.0.2"
+version = "2.0.5"
 description = "A pathlib.Path class for dapla"
 authors = ["ort <ort@ssb.no>"]
 license = "MIT"