daplapath 2.0.3__tar.gz → 2.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: daplapath
3
- Version: 2.0.3
3
+ Version: 2.0.5
4
4
  Summary: A pathlib.Path class for dapla
5
5
  License: MIT
6
6
  Author: ort
@@ -14,8 +14,8 @@ import shutil
14
14
  from typing import Callable, Any
15
15
  import inspect
16
16
  import itertools
17
- import warnings
18
17
 
18
+ from fsspec.spec import AbstractFileSystem
19
19
  import datetime
20
20
  import numpy as np
21
21
  import pandas as pd
@@ -23,6 +23,8 @@ import pandas.io.formats.format as fmt
23
23
  from pandas.api.types import is_dict_like
24
24
  import pyarrow
25
25
  import pyarrow.parquet as pq
26
+ import pyarrow.dataset as ds
27
+
26
28
 
27
29
  try:
28
30
  import gcsfs
@@ -47,15 +49,15 @@ class Config:
47
49
  file_system: Callable
48
50
 
49
51
 
50
- class LocalFileSystem:
52
+ class LocalFileSystem(AbstractFileSystem):
51
53
  """Mimicks GCS's FileSystem but using standard library (os, glob, shutil)."""
52
54
 
53
55
  @staticmethod
54
56
  def glob(
55
57
  path: str,
56
- recursive: bool = True,
57
58
  detail: bool = False,
58
- include_hidden: bool = False,
59
+ recursive: bool = True,
60
+ include_hidden: bool = True,
59
61
  **kwargs,
60
62
  ) -> list[dict] | list[str]:
61
63
  relevant_paths = glob.iglob(
@@ -67,14 +69,16 @@ class LocalFileSystem:
67
69
  with ThreadPoolExecutor() as executor:
68
70
  return list(executor.map(get_file_info, relevant_paths))
69
71
 
72
+ @classmethod
73
+ def ls(cls, path: str, detail: bool = False, **kwargs):
74
+ return cls().glob(
75
+ str(pathlib.Path(path) / "**"), detail=detail, recursive=False, **kwargs
76
+ )
77
+
70
78
  @staticmethod
71
79
  def info(path) -> dict[str, Any]:
72
80
  return get_file_info(path)
73
81
 
74
- @staticmethod
75
- def isdir(path: str) -> bool:
76
- return os.path.isdir(path)
77
-
78
82
  @staticmethod
79
83
  def open(path: str, *args, **kwargs) -> io.TextIOWrapper:
80
84
  return open(path, *args, **kwargs)
@@ -87,8 +91,12 @@ class LocalFileSystem:
87
91
  def mv(source: str, destination, **kwargs) -> str:
88
92
  return shutil.move(source, destination, **kwargs)
89
93
 
94
+ @classmethod
95
+ def cp(cls, source: str, destination, **kwargs) -> str:
96
+ return cls.cp_file(source, destination, **kwargs)
97
+
90
98
  @staticmethod
91
- def cp(source: str, destination, **kwargs) -> str:
99
+ def cp_file(self, path1, path2, **kwargs):
92
100
  os.makedirs(pathlib.Path(destination).parent, exist_ok=True)
93
101
  return shutil.copy2(source, destination, **kwargs)
94
102
 
@@ -96,6 +104,14 @@ class LocalFileSystem:
96
104
  def rm_file(path: str, *args, **kwargs) -> None:
97
105
  return os.remove(path, *args, **kwargs)
98
106
 
107
+ @staticmethod
108
+ def rmdir(path: str, *args, **kwargs) -> None:
109
+ return shutil.rmtree(path, *args, **kwargs)
110
+
111
+ @staticmethod
112
+ def makedirs(path: str, exist_ok: bool = False) -> None:
113
+ return os.makedirs(path, exist_ok=exist_ok)
114
+
99
115
 
100
116
  class GCSFileSystem(gcsfs.GCSFileSystem):
101
117
  def isdir(self, path: str) -> bool:
@@ -110,9 +126,6 @@ else:
110
126
  _config = Config(LocalFileSystem)
111
127
 
112
128
 
113
- gcsfs.GCSFileSystem.isdir
114
-
115
-
116
129
  class Tree:
117
130
  """Stores text to be printed/displayed in directory tree format.
118
131
 
@@ -166,6 +179,17 @@ class _PathBase:
166
179
  class Path(str, _PathBase):
167
180
  """Path object that works like a string, with methods for working with the GCS file system."""
168
181
 
182
+ _file_system_attrs: set[str] = {
183
+ "info",
184
+ "isdir",
185
+ "open",
186
+ "exists",
187
+ "mv",
188
+ "cp",
189
+ "rm_file",
190
+ "rmdir",
191
+ }
192
+
169
193
  @property
170
194
  def _iterable_type(self) -> type | Callable:
171
195
  """Can be overridden in subclass."""
@@ -190,6 +214,19 @@ class Path(str, _PathBase):
190
214
  obj._file_system = None
191
215
  return obj
192
216
 
217
+ @property
218
+ def local_path(self) -> "Path":
219
+ if self.startswith("/buckets"):
220
+ return self
221
+
222
+ root = self.parts[0]
223
+ bucket = root.split("-data-")[-1].split("-prod")[0]
224
+
225
+ try:
226
+ return self.__class__(f"/buckets/{bucket}/{'/'.join(self.parts[1:])}")
227
+ except IndexError:
228
+ return self.__class__(f"/buckets/{bucket}")
229
+
193
230
  def tree(
194
231
  self,
195
232
  max_rows: int | None = 3,
@@ -516,6 +553,11 @@ class Path(str, _PathBase):
516
553
  """Parent path."""
517
554
  return self.__class__(self._path.parent)
518
555
 
556
+ @property
557
+ def parents(self) -> "list[Path]":
558
+ """Parent path."""
559
+ return [self.__class__(parent) for parent in self._path.parents]
560
+
519
561
  @property
520
562
  def name(self) -> str:
521
563
  """Final part of the path."""
@@ -542,52 +584,48 @@ class Path(str, _PathBase):
542
584
 
543
585
  @property
544
586
  def index_column_names(self) -> list[str]:
545
- with self.open("rb") as file:
546
- try:
547
- schema = pq.read_schema(file)
548
- return _get_index_cols(schema)
549
- except KeyError:
550
- return read_nrows(file, 1).index.names
587
+ return _get_index_cols(self.schema)
551
588
 
552
589
  @property
553
590
  def columns(self) -> pd.Index:
554
591
  """Columns of the file."""
555
- with self.open("rb") as file:
556
- try:
557
- schema = pq.read_schema(file)
558
- index_cols = _get_index_cols(schema)
559
- return pd.Index(schema.names).difference(index_cols)
560
- except KeyError:
561
- return read_nrows(file, 1).columns
592
+ schema = self.schema
593
+ try:
594
+ names = [
595
+ x["field_name"]
596
+ for x in json.loads(schema.metadata[b"pandas"].decode())["columns"]
597
+ ]
598
+ except (KeyError, TypeError):
599
+ names = schema.names
600
+ index_cols = _get_index_cols(schema)
601
+ return pd.Index(names).difference(index_cols)
562
602
 
563
603
  @property
564
604
  def schema(self) -> pyarrow.Schema:
565
605
  """Date types of the file's columns."""
566
- with self.open("rb") as file:
567
- return pq.read_schema(file)
606
+ try:
607
+ with self.open("rb") as file:
608
+ return get_schema(file)
609
+ except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
610
+ return get_schema(self)
568
611
 
569
612
  @property
570
613
  def dtypes(self) -> pd.Series:
571
614
  """Date types of the file's columns."""
572
- with self.open("rb") as file:
573
- try:
574
- schema = pq.read_schema(file)
575
- index_cols = _get_index_cols(schema)
576
- return pd.Series(schema.types, index=schema.names).loc[
577
- lambda x: ~x.index.isin(index_cols)
578
- ]
579
- except KeyError:
580
- return read_nrows(file, 1).dtypes
615
+ schema = self.schema
616
+ index_cols = _get_index_cols(schema)
617
+ return pd.Series(schema.types, index=schema.names).loc[
618
+ lambda x: ~x.index.isin(index_cols)
619
+ ]
581
620
 
582
621
  @property
583
622
  def shape(self) -> tuple[int, int]:
584
623
  """Number of rows and columns."""
585
- with self.open("rb") as file:
586
- try:
587
- meta = pq.read_metadata(file)
588
- return meta.num_rows, meta.num_columns
589
- except KeyError:
590
- return read_nrows(file, 1).shape
624
+ try:
625
+ with self.open("rb") as file:
626
+ return get_shape(file)
627
+ except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
628
+ return get_shape(self)
591
629
 
592
630
  @property
593
631
  def nrow(self) -> int:
@@ -643,16 +681,18 @@ class Path(str, _PathBase):
643
681
 
644
682
  @property
645
683
  def partition_root(self) -> "Path":
684
+ if ".parquet" not in self:
685
+ return self
646
686
  return self.split(".parquet")[0] + ".parquet"
647
687
 
648
- def is_dir(self) -> bool:
649
- try:
650
- return self.file_system.isdir(self)
651
- except AttributeError:
652
- return self.file_system.is_dir(self)
688
+ def isfile(self) -> bool:
689
+ return not self.isdir()
653
690
 
654
691
  def is_file(self) -> bool:
655
- return not self.is_dir()
692
+ return self.isfile()
693
+
694
+ def is_dir(self) -> bool:
695
+ return self.isdir()
656
696
 
657
697
  def with_suffix(self, suffix: str):
658
698
  return self.__class__(self._path.with_suffix(suffix))
@@ -721,21 +761,15 @@ class Path(str, _PathBase):
721
761
  error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
722
762
  if attr.startswith("_"):
723
763
  raise AttributeError(error_message)
724
- try:
725
- return functools.partial(getattr(self.file_system, attr), self)
726
- except AttributeError as e:
727
- raise AttributeError(error_message) from e
764
+ if attr not in self._file_system_attrs:
765
+ raise AttributeError(error_message)
766
+ return functools.partial(getattr(self.file_system, attr), self)
728
767
 
729
768
  def __fspath__(self) -> str:
730
769
  return str(self)
731
770
 
732
771
  def __dir__(self) -> list[str]:
733
- return list(
734
- sorted(
735
- {x for x in dir(Path)}
736
- | {x for x in dir(self._file_system) if not x.startswith("_")}
737
- )
738
- )
772
+ return list(sorted({x for x in dir(Path)} | self._file_system_attrs))
739
773
 
740
774
  def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
741
775
  series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
@@ -863,6 +897,12 @@ class PathSeries(pd.Series, _PathBase):
863
897
  def partition_root(self) -> "PathSeries":
864
898
  return self.files.apply(lambda x: x.partition_root).drop_duplicates()
865
899
 
900
+ @property
901
+ def partitioned_files(self) -> "PathSeries":
902
+ return self.files.loc[
903
+ lambda x: x.str.count(r"\.parquet") == 2
904
+ ].partition_root.drop_duplicates()
905
+
866
906
  @property
867
907
  def dirs(self) -> "PathSeries":
868
908
  """Select only the directories in the Series."""
@@ -1218,18 +1258,18 @@ def split_path_and_make_copyable_html(
1218
1258
  split: str | None = "/",
1219
1259
  display_prefix: str | None = ".../",
1220
1260
  ) -> str:
1221
- """Get html text that displays the last part, but makes the full path copyable to clipboard.
1261
+ """Get HTML text that displays the last part, but makes the full path copyable to clipboard.
1222
1262
 
1223
- Splits the path on a delimiter and creates an html string that displays only the
1263
+ Splits the path on a delimiter and creates an HTML string that displays only the
1224
1264
  last part, but adds a hyperlink which copies the full path to clipboard when clicked.
1225
1265
 
1226
1266
  Parameters
1227
1267
  ----------
1228
1268
  path: File or directory path
1229
- max_parts: Maximum number of path paths to display. Defaults to 2,
1269
+ max_parts: Maximum number of path parts to display. Defaults to 2,
1230
1270
  meaning the two last parts. Set to None to show full paths.
1231
1271
  split: Text pattern to split the path on. Defaults to "/".
1232
- display_prefix: The text to display instead of the parent directory. Defaults to ".../"
1272
+ display_prefix: The text to display instead of the parent directory. Defaults to ".../".
1233
1273
 
1234
1274
  Returns
1235
1275
  -------
@@ -1237,7 +1277,8 @@ def split_path_and_make_copyable_html(
1237
1277
  """
1238
1278
 
1239
1279
  copy_to_clipboard_js = f"""<script>
1240
- function copyToClipboard(text) {{
1280
+ function copyToClipboard(text, event) {{
1281
+ event.preventDefault();
1241
1282
  navigator.clipboard.writeText(text)
1242
1283
  .then(() => {{
1243
1284
  const alertBox = document.createElement('div');
@@ -1271,7 +1312,7 @@ function copyToClipboard(text) {{
1271
1312
  else:
1272
1313
  displayed_text = path
1273
1314
 
1274
- return f'{copy_to_clipboard_js}<a href="{displayed_text}" title="{path}" onclick="copyToClipboard(\'{path}\')">{displayed_text}</a>'
1315
+ return f'{copy_to_clipboard_js}<a href="#" title="{path}" onclick="copyToClipboard(\'{path}\', event)">{displayed_text}</a>'
1275
1316
 
1276
1317
 
1277
1318
  def _get_default_multi_index() -> pd.MultiIndex:
@@ -1458,6 +1499,72 @@ def get_arguments(func: Callable | object) -> list[str]:
1458
1499
  )
1459
1500
 
1460
1501
 
1502
+ def get_schema(file) -> pyarrow.Schema:
1503
+ try:
1504
+ return pq.read_schema(file)
1505
+ except (
1506
+ PermissionError,
1507
+ pyarrow.ArrowInvalid,
1508
+ FileNotFoundError,
1509
+ IsADirectoryError,
1510
+ OSError,
1511
+ ):
1512
+ # try:
1513
+ # return ds.dataset(file).schema
1514
+ # except (TypeError, FileNotFoundError) as e:
1515
+ if not hasattr(file, "file_system"):
1516
+ raise e
1517
+
1518
+ file_system = file.file_system
1519
+
1520
+ def _get_schema(path):
1521
+ try:
1522
+ return pq.read_schema(path)
1523
+ except FileNotFoundError:
1524
+ with file_system.open(path, "rb") as f:
1525
+ return pq.read_schema(f)
1526
+
1527
+ with ThreadPoolExecutor() as executor:
1528
+ return pyarrow.unify_schemas(
1529
+ list(
1530
+ executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
1531
+ ),
1532
+ promote_options="permissive",
1533
+ )
1534
+
1535
+
1536
+ def get_num_rows(file):
1537
+ try:
1538
+ return pq.read_metadata(file).num_rows
1539
+ except (
1540
+ PermissionError,
1541
+ pyarrow.ArrowInvalid,
1542
+ FileNotFoundError,
1543
+ TypeError,
1544
+ OSError,
1545
+ ) as e:
1546
+ try:
1547
+ return ds.dataset(file).count_rows()
1548
+ except Exception as e2:
1549
+ if not hasattr(file, "glob"):
1550
+ raise e2 from 2
1551
+
1552
+ def _get_num_rows(path):
1553
+ with path.open("rb") as file:
1554
+ return pq.read_metadata(file).num_rows
1555
+
1556
+ with ThreadPoolExecutor() as executor:
1557
+ return sum(executor.map(_get_num_rows, file.glob("**").files))
1558
+
1559
+
1560
+ def get_shape(file) -> tuple[int, int]:
1561
+ schema = get_schema(file)
1562
+ index_cols = _get_index_cols(schema)
1563
+ ncol: int = sum(name not in index_cols for name in schema.names)
1564
+ nrow: int = get_num_rows(file)
1565
+ return nrow, ncol
1566
+
1567
+
1461
1568
  def read_nrows(file, nrow: int) -> pd.DataFrame:
1462
1569
  """Read first n rows of a parquet file."""
1463
1570
  rows = next(pq.ParquetFile(file).iter_batches(nrow))
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "daplapath"
3
- version = "2.0.3"
3
+ version = "2.0.5"
4
4
  description = "A pathlib.Path class for dapla"
5
5
  authors = ["ort <ort@ssb.no>"]
6
6
  license = "MIT"
File without changes
File without changes