daplapath 2.0.3__tar.gz → 2.0.6__tar.gz

This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: daplapath
-Version: 2.0.3
+Version: 2.0.6
 Summary: A pathlib.Path class for dapla
 License: MIT
 Author: ort
@@ -1,2 +1,3 @@
 from .path import Path
 from .path import PathSeries
+from .path import LocalFileSystem
@@ -14,8 +14,8 @@ import shutil
 from typing import Callable, Any
 import inspect
 import itertools
-import warnings
 
+from fsspec.spec import AbstractFileSystem
 import datetime
 import numpy as np
 import pandas as pd
@@ -23,6 +23,8 @@ import pandas.io.formats.format as fmt
 from pandas.api.types import is_dict_like
 import pyarrow
 import pyarrow.parquet as pq
+import pyarrow.dataset as ds
+
 
 try:
     import gcsfs
@@ -47,15 +49,15 @@ class Config:
     file_system: Callable
 
 
-class LocalFileSystem:
+class LocalFileSystem(AbstractFileSystem):
     """Mimicks GCS's FileSystem but using standard library (os, glob, shutil)."""
 
     @staticmethod
     def glob(
         path: str,
-        recursive: bool = True,
         detail: bool = False,
-        include_hidden: bool = False,
+        recursive: bool = True,
+        include_hidden: bool = True,
         **kwargs,
     ) -> list[dict] | list[str]:
         relevant_paths = glob.iglob(
@@ -67,14 +69,16 @@ class LocalFileSystem:
         with ThreadPoolExecutor() as executor:
             return list(executor.map(get_file_info, relevant_paths))
 
+    @classmethod
+    def ls(cls, path: str, detail: bool = False, **kwargs):
+        return cls().glob(
+            str(pathlib.Path(path) / "**"), detail=detail, recursive=False, **kwargs
+        )
+
     @staticmethod
     def info(path) -> dict[str, Any]:
         return get_file_info(path)
 
-    @staticmethod
-    def isdir(path: str) -> bool:
-        return os.path.isdir(path)
-
     @staticmethod
     def open(path: str, *args, **kwargs) -> io.TextIOWrapper:
         return open(path, *args, **kwargs)
@@ -87,8 +91,12 @@ class LocalFileSystem:
     def mv(source: str, destination, **kwargs) -> str:
         return shutil.move(source, destination, **kwargs)
 
+    @classmethod
+    def cp(cls, source: str, destination, **kwargs) -> str:
+        return cls.cp_file(source, destination, **kwargs)
+
     @staticmethod
-    def cp(source: str, destination, **kwargs) -> str:
+    def cp_file(self, path1, path2, **kwargs):
         os.makedirs(pathlib.Path(destination).parent, exist_ok=True)
         return shutil.copy2(source, destination, **kwargs)
 
@@ -96,6 +104,14 @@ class LocalFileSystem:
     def rm_file(path: str, *args, **kwargs) -> None:
         return os.remove(path, *args, **kwargs)
 
+    @staticmethod
+    def rmdir(path: str, *args, **kwargs) -> None:
+        return shutil.rmtree(path, *args, **kwargs)
+
+    @staticmethod
+    def makedirs(path: str, exist_ok: bool = False) -> None:
+        return os.makedirs(path, exist_ok=exist_ok)
+
 
 class GCSFileSystem(gcsfs.GCSFileSystem):
     def isdir(self, path: str) -> bool:
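
For orientation, a minimal sketch (not part of the diff) of how the reworked LocalFileSystem could be exercised after this release; the paths are invented for illustration:

    # Illustrative sketch only: LocalFileSystem is now exported from the package,
    # subclasses fsspec's AbstractFileSystem, and gains ls/cp_file/rmdir/makedirs.
    from daplapath import LocalFileSystem

    fs = LocalFileSystem()
    fs.makedirs("/tmp/daplapath_demo/sub", exist_ok=True)     # new in 2.0.6, wraps os.makedirs
    listing = fs.ls("/tmp/daplapath_demo")                    # new ls() calls glob() non-recursively
    files = fs.glob("/tmp/daplapath_demo/**", detail=False)   # include_hidden now defaults to True
    fs.rmdir("/tmp/daplapath_demo/sub")                       # new rmdir() delegates to shutil.rmtree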
@@ -110,9 +126,6 @@ else:
     _config = Config(LocalFileSystem)
 
 
-gcsfs.GCSFileSystem.isdir
-
-
 class Tree:
     """Stores text to be printed/displayed in directory tree format.
 
@@ -166,6 +179,17 @@ class _PathBase:
 class Path(str, _PathBase):
     """Path object that works like a string, with methods for working with the GCS file system."""
 
+    _file_system_attrs: set[str] = {
+        "info",
+        "isdir",
+        "open",
+        "exists",
+        "mv",
+        "cp",
+        "rm_file",
+        "rmdir",
+    }
+
     @property
     def _iterable_type(self) -> type | Callable:
         """Can be overridden in subclass."""
@@ -182,14 +206,26 @@ class Path(str, _PathBase):
             .rstrip("/")
         )
 
-    def __new__(cls, gcs_path: str | PurePath | None = None):
+    def __new__(cls, gcs_path: str | PurePath | None = None, file_system=None):
         """Construct Path with '/' as delimiter."""
         gcs_path = cls._standardize_path(gcs_path or "")
         obj = super().__new__(cls, gcs_path)
         obj._path = PurePosixPath(obj)
-        obj._file_system = None
+        obj._file_system = file_system
         return obj
 
+    def buckets_path(self) -> "Path":
+        if self.startswith("/buckets"):
+            return self
+
+        root = self.parts[0]
+        bucket = root.split("-data-")[-1].split("-prod")[0]
+
+        try:
+            return self._new(f"/buckets/{bucket}/{'/'.join(self.parts[1:])}")
+        except IndexError:
+            return self._new(f"/buckets/{bucket}")
+
     def tree(
         self,
         max_rows: int | None = 3,
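
A hypothetical illustration (not part of the diff) of what the new buckets_path() appears to do; the bucket name is invented, and it assumes Path.parts splits the path on "/":

    from daplapath import Path

    # Invented bucket-style root, purely to trace the string logic above:
    # root.split("-data-")[-1].split("-prod")[0] -> "produkt"
    p = Path("ssb-demo-team-data-produkt-prod/inndata/persons_v1.parquet")
    p.buckets_path()
    # would give something like '/buckets/produkt/inndata/persons_v1.parquet'

    Path("/buckets/produkt/inndata/persons_v1.parquet").buckets_path()
    # returned unchanged, since the path already starts with '/buckets'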
@@ -365,7 +401,7 @@ class Path(str, _PathBase):
         'file_v201.parquet'
         """
         version_text = f"{self._version_prefix}{version}" if version is not None else ""
-        return self.__class__(
+        return self._new(
             f"{self.parent}/{self.versionless_stem}{version_text}{self.suffix}"
         )
 
@@ -468,7 +504,7 @@ class Path(str, _PathBase):
 
         parent = f"{self.parent}/" if self.parent != "." else ""
 
-        return self.__class__(
+        return self._new(
             f"{parent}{stem}{period_string}{version_string}{self.suffix}".replace(
                 "".join(self.periods), period_string.strip(self._period_prefix)
             )
@@ -509,12 +545,17 @@ class Path(str, _PathBase):
     @property
     def versionless_stem(self) -> str:
         """Return the file stem before the version pattern."""
-        return self.__class__(re.split(self._version_pattern, self._path.name)[0]).stem
+        return self._new(re.split(self._version_pattern, self._path.name)[0]).stem
 
     @property
     def parent(self) -> "Path":
         """Parent path."""
-        return self.__class__(self._path.parent)
+        return self._new(self._path.parent)
+
+    @property
+    def parents(self) -> "list[Path]":
+        """Parent path."""
+        return [self._new(parent) for parent in self._path.parents]
 
     @property
     def name(self) -> str:
@@ -542,52 +583,48 @@ class Path(str, _PathBase):
 
     @property
     def index_column_names(self) -> list[str]:
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                return _get_index_cols(schema)
-            except KeyError:
-                return read_nrows(file, 1).index.names
+        return _get_index_cols(self.schema)
 
     @property
     def columns(self) -> pd.Index:
         """Columns of the file."""
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                index_cols = _get_index_cols(schema)
-                return pd.Index(schema.names).difference(index_cols)
-            except KeyError:
-                return read_nrows(file, 1).columns
+        schema = self.schema
+        try:
+            names = [
+                x["field_name"]
+                for x in json.loads(schema.metadata[b"pandas"].decode())["columns"]
+            ]
+        except (KeyError, TypeError):
+            names = schema.names
+        index_cols = _get_index_cols(schema)
+        return pd.Index(names).difference(index_cols)
 
     @property
     def schema(self) -> pyarrow.Schema:
         """Date types of the file's columns."""
-        with self.open("rb") as file:
-            return pq.read_schema(file)
+        try:
+            with self.open("rb") as file:
+                return get_schema(file)
+        except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+            return get_schema(self)
 
     @property
     def dtypes(self) -> pd.Series:
         """Date types of the file's columns."""
-        with self.open("rb") as file:
-            try:
-                schema = pq.read_schema(file)
-                index_cols = _get_index_cols(schema)
-                return pd.Series(schema.types, index=schema.names).loc[
-                    lambda x: ~x.index.isin(index_cols)
-                ]
-            except KeyError:
-                return read_nrows(file, 1).dtypes
+        schema = self.schema
+        index_cols = _get_index_cols(schema)
+        return pd.Series(schema.types, index=schema.names).loc[
+            lambda x: ~x.index.isin(index_cols)
+        ]
 
     @property
     def shape(self) -> tuple[int, int]:
         """Number of rows and columns."""
-        with self.open("rb") as file:
-            try:
-                meta = pq.read_metadata(file)
-                return meta.num_rows, meta.num_columns
-            except KeyError:
-                return read_nrows(file, 1).shape
+        try:
+            with self.open("rb") as file:
+                return get_shape(file)
+        except (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+            return get_shape(self)
 
     @property
     def nrow(self) -> int:
@@ -643,25 +680,27 @@ class Path(str, _PathBase):
 
     @property
     def partition_root(self) -> "Path":
+        if ".parquet" not in self:
+            return self
         return self.split(".parquet")[0] + ".parquet"
 
-    def is_dir(self) -> bool:
-        try:
-            return self.file_system.isdir(self)
-        except AttributeError:
-            return self.file_system.is_dir(self)
+    def isfile(self) -> bool:
+        return not self.isdir()
 
     def is_file(self) -> bool:
-        return not self.is_dir()
+        return self.isfile()
+
+    def is_dir(self) -> bool:
+        return self.isdir()
 
     def with_suffix(self, suffix: str):
-        return self.__class__(self._path.with_suffix(suffix))
+        return self._new(self._path.with_suffix(suffix))
 
     def with_name(self, new_name: str):
-        return self.__class__(self._path.with_name(new_name))
+        return self._new(self._path.with_name(new_name))
 
     def with_stem(self, new_with_stem: str):
-        return self.__class__(self._path.with_stem(new_with_stem))
+        return self._new(self._path.with_stem(new_with_stem))
 
     @property
     def file_system(self):
@@ -689,7 +728,7 @@ class Path(str, _PathBase):
             "unsupported operand type(s) for /: "
             f"{self.__class__.__name__} and {other.__class__.__name__}"
         )
-        return self.__class__(f"{self}/{as_str(other)}")
+        return self._new(f"{self}/{as_str(other)}")
 
     def __getattribute__(self, name):
         """stackoverflow hack to ensure we return Path when using string methods.
@@ -721,21 +760,15 @@ class Path(str, _PathBase):
         error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
         if attr.startswith("_"):
             raise AttributeError(error_message)
-        try:
-            return functools.partial(getattr(self.file_system, attr), self)
-        except AttributeError as e:
-            raise AttributeError(error_message) from e
+        if attr not in self._file_system_attrs:
+            raise AttributeError(error_message)
+        return functools.partial(getattr(self.file_system, attr), self)
 
     def __fspath__(self) -> str:
         return str(self)
 
     def __dir__(self) -> list[str]:
-        return list(
-            sorted(
-                {x for x in dir(Path)}
-                | {x for x in dir(self._file_system) if not x.startswith("_")}
-            )
-        )
+        return list(sorted({x for x in dir(Path)} | self._file_system_attrs))
 
     def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
         series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
@@ -743,6 +776,9 @@ class Path(str, _PathBase):
             path._file_system = self._file_system
         return self._iterable_type(series, **kwargs)
 
+    def _new(self, new_path: str | Path) -> "Path":
+        return self.__class__(new_path, self.file_system)
+
 
 class PathSeries(pd.Series, _PathBase):
     """A pandas Series for working with GCS (Google Cloud Storage) paths.
@@ -863,6 +899,12 @@ class PathSeries(pd.Series, _PathBase):
     def partition_root(self) -> "PathSeries":
         return self.files.apply(lambda x: x.partition_root).drop_duplicates()
 
+    @property
+    def partitioned_files(self) -> "PathSeries":
+        return self.files.loc[
+            lambda x: x.str.count(r"\.parquet") == 2
+        ].partition_root.drop_duplicates()
+
     @property
     def dirs(self) -> "PathSeries":
         """Select only the directories in the Series."""
@@ -1218,18 +1260,18 @@ def split_path_and_make_copyable_html(
     split: str | None = "/",
     display_prefix: str | None = ".../",
 ) -> str:
-    """Get html text that displays the last part, but makes the full path copyable to clipboard.
+    """Get HTML text that displays the last part, but makes the full path copyable to clipboard.
 
-    Splits the path on a delimiter and creates an html string that displays only the
+    Splits the path on a delimiter and creates an HTML string that displays only the
     last part, but adds a hyperlink which copies the full path to clipboard when clicked.
 
     Parameters
     ----------
     path: File or directory path
-    max_parts: Maximum number of path paths to display. Defaults to 2,
+    max_parts: Maximum number of path parts to display. Defaults to 2,
         meaning the two last parts. Set to None to show full paths.
     split: Text pattern to split the path on. Defaults to "/".
-    display_prefix: The text to display instead of the parent directory. Defaults to ".../"
+    display_prefix: The text to display instead of the parent directory. Defaults to ".../".
 
     Returns
     -------
@@ -1237,7 +1279,8 @@ def split_path_and_make_copyable_html(
     """
 
     copy_to_clipboard_js = f"""<script>
-function copyToClipboard(text) {{
+function copyToClipboard(text, event) {{
+    event.preventDefault();
     navigator.clipboard.writeText(text)
     .then(() => {{
         const alertBox = document.createElement('div');
@@ -1271,7 +1314,7 @@ function copyToClipboard(text) {{
     else:
         displayed_text = path
 
-    return f'{copy_to_clipboard_js}<a href="{displayed_text}" title="{path}" onclick="copyToClipboard(\'{path}\')">{displayed_text}</a>'
+    return f'{copy_to_clipboard_js}<a href="#" title="{path}" onclick="copyToClipboard(\'{path}\', event)">{displayed_text}</a>'
 
 
 def _get_default_multi_index() -> pd.MultiIndex:
@@ -1458,6 +1501,72 @@ def get_arguments(func: Callable | object) -> list[str]:
     )
 
 
+def get_schema(file) -> pyarrow.Schema:
+    try:
+        return pq.read_schema(file)
+    except (
+        PermissionError,
+        pyarrow.ArrowInvalid,
+        FileNotFoundError,
+        IsADirectoryError,
+        OSError,
+    ):
+        # try:
+        #     return ds.dataset(file).schema
+        # except (TypeError, FileNotFoundError) as e:
+        if not hasattr(file, "file_system"):
+            raise e
+
+        file_system = file.file_system
+
+        def _get_schema(path):
+            try:
+                return pq.read_schema(path)
+            except FileNotFoundError:
+                with file_system.open(path, "rb") as f:
+                    return pq.read_schema(f)
+
+        with ThreadPoolExecutor() as executor:
+            return pyarrow.unify_schemas(
+                list(
+                    executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
+                ),
+                promote_options="permissive",
+            )
+
+
+def get_num_rows(file):
+    try:
+        return pq.read_metadata(file).num_rows
+    except (
+        PermissionError,
+        pyarrow.ArrowInvalid,
+        FileNotFoundError,
+        TypeError,
+        OSError,
+    ) as e:
+        try:
+            return ds.dataset(file).count_rows()
+        except Exception as e2:
+            if not hasattr(file, "glob"):
+                raise e2 from 2
+
+            def _get_num_rows(path):
+                with path.open("rb") as file:
+                    return pq.read_metadata(file).num_rows
+
+            with ThreadPoolExecutor() as executor:
+                return sum(executor.map(_get_num_rows, file.glob("**").files))
+
+
+def get_shape(file) -> tuple[int, int]:
+    schema = get_schema(file)
+    index_cols = _get_index_cols(schema)
+    ncol: int = sum(name not in index_cols for name in schema.names)
+    nrow: int = get_num_rows(file)
+    return nrow, ncol
+
+
 def read_nrows(file, nrow: int) -> pd.DataFrame:
     """Read first n rows of a parquet file."""
     rows = next(pq.ParquetFile(file).iter_batches(nrow))
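
A short usage sketch (not part of the diff) of the new module-level helpers; the path is invented, and the directory fallback is the behaviour described by the code above:

    from daplapath import LocalFileSystem, Path

    # Illustrative only: .schema and .shape now delegate to get_schema()/get_shape(),
    # which fall back to reading the individual .parquet files when the path is a
    # partitioned directory rather than a single file.
    p = Path("/tmp/data/partitioned_table.parquet", file_system=LocalFileSystem())
    schema = p.schema       # pyarrow.Schema, unified across partition files if needed
    nrow, ncol = p.shape    # (row count, non-index column count) via get_shape()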
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "daplapath"
-version = "2.0.3"
+version = "2.0.6"
 description = "A pathlib.Path class for dapla"
 authors = ["ort <ort@ssb.no>"]
 license = "MIT"