daplapath 2.1.2__tar.gz → 2.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: daplapath
3
- Version: 2.1.2
3
+ Version: 2.1.4
4
4
  Summary: A pathlib.Path class for dapla
5
5
  License: MIT
6
6
  Author: ort
@@ -0,0 +1 @@
1
+ from .path import LocalFileSystem, Path, PathSeries, config
@@ -1,30 +1,30 @@
1
1
  from __future__ import annotations
2
- from dataclasses import dataclass
2
+
3
+ import datetime
3
4
  import functools
4
- from collections.abc import Iterable
5
- from concurrent.futures import ThreadPoolExecutor
6
5
  import glob
6
+ import inspect
7
+ import io
8
+ import itertools
7
9
  import json
10
+ import os
8
11
  import pathlib
9
- from pathlib import PurePosixPath, PurePath
10
12
  import re
11
- import io
12
- import os
13
13
  import shutil
14
- from typing import Callable, Any
15
- import inspect
16
- import itertools
14
+ from collections.abc import Callable, Iterable
15
+ from concurrent.futures import ThreadPoolExecutor
16
+ from dataclasses import dataclass
17
+ from pathlib import PurePath
18
+ from typing import Any
17
19
 
18
- from fsspec.spec import AbstractFileSystem
19
- import datetime
20
20
  import numpy as np
21
21
  import pandas as pd
22
22
  import pandas.io.formats.format as fmt
23
- from pandas.api.types import is_dict_like
24
23
  import pyarrow
25
- import pyarrow.parquet as pq
26
24
  import pyarrow.dataset as ds
27
-
25
+ import pyarrow.parquet as pq
26
+ from fsspec.spec import AbstractFileSystem
27
+ from pandas.api.types import is_dict_like
28
28
 
29
29
  try:
30
30
  import gcsfs
@@ -48,10 +48,18 @@ PERIOD_PREFIX = "_p"
48
48
  INDEX_NAMES = ["timestamp", "mb", "type"]
49
49
 
50
50
 
51
- @dataclass
51
+ @dataclass(slots=True)
52
52
  class Config:
53
- __slots__ = ("file_system",)
54
- file_system: Callable
53
+ fs: Callable
54
+ team: str | None
55
+ env: str
56
+ default_protocol: str = "gs"
57
+ bucket_pattern: str = "{default_protocol}://ssb-{team}-data-{bucket}-prod"
58
+
59
+ def __getitem__(self, key: str) -> Any:
60
+ if not hasattr(self, key):
61
+ raise KeyError(key)
62
+ return getattr(self, key)
55
63
 
56
64
 
57
65
  class LocalFileSystem(AbstractFileSystem):
@@ -72,7 +80,7 @@ class LocalFileSystem(AbstractFileSystem):
72
80
  if not detail:
73
81
  return list(relevant_paths)
74
82
  with ThreadPoolExecutor() as executor:
75
- return list(executor.map(get_file_info, relevant_paths))
83
+ return {x["name"]: x for x in executor.map(get_file_info, relevant_paths)}
76
84
 
77
85
  @classmethod
78
86
  def ls(cls, path: str, detail: bool = False, **kwargs):
@@ -109,6 +117,10 @@ class LocalFileSystem(AbstractFileSystem):
109
117
  def rm_file(path: str, *args, **kwargs) -> None:
110
118
  return os.remove(path, *args, **kwargs)
111
119
 
120
+ @staticmethod
121
+ def rm(path: str, *args, **kwargs) -> None:
122
+ return os.remove(path, *args, **kwargs)
123
+
112
124
  @staticmethod
113
125
  def rmdir(path: str, *args, **kwargs) -> None:
114
126
  return shutil.rmtree(path, *args, **kwargs)
@@ -139,9 +151,19 @@ class MyGCSFileSystem(gcsfs.GCSFileSystem):
139
151
 
140
152
 
141
153
  if any("dapla" in key.lower() for key in os.environ) and "gcsfs" in locals():
142
- _config = Config(MyGCSFileSystem)
154
+ _fs = MyGCSFileSystem()
143
155
  else:
144
- _config = Config(LocalFileSystem)
156
+ _fs = LocalFileSystem()
157
+
158
+ config = Config(
159
+ fs=_fs,
160
+ team=os.environ.get("DAPLA_GROUP_CONTEXT", "")
161
+ .replace("-developers", "")
162
+ .replace("-data-admins", "")
163
+ or None,
164
+ env=os.environ.get("DAPLA_ENVIRONMENT", "prod").lower(),
165
+ )
166
+ del _fs
145
167
 
146
168
 
147
169
  class Tree:
@@ -177,12 +199,12 @@ class _PathBase:
177
199
  _period_prefix: str = PERIOD_PREFIX
178
200
 
179
201
  @staticmethod
180
- def set_option(pat: str, value: Any) -> None:
202
+ def set_config(pat: str, value: Any) -> None:
181
203
  """Change config variable."""
182
- setattr(_config, pat, value)
204
+ setattr(config, pat, value)
183
205
 
184
206
  @property
185
- def _file_system_constructor(self) -> Callable | type:
207
+ def _fs_constructor(self) -> Callable | type:
186
208
  """Can be overridden in subclass.
187
209
 
188
210
  Must return a function or a class that, when called,
@@ -191,13 +213,13 @@ class _PathBase:
191
213
  The 'info' method should return a dict like with at least the keys
192
214
  'updated', 'size', 'name' and 'type'.
193
215
  """
194
- return _config.file_system
216
+ return config.fs
195
217
 
196
218
 
197
219
  class Path(str, _PathBase):
198
220
  """Path object that works like a string, with methods for working with the GCS file system."""
199
221
 
200
- _file_system_attrs: set[str] = {
222
+ _fs_attrs: set[str] = {
201
223
  "info",
202
224
  "isdir",
203
225
  "open",
@@ -214,35 +236,71 @@ class Path(str, _PathBase):
214
236
  return PathSeries
215
237
 
216
238
  @staticmethod
217
- def _standardize_path(path: str | PurePosixPath) -> str:
239
+ def _standardize_path(path: str | PurePath) -> str:
218
240
  """Make sure delimiter is '/' and path ends without '/'."""
219
- return (
220
- str(path)
221
- .replace("\\", "/")
222
- .replace(r"\"", "/")
223
- .replace("//", "/")
224
- .rstrip("/")
225
- )
241
+ return str(path).replace("\\", "/").replace(r"\"", "/")
226
242
 
227
- def __new__(cls, gcs_path: str | PurePath | None = None, file_system=None):
243
+ def __new__(cls, gcs_path: str | os.PathLike | None = None, fs=None):
228
244
  """Construct Path with '/' as delimiter."""
229
245
  gcs_path = cls._standardize_path(gcs_path or "")
230
246
  obj = super().__new__(cls, gcs_path)
231
- obj._path = PurePosixPath(obj)
232
- obj._file_system = file_system
247
+ if fs is not None:
248
+ obj._fs = fs
249
+ elif gcs_path.startswith("/buckets"):
250
+ obj._fs = LocalFileSystem()
251
+ else:
252
+ obj._fs = config.fs
233
253
  return obj
234
254
 
235
255
  def buckets_path(self) -> "Path":
236
- if self.startswith("/buckets"):
237
- return self
238
-
239
- root = self.parts[0]
240
- bucket = root.split("-data-")[-1].split("-prod")[0]
241
-
242
256
  try:
243
- return self._new(f"/buckets/{bucket}/{'/'.join(self.parts[1:])}")
244
- except IndexError:
245
- return self._new(f"/buckets/{bucket}")
257
+ protocol, _ = str(self).split("://")
258
+ except ValueError:
259
+ protocol = ""
260
+ root, *subdirs = str(self).replace(f"{protocol}://", "").split("/")
261
+ bucket = root.split("-data-")[-1].split(f"-{config.env}")[0]
262
+ if config.team in root:
263
+ new_root = "/buckets"
264
+ team = config.team
265
+ else:
266
+ team = root.split("-data-")[0]
267
+ team = team.lstrip(team.split("-")[0]).strip("-")
268
+ bucket = bucket.replace("delt-", "")
269
+ new_root = f"/buckets/shared/{team}"
270
+
271
+ subdirs = "/".join(subdirs).strip("/")
272
+ if subdirs:
273
+ return self.__class__(f"{new_root}/{bucket}/{subdirs}", self.fs)
274
+ else:
275
+ return self.__class__(f"{new_root}/{bucket}", self.fs)
276
+
277
+ def gs_path(self) -> "Path":
278
+ if not str(self).startswith("/buckets/"):
279
+ raise ValueError(
280
+ f"Can only convert paths starting with '/buckets/' to GCS path. Got {self}"
281
+ )
282
+ if "/shared/" in str(self):
283
+ team, bucket, *subdirs = str(self).split("/shared/")[1].split("/")
284
+ bucket = "delt-" + bucket
285
+ elif not config.team:
286
+ raise ValueError(
287
+ "Must set config.team (hint: from daplapath import config; config.team = 'name')"
288
+ )
289
+ else:
290
+ team = config.team
291
+ bucket, *subdirs = str(self).replace("/buckets/", "").split("/")
292
+
293
+ gspath = (
294
+ config.bucket_pattern.format(
295
+ team=team,
296
+ bucket=bucket,
297
+ env=config.env,
298
+ default_protocol=config.default_protocol,
299
+ )
300
+ + "/"
301
+ + "/".join(subdirs)
302
+ )
303
+ return self.__class__(gspath, self.fs)
246
304
 
247
305
  def tree(
248
306
  self,
@@ -274,7 +332,6 @@ class Path(str, _PathBase):
274
332
  self, pattern: str | None = None, recursive: bool = True, **kwargs
275
333
  ) -> "PathSeries":
276
334
  """Create PathSeries of files/directories that match the pattern."""
277
-
278
335
  recursive = kwargs.get("recurse_symlinks", recursive)
279
336
 
280
337
  if pattern:
@@ -290,17 +347,17 @@ class Path(str, _PathBase):
290
347
 
291
348
  kwargs["detail"] = True
292
349
 
293
- if "recursive" in get_arguments(self.file_system.glob):
350
+ if "recursive" in get_arguments(self.fs.glob):
294
351
  kwargs["recursive"] = recursive
295
352
  else:
296
- # try to set to non-recursive if file_system.glob allows argument 'maxdepth'
353
+ # try to set to non-recursive if fs.glob allows argument 'maxdepth'
297
354
  kwargs["maxdepth"] = None if recursive else 1
298
355
 
299
356
  try:
300
- info: list[dict] | dict = self.file_system.glob(pattern, **kwargs)
357
+ info: list[dict] | dict = self.fs.glob(pattern, **kwargs)
301
358
  except TypeError:
302
359
  kwargs.pop("maxdepth", None)
303
- info: list[dict] | dict = self.file_system.glob(pattern, **kwargs)
360
+ info: list[dict] | dict = self.fs.glob(pattern, **kwargs)
304
361
 
305
362
  if isinstance(info, dict):
306
363
  # file system can return single dict if only one file path
@@ -324,10 +381,17 @@ class Path(str, _PathBase):
324
381
  """
325
382
  return self.glob("**", recursive=recursive, **kwargs)
326
383
 
384
+ def unlink(self, missing_ok: bool = False) -> None:
385
+ if not self.exists():
386
+ if not missing_ok:
387
+ raise FileNotFoundError(str(self))
388
+ return
389
+ return self.fs.rm(recursive=False)
390
+
327
391
  def rmdir(self) -> None:
328
392
  files = self.glob("**").files
329
393
  with ThreadPoolExecutor() as executor:
330
- list(executor.map(self.file_system.rm_file, files))
394
+ list(executor.map(self.fs.rm_file, files))
331
395
 
332
396
  def cp(self, destination: "Path | str") -> "Path":
333
397
  return self._cp_or_mv(destination, "cp")
@@ -337,28 +401,25 @@ class Path(str, _PathBase):
337
401
  out_path = self._cp_or_mv(destination, "mv")
338
402
  if was_dir:
339
403
  try:
340
- self.file_system.rmdir(str(self))
404
+ self.fs.rmdir(str(self))
341
405
  except (FileNotFoundError, NotADirectoryError):
342
406
  pass
343
407
  return out_path
344
408
 
345
- def read_text(self, *args, **kwargs):
346
- return self._path.read_text(*args, **kwargs)
347
-
348
409
  def versions(self, include_versionless: bool = False) -> "PathSeries":
349
410
  """Returns a PathSeries of all versions of the file."""
350
- files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
411
+ files_in_folder: Iterable[Path] = self.parent.glob(
412
+ f"*{self.suffix}", recursive=False
413
+ )
351
414
 
352
415
  if self.version_number:
353
- start, _, end = re.split(self._version_pattern, self)
416
+ start, *_ = re.split(self._version_pattern, self.name)
354
417
  else:
355
- start, end = self.stem, self.suffix
418
+ start = self.stem
356
419
 
357
420
  # create boolean mask. With numpy to make it work with both pandas and list
358
421
  arr = np.array(files_in_folder)
359
- is_version_of_this_file = (np_str_contains(arr, start)) & (
360
- np_str_endswith(arr, end)
361
- )
422
+ is_version_of_this_file = np_str_contains(arr, start)
362
423
  if not include_versionless:
363
424
  is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
364
425
 
@@ -382,7 +443,7 @@ class Path(str, _PathBase):
382
443
  Lists files in the parent directory with the same versionless stem
383
444
  and selects the one with the highest version number.
384
445
 
385
- Returns
446
+ Returns:
386
447
  -------
387
448
  A Path.
388
449
  """
@@ -405,11 +466,11 @@ class Path(str, _PathBase):
405
466
  Minutes needed between the timestamp of the current highest
406
467
  numbered version.
407
468
 
408
- Returns
469
+ Returns:
409
470
  ------
410
471
  A Path with a new version number.
411
472
 
412
- Raises
473
+ Raises:
413
474
  ------
414
475
  ValueError:
415
476
  If the method is run before the timeout period is up.
@@ -425,7 +486,7 @@ class Path(str, _PathBase):
425
486
  time_should_be_at_least = pd.Timestamp.now(tz="Europe/Oslo").replace(
426
487
  tzinfo=None
427
488
  ).round("s") - pd.Timedelta(minutes=timeout)
428
- if timestamp > time_should_be_at_least:
489
+ if timestamp is not None and timestamp > time_should_be_at_least:
429
490
  raise ValueError(
430
491
  f"Latest version of the file was updated {timestamp[0]}, which "
431
492
  f"is less than the timeout period of {timeout} minutes. "
@@ -439,7 +500,7 @@ class Path(str, _PathBase):
439
500
  def with_version(self, version: int | None) -> "Path":
440
501
  """Replace the Path's version number, if any, with a new version number.
441
502
 
442
- Examples
503
+ Examples:
443
504
  --------
444
505
  >>> Path('file.parquet').with_version(1)
445
506
  'file_v1.parquet'
@@ -456,13 +517,13 @@ class Path(str, _PathBase):
456
517
  self, include_versionless: bool = False
457
518
  ) -> "PathSeries":
458
519
  """Returns a PathSeries of all periods of the file."""
459
- files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
520
+ files_in_folder: Iterable[Path] = self.parent.glob(
521
+ f"*{self.suffix}", recursive=False
522
+ )
460
523
 
461
524
  # create boolean mask. With numpy to make it work with both pandas and list
462
525
  arr = np.array(files_in_folder)
463
- is_version_of_this_file = (
464
- np_str_contains(arr, self.periodless_stem)
465
- ) & np_str_endswith(arr, self.suffix)
526
+ is_version_of_this_file = np_str_contains(arr, self.periodless_stem)
466
527
  if not include_versionless:
467
528
  is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
468
529
 
@@ -486,12 +547,12 @@ class Path(str, _PathBase):
486
547
  Lists files in the parent directory with the same
487
548
  versionless and periodless stem and selects the path that sorts last.
488
549
 
489
- Raises
550
+ Raises:
490
551
  ------
491
552
  ValueError: If there is mismatch in period patterns, e.g. if one
492
553
  path has the period "2020-01-01" and one path has "2021".
493
554
 
494
- Returns
555
+ Returns:
495
556
  -------
496
557
  A Path.
497
558
  """
@@ -500,14 +561,14 @@ class Path(str, _PathBase):
500
561
  include_versionless=False
501
562
  )
502
563
  sorted_paths = sort_by_period(period_paths)
503
- return next(iter(reversed(sorted_paths)))
564
+ return list(sorted_paths)[-1]
504
565
  except (IndexError, StopIteration) as e:
505
566
  raise FileNotFoundError(self) from e
506
567
 
507
568
  def with_period(self, period: str) -> "Path":
508
569
  """Replace the Path's period, if any, with a new periods.
509
570
 
510
- Examples
571
+ Examples:
511
572
  --------
512
573
  >>> Path('file_v1.parquet').with_period("2024-01-01")
513
574
  'file_p2024-01-01_v1.parquet'
@@ -523,7 +584,7 @@ class Path(str, _PathBase):
523
584
  def with_periods(self, from_period: str, to_period: str | None = None) -> "Path":
524
585
  """Replace the Path's period, if any, with one or two new periods.
525
586
 
526
- Examples
587
+ Examples:
527
588
  --------
528
589
  >>> Path('file_v1.parquet').with_periods("2024-01-01")
529
590
  'file_p2024-01-01_v1.parquet'
@@ -591,46 +652,57 @@ class Path(str, _PathBase):
591
652
  @property
592
653
  def periodless_stem(self) -> str:
593
654
  """Return the file stem before the period pattern."""
594
- return str(re.sub(f"{self._period_pattern}.*", "", self._path.stem))
655
+ return str(re.sub(f"{self._period_pattern}.*", "", self.stem))
595
656
 
596
657
  @property
597
658
  def versionless_stem(self) -> str:
598
659
  """Return the file stem before the version pattern."""
599
- return self._new(re.split(self._version_pattern, self._path.name)[0]).stem
660
+ return self._new(re.split(self._version_pattern, self.name)[0]).stem
600
661
 
601
662
  @property
602
663
  def parent(self) -> "Path":
603
664
  """Parent path."""
604
- return self._new(self._path.parent)
665
+ return self._new("/".join(self.split("/")[:-1]))
605
666
 
606
667
  @property
607
668
  def parents(self) -> "list[Path]":
608
669
  """Parent path."""
609
- return [self._new(parent) for parent in self._path.parents]
670
+ no_protocol = self.split("://")[-1]
671
+ return [
672
+ self._new("/".join(no_protocol.split("/")[:i]))
673
+ for i in range(no_protocol.count("/"))
674
+ ][::-1]
610
675
 
611
676
  @property
612
677
  def name(self) -> str:
613
678
  """Final part of the path."""
614
- return self._path.name
679
+ return self.split("/")[-1]
615
680
 
616
681
  @property
617
682
  def stem(self) -> str:
618
683
  """File name without the suffix"""
619
- return self._path.stem
684
+ return self.split("/")[-1].replace(self.suffix, "")
620
685
 
621
686
  @property
622
687
  def parts(self) -> tuple[str]:
623
- return self._path.parts
688
+ no_protocol = self.split("://")[-1]
689
+ return tuple(no_protocol.split("/"))
624
690
 
625
691
  @property
626
692
  def suffix(self) -> str:
627
693
  """Final file path suffix."""
628
- return self._path.suffix
694
+ name = self.name
695
+ if "." not in name:
696
+ return ""
697
+ return "." + (name).split(".")[-1]
629
698
 
630
699
  @property
631
700
  def suffixes(self) -> list[str]:
632
701
  """File path suffixes, if multiple."""
633
- return self._path.suffixes
702
+ name = self.name
703
+ if "." not in name:
704
+ return []
705
+ return ["." + suff for suff in (name).split(".")[1:]]
634
706
 
635
707
  @property
636
708
  def index_column_names(self) -> list[str]:
@@ -691,9 +763,14 @@ class Path(str, _PathBase):
691
763
  try:
692
764
  info = self._info
693
765
  except AttributeError:
694
- info = self.file_system.info(self)
766
+ info = self.fs.info(self)
695
767
  self._info = info
696
- return _get_timestamps(info["updated"])
768
+ try:
769
+ return _get_timestamps(info["updated"])
770
+ except Exception as e:
771
+ if not self.exists() or self.is_dir():
772
+ return None
773
+ raise e
697
774
 
698
775
  @property
699
776
  def type(self) -> str:
@@ -705,7 +782,7 @@ class Path(str, _PathBase):
705
782
  try:
706
783
  info = self._info
707
784
  except AttributeError:
708
- info = self.file_system.info(self)
785
+ info = self.fs.info(self)
709
786
  self._info = info
710
787
  return info["size"]
711
788
 
@@ -770,29 +847,29 @@ class Path(str, _PathBase):
770
847
  return self.isdir()
771
848
 
772
849
  def with_suffix(self, suffix: str):
773
- return self._new(self._path.with_suffix(suffix))
850
+ return self._new(self.replace(self.suffix, suffix))
774
851
 
775
852
  def with_name(self, new_name: str):
776
- return self._new(self._path.with_name(new_name))
853
+ return self._new(self.replace(self.name, new_name))
777
854
 
778
- def with_stem(self, new_with_stem: str):
779
- return self._new(self._path.with_stem(new_with_stem))
855
+ def with_stem(self, new_stem: str):
856
+ return self._new(self.replace(self.stem, new_stem))
780
857
 
781
858
  @property
782
- def file_system(self):
783
- if self._file_system is None:
784
- self._file_system = self._file_system_constructor()
785
- return self._file_system
859
+ def fs(self):
860
+ if self._fs is None:
861
+ self._fs = self._fs_constructor()
862
+ return self._fs
786
863
 
787
- @file_system.setter
788
- def file_system(self, val):
789
- self._file_system = val
790
- return self._file_system
864
+ @fs.setter
865
+ def fs(self, val):
866
+ self._fs = val
867
+ return self._fs
791
868
 
792
869
  def __truediv__(self, other: str | os.PathLike | PurePath) -> "Path":
793
870
  """Append a string or Path to the path with a forward slash.
794
871
 
795
- Example
872
+ Example:
796
873
  -------
797
874
  >>> folder = 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2023'
798
875
  >>> file_path = folder / "ABAS_kommune_flate_p2023_v1.parquet"
@@ -807,13 +884,12 @@ class Path(str, _PathBase):
807
884
  return self._new(f"{self}/{as_str(other)}")
808
885
 
809
886
  def __getattribute__(self, name):
810
- """stackoverflow hack to ensure we return Path when using string methods.
887
+ """Stackoverflow hack to ensure we return Path when using string methods.
811
888
 
812
889
  It works for all but the string magigmethods, importantly __add__.
813
890
  """
814
-
815
891
  # skip magic methods
816
- if name not in dir(str) or name.startswith("__") and name.endswith("__"):
892
+ if name not in dir(str) or (name.startswith("__") and name.endswith("__")):
817
893
  return super().__getattribute__(name)
818
894
 
819
895
  def method(self, *args, **kwargs):
@@ -832,35 +908,35 @@ class Path(str, _PathBase):
832
908
  return method.__get__(self)
833
909
 
834
910
  def __getattr__(self, attr: str) -> Any:
835
- """Get file_system attribute."""
911
+ """Get fs attribute."""
836
912
  error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
837
913
  if attr.startswith("_"):
838
914
  raise AttributeError(error_message)
839
- if attr not in self._file_system_attrs:
915
+ if attr not in self._fs_attrs:
840
916
  raise AttributeError(error_message)
841
- return functools.partial(getattr(self.file_system, attr), self)
917
+ return functools.partial(getattr(self.fs, attr), self)
842
918
 
843
919
  def __fspath__(self) -> str:
844
920
  return str(self)
845
921
 
846
922
  def __dir__(self) -> list[str]:
847
- return list(sorted({x for x in dir(Path)} | self._file_system_attrs))
923
+ return list(sorted({x for x in dir(Path)} | self._fs_attrs))
848
924
 
849
925
  def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
850
926
  series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
851
927
  for path in series:
852
- path._file_system = self._file_system
928
+ path._fs = self._fs
853
929
  return self._iterable_type(series, **kwargs)
854
930
 
855
931
  def _new(self, new_path: str | Path) -> "Path":
856
- return self.__class__(new_path, self.file_system)
932
+ return self.__class__(new_path, self.fs)
857
933
 
858
934
  def _cp_or_mv(self, destination: "Path | str", attr: str) -> "Path":
859
- func: Callable = getattr(self.file_system, attr)
935
+ func: Callable = getattr(self.fs, attr)
860
936
  try:
861
937
  func(self, destination)
862
938
  except FileNotFoundError:
863
- destination = self.__class__(destination)
939
+ destination = self.__class__(destination, self.fs)
864
940
  sources = list(self.glob("**").files)
865
941
  destinations = [path.replace(self, destination) for path in sources]
866
942
  with ThreadPoolExecutor() as executor:
@@ -921,7 +997,7 @@ class PathSeries(pd.Series, _PathBase):
921
997
  names: Series
922
998
  The names of the file paths.
923
999
 
924
- Methods
1000
+ Methods:
925
1001
  -------
926
1002
  tree():
927
1003
  con
@@ -961,18 +1037,24 @@ class PathSeries(pd.Series, _PathBase):
961
1037
  data is not None
962
1038
  and len(data)
963
1039
  and not (
964
- isinstance(data, pd.Series)
965
- and len(data.index.names) == len(self._index_names)
966
- or isinstance(index, pd.MultiIndex)
967
- and len(index.names) == len(self._index_names)
1040
+ (
1041
+ isinstance(data, pd.Series)
1042
+ and len(data.index.names) == len(self._index_names)
1043
+ )
1044
+ or (
1045
+ isinstance(index, pd.MultiIndex)
1046
+ and len(index.names) == len(self._index_names)
1047
+ )
968
1048
  # dict with e.g. tuple keys, turned into MultiIndex
969
- or is_dict_like(data)
970
- and all(len(key) == len(self._index_names) for key in data.keys())
1049
+ or (
1050
+ is_dict_like(data)
1051
+ and all(len(key) == len(self._index_names) for key in data.keys())
1052
+ )
971
1053
  )
972
1054
  )
973
1055
  if should_construct_index:
974
- file_system = kwargs.get("file_system", self._file_system_constructor())
975
- data = _get_paths_and_index([file_system.info(path) for path in data])
1056
+ fs = kwargs.get("fs", self._fs_constructor())
1057
+ data = _get_paths_and_index([fs.info(path) for path in data])
976
1058
 
977
1059
  super().__init__(data, index=index, **kwargs)
978
1060
 
@@ -1338,7 +1420,7 @@ def _pathseries_constructor_with_fallback(
1338
1420
  max_parts: int | None = 2,
1339
1421
  path_series_type: type | None = None,
1340
1422
  **kwargs,
1341
- ) -> "PathSeries | pd.Series":
1423
+ ) -> PathSeries | pd.Series:
1342
1424
  path_series_type = path_series_type or PathSeries
1343
1425
 
1344
1426
  kwargs["name"] = kwargs.pop("name", "path")
@@ -1372,7 +1454,7 @@ def _pathseries_constructor_with_fallback(
1372
1454
  return series
1373
1455
 
1374
1456
 
1375
- def _dataframe_constructor(data=None, index=None, **kwargs) -> "pd.DataFrame":
1457
+ def _dataframe_constructor(data=None, index=None, **kwargs) -> pd.DataFrame:
1376
1458
  data.name = "path"
1377
1459
  return pd.DataFrame(data, index=index, **kwargs)
1378
1460
 
@@ -1396,16 +1478,15 @@ def split_path_and_make_copyable_html(
1396
1478
  split: Text pattern to split the path on. Defaults to "/".
1397
1479
  display_prefix: The text to display instead of the parent directory. Defaults to ".../".
1398
1480
 
1399
- Returns
1481
+ Returns:
1400
1482
  -------
1401
1483
  A string that holds the HTML and JavaScript code to be passed to IPython.display.display.
1402
1484
  """
1403
-
1404
- copy_to_clipboard_js = f"""<script>
1405
- function copyToClipboard(text, event) {{
1485
+ copy_to_clipboard_js = """<script>
1486
+ function copyToClipboard(text, event) {
1406
1487
  event.preventDefault();
1407
1488
  navigator.clipboard.writeText(text)
1408
- .then(() => {{
1489
+ .then(() => {
1409
1490
  const alertBox = document.createElement('div');
1410
1491
  const selection = window.getSelection();
1411
1492
 
@@ -1418,14 +1499,14 @@ function copyToClipboard(text, event) {{
1418
1499
  alertBox.innerHTML = 'Copied to clipboard';
1419
1500
  document.body.appendChild(alertBox);
1420
1501
 
1421
- setTimeout(function() {{
1502
+ setTimeout(function() {
1422
1503
  alertBox.style.display = 'none';
1423
- }}, 1500); // 1.5 seconds
1424
- }})
1425
- .catch(err => {{
1504
+ }, 1500); // 1.5 seconds
1505
+ })
1506
+ .catch(err => {
1426
1507
  console.error('Could not copy text: ', err);
1427
- }});
1428
- }}
1508
+ });
1509
+ }
1429
1510
  </script>"""
1430
1511
 
1431
1512
  if split is not None:
@@ -1640,28 +1721,28 @@ def get_schema(file) -> pyarrow.Schema:
1640
1721
  # try:
1641
1722
  # return ds.dataset(file).schema
1642
1723
  # except (TypeError, FileNotFoundError) as e:
1643
- if not hasattr(file, "file_system"):
1724
+ if not hasattr(file, "fs"):
1644
1725
  raise e
1645
1726
 
1646
- file_system = file.file_system
1727
+ fs = file.fs
1647
1728
 
1648
1729
  def _get_schema(path):
1649
1730
  try:
1650
1731
  return pq.read_schema(path)
1651
1732
  except FileNotFoundError as e:
1652
1733
  try:
1653
- with file_system.open(path, "rb") as f:
1734
+ with fs.open(path, "rb") as f:
1654
1735
  return pq.read_schema(f)
1655
1736
  except Exception as e2:
1656
1737
  raise e2.__class__(f"{e2}. {path}") from e
1657
1738
 
1658
- child_paths = file_system.glob(file + "/**/*.parquet")
1739
+ child_paths = fs.glob(file + "/**/*.parquet")
1659
1740
  if not len(child_paths):
1660
1741
  raise e.__class__(f"{e}: {file}") from e
1661
1742
 
1662
1743
  with ThreadPoolExecutor() as executor:
1663
1744
  schemas: list[pyarrow.Schema] = list(
1664
- executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
1745
+ executor.map(_get_schema, fs.glob(file + "/**/*.parquet"))
1665
1746
  )
1666
1747
  if not schemas:
1667
1748
  raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
@@ -1740,15 +1821,14 @@ def sort_by_period(paths: Iterable[str]) -> Iterable[str]:
1740
1821
  except ValueError:
1741
1822
  # select last period
1742
1823
  periods = [pd.Timestamp(next(iter(reversed(path.periods)))) for path in paths]
1743
- combined = list(zip(periods, range(len(paths)), paths, strict=True))
1824
+ combined = list(zip(periods, paths, list(range(len(paths))), strict=True))
1744
1825
  combined.sort()
1745
- indices: list[int] = [x[1] for x in combined]
1826
+ indices: list[int] = [x[2] for x in combined]
1746
1827
  try:
1747
1828
  return paths.iloc[indices]
1748
1829
  except AttributeError:
1749
- return paths.__class__([x[2] for x in combined])
1830
+ return paths.__class__([x[1] for x in combined])
1750
1831
 
1751
1832
 
1752
1833
  np_str_contains: Callable = np.vectorize(str.__contains__)
1753
- np_str_endswith: Callable = np.vectorize(str.endswith)
1754
1834
  np_str_matches: Callable = np.vectorize(lambda txt, pat: bool(re.search(pat, txt)))
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "daplapath"
3
- version = "2.1.2"
3
+ version = "2.1.4"
4
4
  description = "A pathlib.Path class for dapla"
5
5
  authors = ["ort <ort@ssb.no>"]
6
6
  license = "MIT"
@@ -1,3 +0,0 @@
1
- from .path import Path
2
- from .path import PathSeries
3
- from .path import LocalFileSystem
File without changes
File without changes