daplapath 2.1.3__tar.gz → 2.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: daplapath
3
- Version: 2.1.3
3
+ Version: 2.1.4
4
4
  Summary: A pathlib.Path class for dapla
5
5
  License: MIT
6
6
  Author: ort
@@ -0,0 +1 @@
1
+ from .path import LocalFileSystem, Path, PathSeries, config
@@ -1,30 +1,30 @@
1
1
  from __future__ import annotations
2
- from dataclasses import dataclass
2
+
3
+ import datetime
3
4
  import functools
4
- from collections.abc import Iterable
5
- from concurrent.futures import ThreadPoolExecutor
6
5
  import glob
6
+ import inspect
7
+ import io
8
+ import itertools
7
9
  import json
10
+ import os
8
11
  import pathlib
9
- from pathlib import PurePosixPath, PurePath
10
12
  import re
11
- import io
12
- import os
13
13
  import shutil
14
- from typing import Callable, Any
15
- import inspect
16
- import itertools
14
+ from collections.abc import Callable, Iterable
15
+ from concurrent.futures import ThreadPoolExecutor
16
+ from dataclasses import dataclass
17
+ from pathlib import PurePath
18
+ from typing import Any
17
19
 
18
- from fsspec.spec import AbstractFileSystem
19
- import datetime
20
20
  import numpy as np
21
21
  import pandas as pd
22
22
  import pandas.io.formats.format as fmt
23
- from pandas.api.types import is_dict_like
24
23
  import pyarrow
25
- import pyarrow.parquet as pq
26
24
  import pyarrow.dataset as ds
27
-
25
+ import pyarrow.parquet as pq
26
+ from fsspec.spec import AbstractFileSystem
27
+ from pandas.api.types import is_dict_like
28
28
 
29
29
  try:
30
30
  import gcsfs
@@ -48,10 +48,18 @@ PERIOD_PREFIX = "_p"
48
48
  INDEX_NAMES = ["timestamp", "mb", "type"]
49
49
 
50
50
 
51
- @dataclass
51
+ @dataclass(slots=True)
52
52
  class Config:
53
- __slots__ = ("file_system",)
54
- file_system: Callable
53
+ fs: Callable
54
+ team: str | None
55
+ env: str
56
+ default_protocol: str = "gs"
57
+ bucket_pattern: str = "{default_protocol}://ssb-{team}-data-{bucket}-prod"
58
+
59
+ def __getitem__(self, key: str) -> Any:
60
+ if not hasattr(self, key):
61
+ raise KeyError(key)
62
+ return getattr(self, key)
55
63
 
56
64
 
57
65
  class LocalFileSystem(AbstractFileSystem):
@@ -72,7 +80,7 @@ class LocalFileSystem(AbstractFileSystem):
72
80
  if not detail:
73
81
  return list(relevant_paths)
74
82
  with ThreadPoolExecutor() as executor:
75
- return list(executor.map(get_file_info, relevant_paths))
83
+ return {x["name"]: x for x in executor.map(get_file_info, relevant_paths)}
76
84
 
77
85
  @classmethod
78
86
  def ls(cls, path: str, detail: bool = False, **kwargs):
@@ -109,6 +117,10 @@ class LocalFileSystem(AbstractFileSystem):
109
117
  def rm_file(path: str, *args, **kwargs) -> None:
110
118
  return os.remove(path, *args, **kwargs)
111
119
 
120
+ @staticmethod
121
+ def rm(path: str, *args, **kwargs) -> None:
122
+ return os.remove(path, *args, **kwargs)
123
+
112
124
  @staticmethod
113
125
  def rmdir(path: str, *args, **kwargs) -> None:
114
126
  return shutil.rmtree(path, *args, **kwargs)
@@ -139,9 +151,19 @@ class MyGCSFileSystem(gcsfs.GCSFileSystem):
139
151
 
140
152
 
141
153
  if any("dapla" in key.lower() for key in os.environ) and "gcsfs" in locals():
142
- _config = Config(MyGCSFileSystem)
154
+ _fs = MyGCSFileSystem()
143
155
  else:
144
- _config = Config(LocalFileSystem)
156
+ _fs = LocalFileSystem()
157
+
158
+ config = Config(
159
+ fs=_fs,
160
+ team=os.environ.get("DAPLA_GROUP_CONTEXT", "")
161
+ .replace("-developers", "")
162
+ .replace("-data-admins", "")
163
+ or None,
164
+ env=os.environ.get("DAPLA_ENVIRONMENT", "prod").lower(),
165
+ )
166
+ del _fs
145
167
 
146
168
 
147
169
  class Tree:
@@ -177,12 +199,12 @@ class _PathBase:
177
199
  _period_prefix: str = PERIOD_PREFIX
178
200
 
179
201
  @staticmethod
180
- def set_option(pat: str, value: Any) -> None:
202
+ def set_config(pat: str, value: Any) -> None:
181
203
  """Change config variable."""
182
- setattr(_config, pat, value)
204
+ setattr(config, pat, value)
183
205
 
184
206
  @property
185
- def _file_system_constructor(self) -> Callable | type:
207
+ def _fs_constructor(self) -> Callable | type:
186
208
  """Can be overridden in subclass.
187
209
 
188
210
  Must return a function or a class that, when called,
@@ -191,13 +213,13 @@ class _PathBase:
191
213
  The 'info' method should return a dict like with at least the keys
192
214
  'updated', 'size', 'name' and 'type'.
193
215
  """
194
- return _config.file_system
216
+ return config.fs
195
217
 
196
218
 
197
219
  class Path(str, _PathBase):
198
220
  """Path object that works like a string, with methods for working with the GCS file system."""
199
221
 
200
- _file_system_attrs: set[str] = {
222
+ _fs_attrs: set[str] = {
201
223
  "info",
202
224
  "isdir",
203
225
  "open",
@@ -214,29 +236,71 @@ class Path(str, _PathBase):
214
236
  return PathSeries
215
237
 
216
238
  @staticmethod
217
- def _standardize_path(path: str | PurePosixPath) -> str:
239
+ def _standardize_path(path: str | PurePath) -> str:
218
240
  """Make sure delimiter is '/' and path ends without '/'."""
219
241
  return str(path).replace("\\", "/").replace(r"\"", "/")
220
242
 
221
- def __new__(cls, gcs_path: str | PurePath | None = None, file_system=None):
243
+ def __new__(cls, gcs_path: str | os.PathLike | None = None, fs=None):
222
244
  """Construct Path with '/' as delimiter."""
223
245
  gcs_path = cls._standardize_path(gcs_path or "")
224
246
  obj = super().__new__(cls, gcs_path)
225
- obj._path = PurePosixPath(obj)
226
- obj._file_system = file_system
247
+ if fs is not None:
248
+ obj._fs = fs
249
+ elif gcs_path.startswith("/buckets"):
250
+ obj._fs = LocalFileSystem()
251
+ else:
252
+ obj._fs = config.fs
227
253
  return obj
228
254
 
229
255
def buckets_path(self) -> "Path":
    """Translate a bucket path into the matching path under '/buckets'.

    A bucket whose name contains ``config.team`` maps to
    '/buckets/<bucket>/...'. Any other bucket maps to
    '/buckets/shared/<team>/<bucket>/...', with a leading 'delt-' removed
    from the bucket part.

    Raises:
        ValueError: If ``config.team`` is not set.
    """
    text = str(self)
    if text.startswith("/buckets"):
        # Already a '/buckets' path; parsing it as a bucket name would
        # produce a nonsense result.
        return self
    if not config.team:
        # Without a team name there is no way to tell own buckets from
        # shared ones (and `None in root` would raise TypeError).
        raise ValueError(
            "Must set config.team (hint: from daplapath import config; config.team = 'name')"
        )

    _, sep, remainder = text.partition("://")
    without_protocol = remainder if sep else text
    root, *subdirs = without_protocol.split("/")
    bucket = root.split("-data-")[-1].split(f"-{config.env}")[0]

    if config.team in root:
        new_root = "/buckets"
    else:
        team = root.split("-data-")[0]
        # Drop the first dash-separated token of the bucket name.
        team = team.removeprefix(team.split("-")[0]).strip("-")
        bucket = bucket.replace("delt-", "")
        new_root = f"/buckets/shared/{team}"

    tail = "/".join(subdirs).strip("/")
    target = f"{new_root}/{bucket}/{tail}" if tail else f"{new_root}/{bucket}"
    return self.__class__(target, self.fs)
276
+
277
def gs_path(self) -> "Path":
    """Translate a '/buckets/...' path into a full bucket path.

    Paths under '/buckets/shared/<team>/<bucket>/' use that team and get
    'delt-' put in front of the bucket name. All other paths use
    ``config.team``. The bucket name is built from ``config.bucket_pattern``.

    Raises:
        ValueError: If the path does not start with '/buckets/', if a
            shared path lacks a team or bucket part, or if ``config.team``
            is needed but not set.
    """
    text = str(self)
    if not text.startswith("/buckets/"):
        raise ValueError(
            f"Can only convert paths starting with '/buckets/' to GCS path. Got {self}"
        )

    shared_prefix = "/buckets/shared/"
    # Match on the prefix only: a folder named 'shared' further down the
    # path must not switch to shared-bucket parsing or truncate the path.
    if text.startswith(shared_prefix):
        pieces = text.removeprefix(shared_prefix).split("/")
        if len(pieces) < 2 or not pieces[0] or not pieces[1]:
            raise ValueError(
                f"Expected '/buckets/shared/<team>/<bucket>/...'. Got {self}"
            )
        team, bucket, *subdirs = pieces
        bucket = "delt-" + bucket
    elif not config.team:
        raise ValueError(
            "Must set config.team (hint: from daplapath import config; config.team = 'name')"
        )
    else:
        team = config.team
        bucket, *subdirs = text.removeprefix("/buckets/").split("/")

    bucket_url = config.bucket_pattern.format(
        team=team,
        bucket=bucket,
        env=config.env,
        default_protocol=config.default_protocol,
    )
    # Only add a separator when there is something after the bucket, so the
    # result never ends with '/'.
    tail = "/".join(subdirs).strip("/")
    gspath = f"{bucket_url}/{tail}" if tail else bucket_url
    return self.__class__(gspath, self.fs)
240
304
 
241
305
  def tree(
242
306
  self,
@@ -268,7 +332,6 @@ class Path(str, _PathBase):
268
332
  self, pattern: str | None = None, recursive: bool = True, **kwargs
269
333
  ) -> "PathSeries":
270
334
  """Create PathSeries of files/directories that match the pattern."""
271
-
272
335
  recursive = kwargs.get("recurse_symlinks", recursive)
273
336
 
274
337
  if pattern:
@@ -284,17 +347,17 @@ class Path(str, _PathBase):
284
347
 
285
348
  kwargs["detail"] = True
286
349
 
287
- if "recursive" in get_arguments(self.file_system.glob):
350
+ if "recursive" in get_arguments(self.fs.glob):
288
351
  kwargs["recursive"] = recursive
289
352
  else:
290
- # try to set to non-recursive if file_system.glob allows argument 'maxdepth'
353
+ # try to set to non-recursive if fs.glob allows argument 'maxdepth'
291
354
  kwargs["maxdepth"] = None if recursive else 1
292
355
 
293
356
  try:
294
- info: list[dict] | dict = self.file_system.glob(pattern, **kwargs)
357
+ info: list[dict] | dict = self.fs.glob(pattern, **kwargs)
295
358
  except TypeError:
296
359
  kwargs.pop("maxdepth", None)
297
- info: list[dict] | dict = self.file_system.glob(pattern, **kwargs)
360
+ info: list[dict] | dict = self.fs.glob(pattern, **kwargs)
298
361
 
299
362
  if isinstance(info, dict):
300
363
  # file system can return single dict if only one file path
@@ -318,10 +381,17 @@ class Path(str, _PathBase):
318
381
  """
319
382
  return self.glob("**", recursive=recursive, **kwargs)
320
383
 
384
def unlink(self, missing_ok: bool = False) -> None:
    """Delete the file this path points to.

    Args:
        missing_ok: If False (default), raise when the path does not exist;
            if True, do nothing in that case.

    Raises:
        FileNotFoundError: If the path does not exist and ``missing_ok``
            is False.
    """
    if not self.exists():
        if missing_ok:
            return None
        raise FileNotFoundError(str(self))
    # The path must be passed explicitly. rm_file is the single-file removal
    # call that rmdir() in this class also uses.
    self.fs.rm_file(str(self))
    return None
390
+
321
391
  def rmdir(self) -> None:
322
392
  files = self.glob("**").files
323
393
  with ThreadPoolExecutor() as executor:
324
- list(executor.map(self.file_system.rm_file, files))
394
+ list(executor.map(self.fs.rm_file, files))
325
395
 
326
396
  def cp(self, destination: "Path | str") -> "Path":
327
397
  return self._cp_or_mv(destination, "cp")
@@ -331,28 +401,25 @@ class Path(str, _PathBase):
331
401
  out_path = self._cp_or_mv(destination, "mv")
332
402
  if was_dir:
333
403
  try:
334
- self.file_system.rmdir(str(self))
404
+ self.fs.rmdir(str(self))
335
405
  except (FileNotFoundError, NotADirectoryError):
336
406
  pass
337
407
  return out_path
338
408
 
339
- def read_text(self, *args, **kwargs):
340
- return self._path.read_text(*args, **kwargs)
341
-
342
409
  def versions(self, include_versionless: bool = False) -> "PathSeries":
343
410
  """Returns a PathSeries of all versions of the file."""
344
- files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
411
+ files_in_folder: Iterable[Path] = self.parent.glob(
412
+ f"*{self.suffix}", recursive=False
413
+ )
345
414
 
346
415
  if self.version_number:
347
- start, _, end = re.split(self._version_pattern, self)
416
+ start, *_ = re.split(self._version_pattern, self.name)
348
417
  else:
349
- start, end = self.stem, self.suffix
418
+ start = self.stem
350
419
 
351
420
  # create boolean mask. With numpy to make it work with both pandas and list
352
421
  arr = np.array(files_in_folder)
353
- is_version_of_this_file = (np_str_contains(arr, start)) & (
354
- np_str_endswith(arr, end)
355
- )
422
+ is_version_of_this_file = np_str_contains(arr, start)
356
423
  if not include_versionless:
357
424
  is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
358
425
 
@@ -376,7 +443,7 @@ class Path(str, _PathBase):
376
443
  Lists files in the parent directory with the same versionless stem
377
444
  and selects the one with the highest version number.
378
445
 
379
- Returns
446
+ Returns:
380
447
  -------
381
448
  A Path.
382
449
  """
@@ -399,11 +466,11 @@ class Path(str, _PathBase):
399
466
  Minutes needed between the timestamp of the current highest
400
467
  numbered version.
401
468
 
402
- Returns
469
+ Returns:
403
470
  ------
404
471
  A Path with a new version number.
405
472
 
406
- Raises
473
+ Raises:
407
474
  ------
408
475
  ValueError:
409
476
  If the method is run before the timeout period is up.
@@ -419,7 +486,7 @@ class Path(str, _PathBase):
419
486
  time_should_be_at_least = pd.Timestamp.now(tz="Europe/Oslo").replace(
420
487
  tzinfo=None
421
488
  ).round("s") - pd.Timedelta(minutes=timeout)
422
- if timestamp > time_should_be_at_least:
489
+ if timestamp is not None and timestamp > time_should_be_at_least:
423
490
  raise ValueError(
424
491
  f"Latest version of the file was updated {timestamp[0]}, which "
425
492
  f"is less than the timeout period of {timeout} minutes. "
@@ -433,7 +500,7 @@ class Path(str, _PathBase):
433
500
  def with_version(self, version: int | None) -> "Path":
434
501
  """Replace the Path's version number, if any, with a new version number.
435
502
 
436
- Examples
503
+ Examples:
437
504
  --------
438
505
  >>> Path('file.parquet').with_version(1)
439
506
  'file_v1.parquet'
@@ -450,13 +517,13 @@ class Path(str, _PathBase):
450
517
  self, include_versionless: bool = False
451
518
  ) -> "PathSeries":
452
519
  """Returns a PathSeries of all periods of the file."""
453
- files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
520
+ files_in_folder: Iterable[Path] = self.parent.glob(
521
+ f"*{self.suffix}", recursive=False
522
+ )
454
523
 
455
524
  # create boolean mask. With numpy to make it work with both pandas and list
456
525
  arr = np.array(files_in_folder)
457
- is_version_of_this_file = (
458
- np_str_contains(arr, self.periodless_stem)
459
- ) & np_str_endswith(arr, self.suffix)
526
+ is_version_of_this_file = np_str_contains(arr, self.periodless_stem)
460
527
  if not include_versionless:
461
528
  is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
462
529
 
@@ -480,12 +547,12 @@ class Path(str, _PathBase):
480
547
  Lists files in the parent directory with the same
481
548
  versionless and periodless stem and selects the path that sorts last.
482
549
 
483
- Raises
550
+ Raises:
484
551
  ------
485
552
  ValueError: If there is mismatch in period patterns, e.g. if one
486
553
  path has the period "2020-01-01" and one path has "2021".
487
554
 
488
- Returns
555
+ Returns:
489
556
  -------
490
557
  A Path.
491
558
  """
@@ -494,14 +561,14 @@ class Path(str, _PathBase):
494
561
  include_versionless=False
495
562
  )
496
563
  sorted_paths = sort_by_period(period_paths)
497
- return next(iter(reversed(sorted_paths)))
564
+ return list(sorted_paths)[-1]
498
565
  except (IndexError, StopIteration) as e:
499
566
  raise FileNotFoundError(self) from e
500
567
 
501
568
  def with_period(self, period: str) -> "Path":
502
569
  """Replace the Path's period, if any, with a new periods.
503
570
 
504
- Examples
571
+ Examples:
505
572
  --------
506
573
  >>> Path('file_v1.parquet').with_period("2024-01-01")
507
574
  'file_p2024-01-01_v1.parquet'
@@ -517,7 +584,7 @@ class Path(str, _PathBase):
517
584
  def with_periods(self, from_period: str, to_period: str | None = None) -> "Path":
518
585
  """Replace the Path's period, if any, with one or two new periods.
519
586
 
520
- Examples
587
+ Examples:
521
588
  --------
522
589
  >>> Path('file_v1.parquet').with_periods("2024-01-01")
523
590
  'file_p2024-01-01_v1.parquet'
@@ -585,46 +652,57 @@ class Path(str, _PathBase):
585
652
  @property
586
653
  def periodless_stem(self) -> str:
587
654
  """Return the file stem before the period pattern."""
588
- return str(re.sub(f"{self._period_pattern}.*", "", self._path.stem))
655
+ return str(re.sub(f"{self._period_pattern}.*", "", self.stem))
589
656
 
590
657
  @property
591
658
  def versionless_stem(self) -> str:
592
659
  """Return the file stem before the version pattern."""
593
- return self._new(re.split(self._version_pattern, self._path.name)[0]).stem
660
+ return self._new(re.split(self._version_pattern, self.name)[0]).stem
594
661
 
595
662
  @property
596
663
  def parent(self) -> "Path":
597
664
  """Parent path."""
598
- return self._new(self._path.parent)
665
+ return self._new("/".join(self.split("/")[:-1]))
599
666
 
600
667
@property
def parents(self) -> "list[Path]":
    """Ancestor directories, nearest first, with any protocol prefix removed.

    'bucket/a/b.txt' gives ['bucket/a', 'bucket']; a path without '/' gives
    an empty list. For a path with a leading '/', the last entry is '/'.
    """
    no_protocol = self.split("://")[-1]
    pieces = no_protocol.split("/")
    # Take 1..n-1 leading pieces: zero pieces would be an empty path, and
    # all n pieces would be the path itself.
    return [
        self._new("/".join(pieces[:i]) or "/")
        for i in range(len(pieces) - 1, 0, -1)
    ]
604
675
 
605
676
  @property
606
677
  def name(self) -> str:
607
678
  """Final part of the path."""
608
- return self._path.name
679
+ return self.split("/")[-1]
609
680
 
610
681
@property
def stem(self) -> str:
    """File name without the final suffix."""
    name = self.split("/")[-1]
    # Strip the suffix from the end only; str.replace would also remove
    # the same text from the middle of the name (e.g. 'a.txt.txt').
    return name.removesuffix(self.suffix)
614
685
 
615
686
  @property
616
687
  def parts(self) -> tuple[str]:
617
- return self._path.parts
688
+ no_protocol = self.split("://")[-1]
689
+ return tuple(no_protocol.split("/"))
618
690
 
619
691
  @property
620
692
  def suffix(self) -> str:
621
693
  """Final file path suffix."""
622
- return self._path.suffix
694
+ name = self.name
695
+ if "." not in name:
696
+ return ""
697
+ return "." + (name).split(".")[-1]
623
698
 
624
699
  @property
625
700
  def suffixes(self) -> list[str]:
626
701
  """File path suffixes, if multiple."""
627
- return self._path.suffixes
702
+ name = self.name
703
+ if "." not in name:
704
+ return []
705
+ return ["." + suff for suff in (name).split(".")[1:]]
628
706
 
629
707
  @property
630
708
  def index_column_names(self) -> list[str]:
@@ -685,9 +763,14 @@ class Path(str, _PathBase):
685
763
  try:
686
764
  info = self._info
687
765
  except AttributeError:
688
- info = self.file_system.info(self)
766
+ info = self.fs.info(self)
689
767
  self._info = info
690
- return _get_timestamps(info["updated"])
768
+ try:
769
+ return _get_timestamps(info["updated"])
770
+ except Exception as e:
771
+ if not self.exists() or self.is_dir():
772
+ return None
773
+ raise e
691
774
 
692
775
  @property
693
776
  def type(self) -> str:
@@ -699,7 +782,7 @@ class Path(str, _PathBase):
699
782
  try:
700
783
  info = self._info
701
784
  except AttributeError:
702
- info = self.file_system.info(self)
785
+ info = self.fs.info(self)
703
786
  self._info = info
704
787
  return info["size"]
705
788
 
@@ -764,29 +847,29 @@ class Path(str, _PathBase):
764
847
  return self.isdir()
765
848
 
766
849
def with_suffix(self, suffix: str):
    """Return the path with its final suffix swapped for *suffix*.

    If the path has no suffix, *suffix* is appended. An empty *suffix*
    removes the current one.
    """
    # Only the trailing suffix is touched. str.replace would also rewrite
    # the same text in directory names, and with an empty current suffix it
    # would insert *suffix* between every character.
    return self._new(str(self).removesuffix(self.suffix) + suffix)
768
851
 
769
852
def with_name(self, new_name: str):
    """Return the path with its final component replaced by *new_name*."""
    # Split at the last '/' so only the final component changes;
    # str.replace would rewrite every occurrence of the old name.
    head, sep, _ = str(self).rpartition("/")
    return self._new(f"{head}{sep}{new_name}")
771
854
 
772
def with_stem(self, new_stem: str):
    """Return the path with the stem of its final component replaced.

    The directory part and the final suffix are kept.
    """
    # Rebuild the final component instead of using str.replace, which
    # would rewrite the stem text wherever it occurs in the path.
    head, sep, _ = str(self).rpartition("/")
    return self._new(f"{head}{sep}{new_stem}{self.suffix}")
774
857
 
775
858
  @property
776
- def file_system(self):
777
- if self._file_system is None:
778
- self._file_system = self._file_system_constructor()
779
- return self._file_system
859
+ def fs(self):
860
+ if self._fs is None:
861
+ self._fs = self._fs_constructor()
862
+ return self._fs
780
863
 
781
- @file_system.setter
782
- def file_system(self, val):
783
- self._file_system = val
784
- return self._file_system
864
+ @fs.setter
865
+ def fs(self, val):
866
+ self._fs = val
867
+ return self._fs
785
868
 
786
869
  def __truediv__(self, other: str | os.PathLike | PurePath) -> "Path":
787
870
  """Append a string or Path to the path with a forward slash.
788
871
 
789
- Example
872
+ Example:
790
873
  -------
791
874
  >>> folder = 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2023'
792
875
  >>> file_path = folder / "ABAS_kommune_flate_p2023_v1.parquet"
@@ -801,13 +884,12 @@ class Path(str, _PathBase):
801
884
  return self._new(f"{self}/{as_str(other)}")
802
885
 
803
886
  def __getattribute__(self, name):
804
- """stackoverflow hack to ensure we return Path when using string methods.
887
+ """Stackoverflow hack to ensure we return Path when using string methods.
805
888
 
806
889
  It works for all but the string magigmethods, importantly __add__.
807
890
  """
808
-
809
891
  # skip magic methods
810
- if name not in dir(str) or name.startswith("__") and name.endswith("__"):
892
+ if name not in dir(str) or (name.startswith("__") and name.endswith("__")):
811
893
  return super().__getattribute__(name)
812
894
 
813
895
  def method(self, *args, **kwargs):
@@ -826,35 +908,35 @@ class Path(str, _PathBase):
826
908
  return method.__get__(self)
827
909
 
828
910
  def __getattr__(self, attr: str) -> Any:
829
- """Get file_system attribute."""
911
+ """Get fs attribute."""
830
912
  error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
831
913
  if attr.startswith("_"):
832
914
  raise AttributeError(error_message)
833
- if attr not in self._file_system_attrs:
915
+ if attr not in self._fs_attrs:
834
916
  raise AttributeError(error_message)
835
- return functools.partial(getattr(self.file_system, attr), self)
917
+ return functools.partial(getattr(self.fs, attr), self)
836
918
 
837
919
  def __fspath__(self) -> str:
838
920
  return str(self)
839
921
 
840
922
  def __dir__(self) -> list[str]:
841
- return list(sorted({x for x in dir(Path)} | self._file_system_attrs))
923
+ return list(sorted({x for x in dir(Path)} | self._fs_attrs))
842
924
 
843
925
  def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
844
926
  series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
845
927
  for path in series:
846
- path._file_system = self._file_system
928
+ path._fs = self._fs
847
929
  return self._iterable_type(series, **kwargs)
848
930
 
849
931
  def _new(self, new_path: str | Path) -> "Path":
850
- return self.__class__(new_path, self.file_system)
932
+ return self.__class__(new_path, self.fs)
851
933
 
852
934
  def _cp_or_mv(self, destination: "Path | str", attr: str) -> "Path":
853
- func: Callable = getattr(self.file_system, attr)
935
+ func: Callable = getattr(self.fs, attr)
854
936
  try:
855
937
  func(self, destination)
856
938
  except FileNotFoundError:
857
- destination = self.__class__(destination)
939
+ destination = self.__class__(destination, self.fs)
858
940
  sources = list(self.glob("**").files)
859
941
  destinations = [path.replace(self, destination) for path in sources]
860
942
  with ThreadPoolExecutor() as executor:
@@ -915,7 +997,7 @@ class PathSeries(pd.Series, _PathBase):
915
997
  names: Series
916
998
  The names of the file paths.
917
999
 
918
- Methods
1000
+ Methods:
919
1001
  -------
920
1002
  tree():
921
1003
  con
@@ -955,18 +1037,24 @@ class PathSeries(pd.Series, _PathBase):
955
1037
  data is not None
956
1038
  and len(data)
957
1039
  and not (
958
- isinstance(data, pd.Series)
959
- and len(data.index.names) == len(self._index_names)
960
- or isinstance(index, pd.MultiIndex)
961
- and len(index.names) == len(self._index_names)
1040
+ (
1041
+ isinstance(data, pd.Series)
1042
+ and len(data.index.names) == len(self._index_names)
1043
+ )
1044
+ or (
1045
+ isinstance(index, pd.MultiIndex)
1046
+ and len(index.names) == len(self._index_names)
1047
+ )
962
1048
  # dict with e.g. tuple keys, turned into MultiIndex
963
- or is_dict_like(data)
964
- and all(len(key) == len(self._index_names) for key in data.keys())
1049
+ or (
1050
+ is_dict_like(data)
1051
+ and all(len(key) == len(self._index_names) for key in data.keys())
1052
+ )
965
1053
  )
966
1054
  )
967
1055
  if should_construct_index:
968
- file_system = kwargs.get("file_system", self._file_system_constructor())
969
- data = _get_paths_and_index([file_system.info(path) for path in data])
1056
+ fs = kwargs.get("fs", self._fs_constructor())
1057
+ data = _get_paths_and_index([fs.info(path) for path in data])
970
1058
 
971
1059
  super().__init__(data, index=index, **kwargs)
972
1060
 
@@ -1332,7 +1420,7 @@ def _pathseries_constructor_with_fallback(
1332
1420
  max_parts: int | None = 2,
1333
1421
  path_series_type: type | None = None,
1334
1422
  **kwargs,
1335
- ) -> "PathSeries | pd.Series":
1423
+ ) -> PathSeries | pd.Series:
1336
1424
  path_series_type = path_series_type or PathSeries
1337
1425
 
1338
1426
  kwargs["name"] = kwargs.pop("name", "path")
@@ -1366,7 +1454,7 @@ def _pathseries_constructor_with_fallback(
1366
1454
  return series
1367
1455
 
1368
1456
 
1369
- def _dataframe_constructor(data=None, index=None, **kwargs) -> "pd.DataFrame":
1457
+ def _dataframe_constructor(data=None, index=None, **kwargs) -> pd.DataFrame:
1370
1458
  data.name = "path"
1371
1459
  return pd.DataFrame(data, index=index, **kwargs)
1372
1460
 
@@ -1390,16 +1478,15 @@ def split_path_and_make_copyable_html(
1390
1478
  split: Text pattern to split the path on. Defaults to "/".
1391
1479
  display_prefix: The text to display instead of the parent directory. Defaults to ".../".
1392
1480
 
1393
- Returns
1481
+ Returns:
1394
1482
  -------
1395
1483
  A string that holds the HTML and JavaScript code to be passed to IPython.display.display.
1396
1484
  """
1397
-
1398
- copy_to_clipboard_js = f"""<script>
1399
- function copyToClipboard(text, event) {{
1485
+ copy_to_clipboard_js = """<script>
1486
+ function copyToClipboard(text, event) {
1400
1487
  event.preventDefault();
1401
1488
  navigator.clipboard.writeText(text)
1402
- .then(() => {{
1489
+ .then(() => {
1403
1490
  const alertBox = document.createElement('div');
1404
1491
  const selection = window.getSelection();
1405
1492
 
@@ -1412,14 +1499,14 @@ function copyToClipboard(text, event) {{
1412
1499
  alertBox.innerHTML = 'Copied to clipboard';
1413
1500
  document.body.appendChild(alertBox);
1414
1501
 
1415
- setTimeout(function() {{
1502
+ setTimeout(function() {
1416
1503
  alertBox.style.display = 'none';
1417
- }}, 1500); // 1.5 seconds
1418
- }})
1419
- .catch(err => {{
1504
+ }, 1500); // 1.5 seconds
1505
+ })
1506
+ .catch(err => {
1420
1507
  console.error('Could not copy text: ', err);
1421
- }});
1422
- }}
1508
+ });
1509
+ }
1423
1510
  </script>"""
1424
1511
 
1425
1512
  if split is not None:
@@ -1634,28 +1721,28 @@ def get_schema(file) -> pyarrow.Schema:
1634
1721
  # try:
1635
1722
  # return ds.dataset(file).schema
1636
1723
  # except (TypeError, FileNotFoundError) as e:
1637
- if not hasattr(file, "file_system"):
1724
+ if not hasattr(file, "fs"):
1638
1725
  raise e
1639
1726
 
1640
- file_system = file.file_system
1727
+ fs = file.fs
1641
1728
 
1642
1729
  def _get_schema(path):
1643
1730
  try:
1644
1731
  return pq.read_schema(path)
1645
1732
  except FileNotFoundError as e:
1646
1733
  try:
1647
- with file_system.open(path, "rb") as f:
1734
+ with fs.open(path, "rb") as f:
1648
1735
  return pq.read_schema(f)
1649
1736
  except Exception as e2:
1650
1737
  raise e2.__class__(f"{e2}. {path}") from e
1651
1738
 
1652
- child_paths = file_system.glob(file + "/**/*.parquet")
1739
+ child_paths = fs.glob(file + "/**/*.parquet")
1653
1740
  if not len(child_paths):
1654
1741
  raise e.__class__(f"{e}: {file}") from e
1655
1742
 
1656
1743
  with ThreadPoolExecutor() as executor:
1657
1744
  schemas: list[pyarrow.Schema] = list(
1658
- executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
1745
+ executor.map(_get_schema, fs.glob(file + "/**/*.parquet"))
1659
1746
  )
1660
1747
  if not schemas:
1661
1748
  raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
@@ -1734,15 +1821,14 @@ def sort_by_period(paths: Iterable[str]) -> Iterable[str]:
1734
1821
  except ValueError:
1735
1822
  # select last period
1736
1823
  periods = [pd.Timestamp(next(iter(reversed(path.periods)))) for path in paths]
1737
- combined = list(zip(periods, range(len(paths)), paths, strict=True))
1824
+ combined = list(zip(periods, paths, list(range(len(paths))), strict=True))
1738
1825
  combined.sort()
1739
- indices: list[int] = [x[1] for x in combined]
1826
+ indices: list[int] = [x[2] for x in combined]
1740
1827
  try:
1741
1828
  return paths.iloc[indices]
1742
1829
  except AttributeError:
1743
- return paths.__class__([x[2] for x in combined])
1830
+ return paths.__class__([x[1] for x in combined])
1744
1831
 
1745
1832
 
1746
1833
  np_str_contains: Callable = np.vectorize(str.__contains__)
1747
- np_str_endswith: Callable = np.vectorize(str.endswith)
1748
1834
  np_str_matches: Callable = np.vectorize(lambda txt, pat: bool(re.search(pat, txt)))
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "daplapath"
3
- version = "2.1.3"
3
+ version = "2.1.4"
4
4
  description = "A pathlib.Path class for dapla"
5
5
  authors = ["ort <ort@ssb.no>"]
6
6
  license = "MIT"
@@ -1,3 +0,0 @@
1
- from .path import Path
2
- from .path import PathSeries
3
- from .path import LocalFileSystem
File without changes
File without changes