daplapath 2.1.2__tar.gz → 2.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {daplapath-2.1.2 → daplapath-2.1.4}/PKG-INFO +1 -1
- daplapath-2.1.4/daplapath/__init__.py +1 -0
- {daplapath-2.1.2 → daplapath-2.1.4}/daplapath/path.py +222 -142
- {daplapath-2.1.2 → daplapath-2.1.4}/pyproject.toml +1 -1
- daplapath-2.1.2/daplapath/__init__.py +0 -3
- {daplapath-2.1.2 → daplapath-2.1.4}/LICENSE.md +0 -0
- {daplapath-2.1.2 → daplapath-2.1.4}/README.md +0 -0
daplapath-2.1.4/daplapath/__init__.py
@@ -0,0 +1 @@
+from .path import LocalFileSystem, Path, PathSeries, config
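The new `__init__.py` re-exports the package's public names from `daplapath.path`, so they can be imported from the package root. A minimal usage sketch (the bucket path below is made up):

```python
# Hypothetical example of the new top-level import surface.
from daplapath import LocalFileSystem, Path, PathSeries, config

p = Path("ssb-demo-data-produkt-prod/inndata/file_p2024_v1.parquet")  # made-up path
print(isinstance(p, str))  # True: Path subclasses str
print(type(config.fs))     # LocalFileSystem outside Dapla, MyGCSFileSystem inside
```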
{daplapath-2.1.2 → daplapath-2.1.4}/daplapath/path.py
@@ -1,30 +1,30 @@
 from __future__ import annotations
-
+
+import datetime
 import functools
-from collections.abc import Iterable
-from concurrent.futures import ThreadPoolExecutor
 import glob
+import inspect
+import io
+import itertools
 import json
+import os
 import pathlib
-from pathlib import PurePosixPath, PurePath
 import re
-import io
-import os
 import shutil
-from
-import
-import
+from collections.abc import Callable, Iterable
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import PurePath
+from typing import Any
 
-from fsspec.spec import AbstractFileSystem
-import datetime
 import numpy as np
 import pandas as pd
 import pandas.io.formats.format as fmt
-from pandas.api.types import is_dict_like
 import pyarrow
-import pyarrow.parquet as pq
 import pyarrow.dataset as ds
-
+import pyarrow.parquet as pq
+from fsspec.spec import AbstractFileSystem
+from pandas.api.types import is_dict_like
 
 try:
     import gcsfs
@@ -48,10 +48,18 @@ PERIOD_PREFIX = "_p"
 INDEX_NAMES = ["timestamp", "mb", "type"]
 
 
-@dataclass
+@dataclass(slots=True)
 class Config:
-
-
+    fs: Callable
+    team: str | None
+    env: str
+    default_protocol: str = "gs"
+    bucket_pattern: str = "{default_protocol}://ssb-{team}-data-{bucket}-prod"
+
+    def __getitem__(self, key: str) -> Any:
+        if not hasattr(self, key):
+            raise KeyError(key)
+        return getattr(self, key)
 
 
 class LocalFileSystem(AbstractFileSystem):
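`Config` is now a slotted dataclass whose instances also support dict-style lookup. A self-contained sketch of the same pattern on Python 3.10+ (the `fs=dict` placeholder is arbitrary, not the library's real default):

```python
from dataclasses import dataclass
from typing import Any, Callable


@dataclass(slots=True)
class Config:
    fs: Callable
    team: str | None
    env: str
    default_protocol: str = "gs"

    def __getitem__(self, key: str) -> Any:
        # Mirror attribute access so config["team"] == config.team.
        if not hasattr(self, key):
            raise KeyError(key)
        return getattr(self, key)


cfg = Config(fs=dict, team="demo", env="prod")
assert cfg["team"] == cfg.team == "demo"
# slots=True: assigning an unknown attribute raises AttributeError.
```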
@@ -72,7 +80,7 @@ class LocalFileSystem(AbstractFileSystem):
         if not detail:
             return list(relevant_paths)
         with ThreadPoolExecutor() as executor:
-            return
+            return {x["name"]: x for x in executor.map(get_file_info, relevant_paths)}
 
     @classmethod
     def ls(cls, path: str, detail: bool = False, **kwargs):
@@ -109,6 +117,10 @@ class LocalFileSystem(AbstractFileSystem):
     def rm_file(path: str, *args, **kwargs) -> None:
         return os.remove(path, *args, **kwargs)
 
+    @staticmethod
+    def rm(path: str, *args, **kwargs) -> None:
+        return os.remove(path, *args, **kwargs)
+
     @staticmethod
     def rmdir(path: str, *args, **kwargs) -> None:
         return shutil.rmtree(path, *args, **kwargs)
@@ -139,9 +151,19 @@ class MyGCSFileSystem(gcsfs.GCSFileSystem):
 
 
 if any("dapla" in key.lower() for key in os.environ) and "gcsfs" in locals():
-
+    _fs = MyGCSFileSystem()
 else:
-
+    _fs = LocalFileSystem()
+
+config = Config(
+    fs=_fs,
+    team=os.environ.get("DAPLA_GROUP_CONTEXT", "")
+    .replace("-developers", "")
+    .replace("-data-admins", "")
+    or None,
+    env=os.environ.get("DAPLA_ENVIRONMENT", "prod").lower(),
+)
+del _fs
 
 
 class Tree:
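The module-level `config` is built from environment variables at import time. A standalone sketch of the same derivation with made-up values:

```python
import os

# Example values only; on Dapla these are set by the platform.
os.environ["DAPLA_GROUP_CONTEXT"] = "demo-team-developers"
os.environ["DAPLA_ENVIRONMENT"] = "TEST"

team = (
    os.environ.get("DAPLA_GROUP_CONTEXT", "")
    .replace("-developers", "")
    .replace("-data-admins", "")
    or None
)
env = os.environ.get("DAPLA_ENVIRONMENT", "prod").lower()
print(team, env)  # demo-team test
```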
@@ -177,12 +199,12 @@ class _PathBase:
     _period_prefix: str = PERIOD_PREFIX
 
     @staticmethod
-    def
+    def set_config(pat: str, value: Any) -> None:
         """Change config variable."""
-        setattr(
+        setattr(config, pat, value)
 
     @property
-    def
+    def _fs_constructor(self) -> Callable | type:
         """Can be overridden in subclass.
 
         Must return a function or a class that, when called,
@@ -191,13 +213,13 @@ class _PathBase:
         The 'info' method should return a dict like with at least the keys
         'updated', 'size', 'name' and 'type'.
         """
-        return
+        return config.fs
 
 
 class Path(str, _PathBase):
     """Path object that works like a string, with methods for working with the GCS file system."""
 
-
+    _fs_attrs: set[str] = {
         "info",
         "isdir",
         "open",
@@ -214,35 +236,71 @@ class Path(str, _PathBase):
         return PathSeries
 
     @staticmethod
-    def _standardize_path(path: str |
+    def _standardize_path(path: str | PurePath) -> str:
         """Make sure delimiter is '/' and path ends without '/'."""
-        return (
-            str(path)
-            .replace("\\", "/")
-            .replace(r"\"", "/")
-            .replace("//", "/")
-            .rstrip("/")
-        )
+        return str(path).replace("\\", "/").replace(r"\"", "/")
 
-    def __new__(cls, gcs_path: str |
+    def __new__(cls, gcs_path: str | os.PathLike | None = None, fs=None):
         """Construct Path with '/' as delimiter."""
         gcs_path = cls._standardize_path(gcs_path or "")
         obj = super().__new__(cls, gcs_path)
-
-
+        if fs is not None:
+            obj._fs = fs
+        elif gcs_path.startswith("/buckets"):
+            obj._fs = LocalFileSystem()
+        else:
+            obj._fs = config.fs
         return obj
 
     def buckets_path(self) -> "Path":
-        if self.startswith("/buckets"):
-            return self
-
-        root = self.parts[0]
-        bucket = root.split("-data-")[-1].split("-prod")[0]
-
         try:
-
-        except
-
+            protocol, _ = str(self).split("://")
+        except ValueError:
+            protocol = ""
+        root, *subdirs = str(self).replace(f"{protocol}://", "").split("/")
+        bucket = root.split("-data-")[-1].split(f"-{config.env}")[0]
+        if config.team in root:
+            new_root = "/buckets"
+            team = config.team
+        else:
+            team = root.split("-data-")[0]
+            team = team.lstrip(team.split("-")[0]).strip("-")
+            bucket = bucket.replace("delt-", "")
+            new_root = f"/buckets/shared/{team}"
+
+        subdirs = "/".join(subdirs).strip("/")
+        if subdirs:
+            return self.__class__(f"{new_root}/{bucket}/{subdirs}", self.fs)
+        else:
+            return self.__class__(f"{new_root}/{bucket}", self.fs)
+
+    def gs_path(self) -> "Path":
+        if not str(self).startswith("/buckets/"):
+            raise ValueError(
+                f"Can only convert paths starting with '/buckets/' to GCS path. Got {self}"
+            )
+        if "/shared/" in str(self):
+            team, bucket, *subdirs = str(self).split("/shared/")[1].split("/")
+            bucket = "delt-" + bucket
+        elif not config.team:
+            raise ValueError(
+                "Must set config.team (hint: from daplapath import config; config.team = 'name')"
+            )
+        else:
+            team = config.team
+            bucket, *subdirs = str(self).replace("/buckets/", "").split("/")
+
+        gspath = (
+            config.bucket_pattern.format(
+                team=team,
+                bucket=bucket,
+                env=config.env,
+                default_protocol=config.default_protocol,
+            )
+            + "/"
+            + "/".join(subdirs)
+        )
+        return self.__class__(gspath, self.fs)
 
     def tree(
         self,
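The new `buckets_path()` and `gs_path()` convert between `gs://` bucket URLs and `/buckets/...` mount-style paths. A rough sketch of the intended round trip, assuming `config.env == "prod"` and a made-up team and bucket:

```python
from daplapath import Path, config

config.team = "demo"  # gs_path() needs a team for paths not under /buckets/shared/

local_style = Path("/buckets/produkt/inndata/file_p2024_v1.parquet")
gs_style = local_style.gs_path()
print(gs_style)
# expected: gs://ssb-demo-data-produkt-prod/inndata/file_p2024_v1.parquet
print(gs_style.buckets_path())
# expected to round-trip back to /buckets/produkt/inndata/file_p2024_v1.parquet
```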
@@ -274,7 +332,6 @@ class Path(str, _PathBase):
         self, pattern: str | None = None, recursive: bool = True, **kwargs
     ) -> "PathSeries":
         """Create PathSeries of files/directories that match the pattern."""
-
         recursive = kwargs.get("recurse_symlinks", recursive)
 
         if pattern:
@@ -290,17 +347,17 @@
 
         kwargs["detail"] = True
 
-        if "recursive" in get_arguments(self.
+        if "recursive" in get_arguments(self.fs.glob):
             kwargs["recursive"] = recursive
         else:
-            # try to set to non-recursive if
+            # try to set to non-recursive if fs.glob allows argument 'maxdepth'
            kwargs["maxdepth"] = None if recursive else 1
 
         try:
-            info: list[dict] | dict = self.
+            info: list[dict] | dict = self.fs.glob(pattern, **kwargs)
         except TypeError:
             kwargs.pop("maxdepth", None)
-            info: list[dict] | dict = self.
+            info: list[dict] | dict = self.fs.glob(pattern, **kwargs)
 
         if isinstance(info, dict):
             # file system can return single dict if only one file path
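The glob wrapper now inspects `self.fs.glob` to decide whether to pass `recursive` or fall back to `maxdepth`. `get_arguments` itself is not shown in this diff; the helper below is an assumed stand-in built on `inspect.signature`:

```python
import inspect


def get_arguments(func) -> set[str]:
    # Assumed stand-in: the parameter names a callable accepts.
    return set(inspect.signature(func).parameters)


def glob_a(pattern, recursive=True): ...
def glob_b(pattern, maxdepth=None): ...


print("recursive" in get_arguments(glob_a))  # True  -> pass recursive=...
print("recursive" in get_arguments(glob_b))  # False -> fall back to maxdepth=...
```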
@@ -324,10 +381,17 @@ class Path(str, _PathBase):
         """
         return self.glob("**", recursive=recursive, **kwargs)
 
+    def unlink(self, missing_ok: bool = False) -> None:
+        if not self.exists():
+            if not missing_ok:
+                raise FileNotFoundError(str(self))
+            return
+        return self.fs.rm(recursive=False)
+
     def rmdir(self) -> None:
         files = self.glob("**").files
         with ThreadPoolExecutor() as executor:
-            list(executor.map(self.
+            list(executor.map(self.fs.rm_file, files))
 
     def cp(self, destination: "Path | str") -> "Path":
         return self._cp_or_mv(destination, "cp")
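The added `unlink()` follows the `pathlib.Path.unlink(missing_ok=...)` contract: a missing file raises `FileNotFoundError` unless `missing_ok=True`. A daplapath-independent illustration of that contract using `pathlib` itself:

```python
import pathlib
import tempfile

tmp = pathlib.Path(tempfile.mkdtemp()) / "example.txt"
tmp.write_text("hello")

tmp.unlink()                 # removes the file
tmp.unlink(missing_ok=True)  # second call is a silent no-op
try:
    tmp.unlink()             # default missing_ok=False raises
except FileNotFoundError:
    print("raised as expected")
```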
@@ -337,28 +401,25 @@ class Path(str, _PathBase):
         out_path = self._cp_or_mv(destination, "mv")
         if was_dir:
             try:
-                self.
+                self.fs.rmdir(str(self))
             except (FileNotFoundError, NotADirectoryError):
                 pass
         return out_path
 
-    def read_text(self, *args, **kwargs):
-        return self._path.read_text(*args, **kwargs)
-
     def versions(self, include_versionless: bool = False) -> "PathSeries":
         """Returns a PathSeries of all versions of the file."""
-        files_in_folder: Iterable[Path] = self.parent.glob(
+        files_in_folder: Iterable[Path] = self.parent.glob(
+            f"*{self.suffix}", recursive=False
+        )
 
         if self.version_number:
-            start, _
+            start, *_ = re.split(self._version_pattern, self.name)
         else:
-            start
+            start = self.stem
 
         # create boolean mask. With numpy to make it work with both pandas and list
         arr = np.array(files_in_folder)
-        is_version_of_this_file =
-            np_str_endswith(arr, end)
-        )
+        is_version_of_this_file = np_str_contains(arr, start)
         if not include_versionless:
             is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
 
@@ -382,7 +443,7 @@ class Path(str, _PathBase):
         Lists files in the parent directory with the same versionless stem
         and selects the one with the highest version number.
 
-        Returns
+        Returns:
         -------
         A Path.
         """
@@ -405,11 +466,11 @@ class Path(str, _PathBase):
             Minutes needed between the timestamp of the current highest
             numbered version.
 
-        Returns
+        Returns:
         ------
         A Path with a new version number.
 
-        Raises
+        Raises:
         ------
         ValueError:
             If the method is run before the timeout period is up.
@@ -425,7 +486,7 @@ class Path(str, _PathBase):
         time_should_be_at_least = pd.Timestamp.now(tz="Europe/Oslo").replace(
             tzinfo=None
         ).round("s") - pd.Timedelta(minutes=timeout)
-        if timestamp > time_should_be_at_least:
+        if timestamp is not None and timestamp > time_should_be_at_least:
             raise ValueError(
                 f"Latest version of the file was updated {timestamp[0]}, which "
                 f"is less than the timeout period of {timeout} minutes. "
@@ -439,7 +500,7 @@ class Path(str, _PathBase):
     def with_version(self, version: int | None) -> "Path":
         """Replace the Path's version number, if any, with a new version number.
 
-        Examples
+        Examples:
         --------
         >>> Path('file.parquet').with_version(1)
         'file_v1.parquet'
@@ -456,13 +517,13 @@ class Path(str, _PathBase):
         self, include_versionless: bool = False
     ) -> "PathSeries":
         """Returns a PathSeries of all periods of the file."""
-        files_in_folder: Iterable[Path] = self.parent.glob(
+        files_in_folder: Iterable[Path] = self.parent.glob(
+            f"*{self.suffix}", recursive=False
+        )
 
         # create boolean mask. With numpy to make it work with both pandas and list
         arr = np.array(files_in_folder)
-        is_version_of_this_file = (
-            np_str_contains(arr, self.periodless_stem)
-        ) & np_str_endswith(arr, self.suffix)
+        is_version_of_this_file = np_str_contains(arr, self.periodless_stem)
         if not include_versionless:
             is_version_of_this_file &= np_str_matches(arr, self._version_pattern)
 
@@ -486,12 +547,12 @@ class Path(str, _PathBase):
         Lists files in the parent directory with the same
         versionless and periodless stem and selects the path that sorts last.
 
-        Raises
+        Raises:
         ------
         ValueError: If there is mismatch in period patterns, e.g. if one
             path has the period "2020-01-01" and one path has "2021".
 
-        Returns
+        Returns:
         -------
         A Path.
         """
@@ -500,14 +561,14 @@ class Path(str, _PathBase):
                 include_versionless=False
             )
             sorted_paths = sort_by_period(period_paths)
-            return
+            return list(sorted_paths)[-1]
         except (IndexError, StopIteration) as e:
             raise FileNotFoundError(self) from e
 
     def with_period(self, period: str) -> "Path":
         """Replace the Path's period, if any, with a new periods.
 
-        Examples
+        Examples:
         --------
         >>> Path('file_v1.parquet').with_period("2024-01-01")
         'file_p2024-01-01_v1.parquet'
@@ -523,7 +584,7 @@ class Path(str, _PathBase):
     def with_periods(self, from_period: str, to_period: str | None = None) -> "Path":
         """Replace the Path's period, if any, with one or two new periods.
 
-        Examples
+        Examples:
         --------
         >>> Path('file_v1.parquet').with_periods("2024-01-01")
         'file_p2024-01-01_v1.parquet'
@@ -591,46 +652,57 @@ class Path(str, _PathBase):
     @property
     def periodless_stem(self) -> str:
         """Return the file stem before the period pattern."""
-        return str(re.sub(f"{self._period_pattern}.*", "", self.
+        return str(re.sub(f"{self._period_pattern}.*", "", self.stem))
 
     @property
     def versionless_stem(self) -> str:
         """Return the file stem before the version pattern."""
-        return self._new(re.split(self._version_pattern, self.
+        return self._new(re.split(self._version_pattern, self.name)[0]).stem
 
     @property
     def parent(self) -> "Path":
         """Parent path."""
-        return self._new(self.
+        return self._new("/".join(self.split("/")[:-1]))
 
     @property
     def parents(self) -> "list[Path]":
         """Parent path."""
-
+        no_protocol = self.split("://")[-1]
+        return [
+            self._new("/".join(no_protocol.split("/")[:i]))
+            for i in range(no_protocol.count("/"))
+        ][::-1]
 
     @property
     def name(self) -> str:
         """Final part of the path."""
-        return self.
+        return self.split("/")[-1]
 
     @property
     def stem(self) -> str:
         """File name without the suffix"""
-        return self.
+        return self.split("/")[-1].replace(self.suffix, "")
 
     @property
     def parts(self) -> tuple[str]:
-
+        no_protocol = self.split("://")[-1]
+        return tuple(no_protocol.split("/"))
 
     @property
     def suffix(self) -> str:
         """Final file path suffix."""
-
+        name = self.name
+        if "." not in name:
+            return ""
+        return "." + (name).split(".")[-1]
 
     @property
     def suffixes(self) -> list[str]:
         """File path suffixes, if multiple."""
-
+        name = self.name
+        if "." not in name:
+            return []
+        return ["." + suff for suff in (name).split(".")[1:]]
 
     @property
     def index_column_names(self) -> list[str]:
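These properties are now plain string arithmetic on the path itself. A self-contained sketch of the same logic on a made-up GCS path:

```python
path = "gs://ssb-demo-data-produkt-prod/inndata/file_p2024_v1.parquet"

name = path.split("/")[-1]                       # 'file_p2024_v1.parquet'
suffix = "." + name.split(".")[-1] if "." in name else ""
stem = name.replace(suffix, "")                  # 'file_p2024_v1'
no_protocol = path.split("://")[-1]
parts = tuple(no_protocol.split("/"))

print(name, suffix, stem)
print(parts[0])  # the bucket: 'ssb-demo-data-produkt-prod'
```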
@@ -691,9 +763,14 @@ class Path(str, _PathBase):
         try:
             info = self._info
         except AttributeError:
-            info = self.
+            info = self.fs.info(self)
             self._info = info
-
+        try:
+            return _get_timestamps(info["updated"])
+        except Exception as e:
+            if not self.exists() or self.is_dir():
+                return None
+            raise e
 
     @property
     def type(self) -> str:
@@ -705,7 +782,7 @@ class Path(str, _PathBase):
         try:
             info = self._info
         except AttributeError:
-            info = self.
+            info = self.fs.info(self)
             self._info = info
         return info["size"]
 
@@ -770,29 +847,29 @@ class Path(str, _PathBase):
         return self.isdir()
 
     def with_suffix(self, suffix: str):
-        return self._new(self.
+        return self._new(self.replace(self.suffix, suffix))
 
     def with_name(self, new_name: str):
-        return self._new(self.
+        return self._new(self.replace(self.name, new_name))
 
-    def with_stem(self,
-        return self._new(self.
+    def with_stem(self, new_stem: str):
+        return self._new(self.replace(self.stem, new_stem))
 
     @property
-    def
-        if self.
-            self.
-        return self.
+    def fs(self):
+        if self._fs is None:
+            self._fs = self._fs_constructor()
+        return self._fs
 
-    @
-    def
-        self.
-        return self.
+    @fs.setter
+    def fs(self, val):
+        self._fs = val
+        return self._fs
 
     def __truediv__(self, other: str | os.PathLike | PurePath) -> "Path":
         """Append a string or Path to the path with a forward slash.
 
-        Example
+        Example:
         -------
         >>> folder = 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2023'
         >>> file_path = folder / "ABAS_kommune_flate_p2023_v1.parquet"
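The `fs` property resolves the file system lazily and can be overridden by assignment (or via the `fs=` argument to `__new__`). A daplapath-independent stand-in showing the same property pattern:

```python
class DemoPath:
    """Stand-in illustrating the lazy fs property pattern from the hunk above."""

    def __init__(self, fs=None):
        self._fs = fs

    def _fs_constructor(self):
        return {"backend": "default"}  # stand-in for config.fs

    @property
    def fs(self):
        if self._fs is None:           # built lazily on first access
            self._fs = self._fs_constructor()
        return self._fs

    @fs.setter
    def fs(self, val):                 # or injected explicitly
        self._fs = val


p = DemoPath()
print(p.fs)                  # {'backend': 'default'}
p.fs = {"backend": "custom"}
print(p.fs)                  # {'backend': 'custom'}
```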
@@ -807,13 +884,12 @@ class Path(str, _PathBase):
         return self._new(f"{self}/{as_str(other)}")
 
     def __getattribute__(self, name):
-        """
+        """Stackoverflow hack to ensure we return Path when using string methods.
 
         It works for all but the string magigmethods, importantly __add__.
         """
-
         # skip magic methods
-        if name not in dir(str) or name.startswith("__") and name.endswith("__"):
+        if name not in dir(str) or (name.startswith("__") and name.endswith("__")):
             return super().__getattribute__(name)
 
         def method(self, *args, **kwargs):
@@ -832,35 +908,35 @@ class Path(str, _PathBase):
         return method.__get__(self)
 
     def __getattr__(self, attr: str) -> Any:
-        """Get
+        """Get fs attribute."""
         error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
         if attr.startswith("_"):
             raise AttributeError(error_message)
-        if attr not in self.
+        if attr not in self._fs_attrs:
             raise AttributeError(error_message)
-        return functools.partial(getattr(self.
+        return functools.partial(getattr(self.fs, attr), self)
 
     def __fspath__(self) -> str:
         return str(self)
 
     def __dir__(self) -> list[str]:
-        return list(sorted({x for x in dir(Path)} | self.
+        return list(sorted({x for x in dir(Path)} | self._fs_attrs))
 
     def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
         series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
         for path in series:
-            path.
+            path._fs = self._fs
         return self._iterable_type(series, **kwargs)
 
     def _new(self, new_path: str | Path) -> "Path":
-        return self.__class__(new_path, self.
+        return self.__class__(new_path, self.fs)
 
     def _cp_or_mv(self, destination: "Path | str", attr: str) -> "Path":
-        func: Callable = getattr(self.
+        func: Callable = getattr(self.fs, attr)
         try:
             func(self, destination)
         except FileNotFoundError:
-            destination = self.__class__(destination)
+            destination = self.__class__(destination, self.fs)
             sources = list(self.glob("**").files)
             destinations = [path.replace(self, destination) for path in sources]
             with ThreadPoolExecutor() as executor:
@@ -921,7 +997,7 @@ class PathSeries(pd.Series, _PathBase):
     names: Series
         The names of the file paths.
 
-    Methods
+    Methods:
     -------
     tree():
        con
@@ -961,18 +1037,24 @@ class PathSeries(pd.Series, _PathBase):
             data is not None
             and len(data)
             and not (
-
-
-
-
+                (
+                    isinstance(data, pd.Series)
+                    and len(data.index.names) == len(self._index_names)
+                )
+                or (
+                    isinstance(index, pd.MultiIndex)
+                    and len(index.names) == len(self._index_names)
+                )
                 # dict with e.g. tuple keys, turned into MultiIndex
-                or
-
+                or (
+                    is_dict_like(data)
+                    and all(len(key) == len(self._index_names) for key in data.keys())
+                )
             )
         )
         if should_construct_index:
-
-            data = _get_paths_and_index([
+            fs = kwargs.get("fs", self._fs_constructor())
+            data = _get_paths_and_index([fs.info(path) for path in data])
 
         super().__init__(data, index=index, **kwargs)
 
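The rewritten check decides whether the incoming data already carries the expected index levels (`INDEX_NAMES = ["timestamp", "mb", "type"]`). A standalone sketch of the dict-with-tuple-keys branch, with made-up values:

```python
import pandas as pd
from pandas.api.types import is_dict_like

index_names = ["timestamp", "mb", "type"]
data = {
    ("2024-01-01", 1.2, "file"): "a.parquet",
    ("2024-01-02", 3.4, "file"): "b.parquet",
}

if is_dict_like(data) and all(len(key) == len(index_names) for key in data.keys()):
    series = pd.Series(data)           # tuple keys become a MultiIndex
    series.index.names = index_names
    print(series)
```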
@@ -1338,7 +1420,7 @@ def _pathseries_constructor_with_fallback(
     max_parts: int | None = 2,
     path_series_type: type | None = None,
     **kwargs,
-) ->
+) -> PathSeries | pd.Series:
     path_series_type = path_series_type or PathSeries
 
     kwargs["name"] = kwargs.pop("name", "path")
@@ -1372,7 +1454,7 @@ def _pathseries_constructor_with_fallback(
     return series
 
 
-def _dataframe_constructor(data=None, index=None, **kwargs) ->
+def _dataframe_constructor(data=None, index=None, **kwargs) -> pd.DataFrame:
     data.name = "path"
     return pd.DataFrame(data, index=index, **kwargs)
 
@@ -1396,16 +1478,15 @@ def split_path_and_make_copyable_html(
         split: Text pattern to split the path on. Defaults to "/".
         display_prefix: The text to display instead of the parent directory. Defaults to ".../".
 
-    Returns
+    Returns:
     -------
     A string that holds the HTML and JavaScript code to be passed to IPython.display.display.
     """
-
-
-function copyToClipboard(text, event) {{
+    copy_to_clipboard_js = """<script>
+function copyToClipboard(text, event) {
     event.preventDefault();
     navigator.clipboard.writeText(text)
-    .then(() => {
+    .then(() => {
         const alertBox = document.createElement('div');
         const selection = window.getSelection();
 
@@ -1418,14 +1499,14 @@ function copyToClipboard(text, event) {{
         alertBox.innerHTML = 'Copied to clipboard';
         document.body.appendChild(alertBox);
 
-        setTimeout(function() {
+        setTimeout(function() {
             alertBox.style.display = 'none';
-        }
-        }
-    .catch(err => {
+        }, 1500); // 1.5 seconds
+    })
+    .catch(err => {
         console.error('Could not copy text: ', err);
-    }
-    }
+    });
+}
 </script>"""
 
     if split is not None:
@@ -1640,28 +1721,28 @@ def get_schema(file) -> pyarrow.Schema:
     # try:
     #     return ds.dataset(file).schema
     # except (TypeError, FileNotFoundError) as e:
-        if not hasattr(file, "
+        if not hasattr(file, "fs"):
             raise e
 
-
+        fs = file.fs
 
         def _get_schema(path):
            try:
                 return pq.read_schema(path)
             except FileNotFoundError as e:
                 try:
-                    with
+                    with fs.open(path, "rb") as f:
                         return pq.read_schema(f)
                 except Exception as e2:
                     raise e2.__class__(f"{e2}. {path}") from e
 
-        child_paths =
+        child_paths = fs.glob(file + "/**/*.parquet")
         if not len(child_paths):
             raise e.__class__(f"{e}: {file}") from e
 
         with ThreadPoolExecutor() as executor:
             schemas: list[pyarrow.Schema] = list(
-                executor.map(_get_schema,
+                executor.map(_get_schema, fs.glob(file + "/**/*.parquet"))
             )
             if not schemas:
                 raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
@@ -1740,15 +1821,14 @@ def sort_by_period(paths: Iterable[str]) -> Iterable[str]:
     except ValueError:
         # select last period
         periods = [pd.Timestamp(next(iter(reversed(path.periods)))) for path in paths]
-        combined = list(zip(periods, range(len(paths)),
+        combined = list(zip(periods, paths, list(range(len(paths))), strict=True))
     combined.sort()
-    indices: list[int] = [x[
+    indices: list[int] = [x[2] for x in combined]
     try:
         return paths.iloc[indices]
     except AttributeError:
-        return paths.__class__([x[
+        return paths.__class__([x[1] for x in combined])
 
 
 np_str_contains: Callable = np.vectorize(str.__contains__)
-np_str_endswith: Callable = np.vectorize(str.endswith)
 np_str_matches: Callable = np.vectorize(lambda txt, pat: bool(re.search(pat, txt)))
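The fixed fallback in `sort_by_period` sorts `(period, path, position)` triples with `strict=True`, then uses the positions for `.iloc` or the paths themselves for plain lists. A standalone sketch with made-up file names (Python 3.10+ for `strict=`):

```python
import pandas as pd

paths = ["file_p2021_v1.parquet", "file_p2019_v1.parquet", "file_p2020_v1.parquet"]
periods = [pd.Timestamp("2021"), pd.Timestamp("2019"), pd.Timestamp("2020")]

combined = list(zip(periods, paths, range(len(paths)), strict=True))
combined.sort()
indices = [x[2] for x in combined]   # positions, for PathSeries.iloc
ordered = [x[1] for x in combined]   # fallback for plain lists
print(indices)  # [1, 2, 0]
print(ordered)
```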
{daplapath-2.1.2 → daplapath-2.1.4}/LICENSE.md: file without changes
{daplapath-2.1.2 → daplapath-2.1.4}/README.md: file without changes