daplapath 2.1.3__tar.gz → 2.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {daplapath-2.1.3 → daplapath-2.1.4}/PKG-INFO +1 -1
- daplapath-2.1.4/daplapath/__init__.py +1 -0
- {daplapath-2.1.3 → daplapath-2.1.4}/daplapath/path.py +221 -135
- {daplapath-2.1.3 → daplapath-2.1.4}/pyproject.toml +1 -1
- daplapath-2.1.3/daplapath/__init__.py +0 -3
- {daplapath-2.1.3 → daplapath-2.1.4}/LICENSE.md +0 -0
- {daplapath-2.1.3 → daplapath-2.1.4}/README.md +0 -0
daplapath-2.1.4/daplapath/__init__.py

@@ -0,0 +1 @@
+from .path import LocalFileSystem, Path, PathSeries, config
{daplapath-2.1.3 → daplapath-2.1.4}/daplapath/path.py

@@ -1,30 +1,30 @@
 from __future__ import annotations
-
+
+import datetime
 import functools
-from collections.abc import Iterable
-from concurrent.futures import ThreadPoolExecutor
 import glob
+import inspect
+import io
+import itertools
 import json
+import os
 import pathlib
-from pathlib import PurePosixPath, PurePath
 import re
-import io
-import os
 import shutil
-from
-import
-import
+from collections.abc import Callable, Iterable
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import PurePath
+from typing import Any

-from fsspec.spec import AbstractFileSystem
-import datetime
 import numpy as np
 import pandas as pd
 import pandas.io.formats.format as fmt
-from pandas.api.types import is_dict_like
 import pyarrow
-import pyarrow.parquet as pq
 import pyarrow.dataset as ds
-
+import pyarrow.parquet as pq
+from fsspec.spec import AbstractFileSystem
+from pandas.api.types import is_dict_like

 try:
     import gcsfs
@@ -48,10 +48,18 @@ PERIOD_PREFIX = "_p"
 INDEX_NAMES = ["timestamp", "mb", "type"]


-@dataclass
+@dataclass(slots=True)
 class Config:
-
-
+    fs: Callable
+    team: str | None
+    env: str
+    default_protocol: str = "gs"
+    bucket_pattern: str = "{default_protocol}://ssb-{team}-data-{bucket}-prod"
+
+    def __getitem__(self, key: str) -> Any:
+        if not hasattr(self, key):
+            raise KeyError(key)
+        return getattr(self, key)


 class LocalFileSystem(AbstractFileSystem):
@@ -72,7 +80,7 @@ class LocalFileSystem(AbstractFileSystem):
         if not detail:
             return list(relevant_paths)
         with ThreadPoolExecutor() as executor:
-            return
+            return {x["name"]: x for x in executor.map(get_file_info, relevant_paths)}

     @classmethod
     def ls(cls, path: str, detail: bool = False, **kwargs):
@@ -109,6 +117,10 @@ class LocalFileSystem(AbstractFileSystem):
     def rm_file(path: str, *args, **kwargs) -> None:
         return os.remove(path, *args, **kwargs)

+    @staticmethod
+    def rm(path: str, *args, **kwargs) -> None:
+        return os.remove(path, *args, **kwargs)
+
     @staticmethod
     def rmdir(path: str, *args, **kwargs) -> None:
         return shutil.rmtree(path, *args, **kwargs)
@@ -139,9 +151,19 @@ class MyGCSFileSystem(gcsfs.GCSFileSystem):


 if any("dapla" in key.lower() for key in os.environ) and "gcsfs" in locals():
-
+    _fs = MyGCSFileSystem()
 else:
-
+    _fs = LocalFileSystem()
+
+config = Config(
+    fs=_fs,
+    team=os.environ.get("DAPLA_GROUP_CONTEXT", "")
+    .replace("-developers", "")
+    .replace("-data-admins", "")
+    or None,
+    env=os.environ.get("DAPLA_ENVIRONMENT", "prod").lower(),
+)
+del _fs


 class Tree:
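For orientation, a minimal sketch of how the module-level config created above can be adjusted at runtime, assuming the package is installed; the team name is made up, and only fields defined on the new Config dataclass are touched:

# Hedged usage sketch; the team name is hypothetical.
from daplapath import config

config.team = "areal"        # normally derived from the DAPLA_GROUP_CONTEXT environment variable
config.default_protocol      # "gs" by default
config.bucket_pattern        # "{default_protocol}://ssb-{team}-data-{bucket}-prod"
config["env"]                # __getitem__ delegates to getattr and raises KeyError for unknown keys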
@@ -177,12 +199,12 @@ class _PathBase:
     _period_prefix: str = PERIOD_PREFIX

     @staticmethod
-    def
+    def set_config(pat: str, value: Any) -> None:
         """Change config variable."""
-        setattr(
+        setattr(config, pat, value)

     @property
-    def
+    def _fs_constructor(self) -> Callable | type:
         """Can be overridden in subclass.

         Must return a function or a class that, when called,
@@ -191,13 +213,13 @@ class _PathBase:
         The 'info' method should return a dict like with at least the keys
         'updated', 'size', 'name' and 'type'.
         """
-        return
+        return config.fs


 class Path(str, _PathBase):
     """Path object that works like a string, with methods for working with the GCS file system."""

-
+    _fs_attrs: set[str] = {
         "info",
         "isdir",
         "open",
@@ -214,29 +236,71 @@ class Path(str, _PathBase):
         return PathSeries

     @staticmethod
-    def _standardize_path(path: str |
+    def _standardize_path(path: str | PurePath) -> str:
         """Make sure delimiter is '/' and path ends without '/'."""
         return str(path).replace("\\", "/").replace(r"\"", "/")

-    def __new__(cls, gcs_path: str |
+    def __new__(cls, gcs_path: str | os.PathLike | None = None, fs=None):
         """Construct Path with '/' as delimiter."""
         gcs_path = cls._standardize_path(gcs_path or "")
         obj = super().__new__(cls, gcs_path)
-
-
+        if fs is not None:
+            obj._fs = fs
+        elif gcs_path.startswith("/buckets"):
+            obj._fs = LocalFileSystem()
+        else:
+            obj._fs = config.fs
         return obj

     def buckets_path(self) -> "Path":
-        if self.startswith("/buckets"):
-            return self
-
-        root = self.parts[0]
-        bucket = root.split("-data-")[-1].split("-prod")[0]
-
         try:
-
-        except
-
+            protocol, _ = str(self).split("://")
+        except ValueError:
+            protocol = ""
+        root, *subdirs = str(self).replace(f"{protocol}://", "").split("/")
+        bucket = root.split("-data-")[-1].split(f"-{config.env}")[0]
+        if config.team in root:
+            new_root = "/buckets"
+            team = config.team
+        else:
+            team = root.split("-data-")[0]
+            team = team.lstrip(team.split("-")[0]).strip("-")
+            bucket = bucket.replace("delt-", "")
+            new_root = f"/buckets/shared/{team}"
+
+        subdirs = "/".join(subdirs).strip("/")
+        if subdirs:
+            return self.__class__(f"{new_root}/{bucket}/{subdirs}", self.fs)
+        else:
+            return self.__class__(f"{new_root}/{bucket}", self.fs)
+
+    def gs_path(self) -> "Path":
+        if not str(self).startswith("/buckets/"):
+            raise ValueError(
+                f"Can only convert paths starting with '/buckets/' to GCS path. Got {self}"
+            )
+        if "/shared/" in str(self):
+            team, bucket, *subdirs = str(self).split("/shared/")[1].split("/")
+            bucket = "delt-" + bucket
+        elif not config.team:
+            raise ValueError(
+                "Must set config.team (hint: from daplapath import config; config.team = 'name')"
+            )
+        else:
+            team = config.team
+            bucket, *subdirs = str(self).replace("/buckets/", "").split("/")
+
+        gspath = (
+            config.bucket_pattern.format(
+                team=team,
+                bucket=bucket,
+                env=config.env,
+                default_protocol=config.default_protocol,
+            )
+            + "/"
+            + "/".join(subdirs)
+        )
+        return self.__class__(gspath, self.fs)

     def tree(
         self,
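As an illustration of the two conversion methods added above, a hedged sketch of a round trip between a gs:// path and a /buckets path; the bucket and team names are made up, and the exact result depends on config.team, config.env and config.bucket_pattern:

# Illustrative sketch only; names are hypothetical and results depend on the runtime config.
from daplapath import Path, config

config.team = "areal"  # hypothetical team
gs = Path("gs://ssb-areal-data-delt-kart-prod/analyse_data/2023/fil_p2023_v1.parquet")
local = gs.buckets_path()   # expected shape: /buckets/<bucket>/<subdirs> for the configured team
back = local.gs_path()      # rebuilt from config.bucket_pattern, so it should round-trip to a gs:// path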
@@ -268,7 +332,6 @@ class Path(str, _PathBase):
         self, pattern: str | None = None, recursive: bool = True, **kwargs
     ) -> "PathSeries":
         """Create PathSeries of files/directories that match the pattern."""
-
         recursive = kwargs.get("recurse_symlinks", recursive)

         if pattern:
@@ -284,17 +347,17 @@ class Path(str, _PathBase):

         kwargs["detail"] = True

-        if "recursive" in get_arguments(self.
+        if "recursive" in get_arguments(self.fs.glob):
             kwargs["recursive"] = recursive
         else:
-            # try to set to non-recursive if
+            # try to set to non-recursive if fs.glob allows argument 'maxdepth'
             kwargs["maxdepth"] = None if recursive else 1

         try:
-            info: list[dict] | dict = self.
+            info: list[dict] | dict = self.fs.glob(pattern, **kwargs)
         except TypeError:
             kwargs.pop("maxdepth", None)
-            info: list[dict] | dict = self.
+            info: list[dict] | dict = self.fs.glob(pattern, **kwargs)

         if isinstance(info, dict):
             # file system can return single dict if only one file path
@@ -318,10 +381,17 @@ class Path(str, _PathBase):
         """
         return self.glob("**", recursive=recursive, **kwargs)

+    def unlink(self, missing_ok: bool = False) -> None:
+        if not self.exists():
+            if not missing_ok:
+                raise FileNotFoundError(str(self))
+            return
+        return self.fs.rm(recursive=False)
+
     def rmdir(self) -> None:
         files = self.glob("**").files
         with ThreadPoolExecutor() as executor:
-            list(executor.map(self.
+            list(executor.map(self.fs.rm_file, files))

     def cp(self, destination: "Path | str") -> "Path":
         return self._cp_or_mv(destination, "cp")
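The new unlink follows pathlib's signature; a small hedged sketch (the file path is hypothetical):

# Hypothetical sketch of the Path.unlink added in 2.1.4.
from daplapath import Path

f = Path("/buckets/produkt/temp/fil_v1.parquet")  # made-up path
f.unlink(missing_ok=True)   # returns quietly if the file does not exist
# f.unlink()                # without missing_ok=True a missing file raises FileNotFoundError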
@@ -331,28 +401,25 @@ class Path(str, _PathBase):
         out_path = self._cp_or_mv(destination, "mv")
         if was_dir:
             try:
-                self.
+                self.fs.rmdir(str(self))
             except (FileNotFoundError, NotADirectoryError):
                 pass
         return out_path

-    def read_text(self, *args, **kwargs):
-        return self._path.read_text(*args, **kwargs)
-
     def versions(self, include_versionless: bool = False) -> "PathSeries":
         """Returns a PathSeries of all versions of the file."""
-        files_in_folder: Iterable[Path] = self.parent.glob(
+        files_in_folder: Iterable[Path] = self.parent.glob(
+            f"*{self.suffix}", recursive=False
+        )

         if self.version_number:
-            start, _
+            start, *_ = re.split(self._version_pattern, self.name)
         else:
-            start
+            start = self.stem

         # create boolean mask. With numpy to make it work with both pandas and list
         arr = np.array(files_in_folder)
-        is_version_of_this_file =
-            np_str_endswith(arr, end)
-        )
+        is_version_of_this_file = np_str_contains(arr, start)
         if not include_versionless:
             is_version_of_this_file &= np_str_matches(arr, self._version_pattern)

@@ -376,7 +443,7 @@ class Path(str, _PathBase):
         Lists files in the parent directory with the same versionless stem
         and selects the one with the highest version number.

-        Returns
+        Returns:
         -------
         A Path.
         """
@@ -399,11 +466,11 @@ class Path(str, _PathBase):
             Minutes needed between the timestamp of the current highest
             numbered version.

-        Returns
+        Returns:
         ------
         A Path with a new version number.

-        Raises
+        Raises:
         ------
         ValueError:
             If the method is run before the timeout period is up.
@@ -419,7 +486,7 @@ class Path(str, _PathBase):
         time_should_be_at_least = pd.Timestamp.now(tz="Europe/Oslo").replace(
             tzinfo=None
         ).round("s") - pd.Timedelta(minutes=timeout)
-        if timestamp > time_should_be_at_least:
+        if timestamp is not None and timestamp > time_should_be_at_least:
             raise ValueError(
                 f"Latest version of the file was updated {timestamp[0]}, which "
                 f"is less than the timeout period of {timeout} minutes. "
@@ -433,7 +500,7 @@ class Path(str, _PathBase):
     def with_version(self, version: int | None) -> "Path":
         """Replace the Path's version number, if any, with a new version number.

-        Examples
+        Examples:
         --------
         >>> Path('file.parquet').with_version(1)
         'file_v1.parquet'
@@ -450,13 +517,13 @@ class Path(str, _PathBase):
         self, include_versionless: bool = False
     ) -> "PathSeries":
         """Returns a PathSeries of all periods of the file."""
-        files_in_folder: Iterable[Path] = self.parent.glob(
+        files_in_folder: Iterable[Path] = self.parent.glob(
+            f"*{self.suffix}", recursive=False
+        )

         # create boolean mask. With numpy to make it work with both pandas and list
         arr = np.array(files_in_folder)
-        is_version_of_this_file = (
-            np_str_contains(arr, self.periodless_stem)
-        ) & np_str_endswith(arr, self.suffix)
+        is_version_of_this_file = np_str_contains(arr, self.periodless_stem)
         if not include_versionless:
             is_version_of_this_file &= np_str_matches(arr, self._version_pattern)

@@ -480,12 +547,12 @@ class Path(str, _PathBase):
         Lists files in the parent directory with the same
         versionless and periodless stem and selects the path that sorts last.

-        Raises
+        Raises:
         ------
         ValueError: If there is mismatch in period patterns, e.g. if one
             path has the period "2020-01-01" and one path has "2021".

-        Returns
+        Returns:
         -------
         A Path.
         """
@@ -494,14 +561,14 @@ class Path(str, _PathBase):
                 include_versionless=False
             )
             sorted_paths = sort_by_period(period_paths)
-            return
+            return list(sorted_paths)[-1]
         except (IndexError, StopIteration) as e:
             raise FileNotFoundError(self) from e

     def with_period(self, period: str) -> "Path":
         """Replace the Path's period, if any, with a new periods.

-        Examples
+        Examples:
         --------
         >>> Path('file_v1.parquet').with_period("2024-01-01")
         'file_p2024-01-01_v1.parquet'
@@ -517,7 +584,7 @@ class Path(str, _PathBase):
     def with_periods(self, from_period: str, to_period: str | None = None) -> "Path":
         """Replace the Path's period, if any, with one or two new periods.

-        Examples
+        Examples:
         --------
         >>> Path('file_v1.parquet').with_periods("2024-01-01")
         'file_p2024-01-01_v1.parquet'
@@ -585,46 +652,57 @@ class Path(str, _PathBase):
     @property
     def periodless_stem(self) -> str:
         """Return the file stem before the period pattern."""
-        return str(re.sub(f"{self._period_pattern}.*", "", self.
+        return str(re.sub(f"{self._period_pattern}.*", "", self.stem))

     @property
     def versionless_stem(self) -> str:
         """Return the file stem before the version pattern."""
-        return self._new(re.split(self._version_pattern, self.
+        return self._new(re.split(self._version_pattern, self.name)[0]).stem

     @property
     def parent(self) -> "Path":
         """Parent path."""
-        return self._new(self.
+        return self._new("/".join(self.split("/")[:-1]))

     @property
     def parents(self) -> "list[Path]":
         """Parent path."""
-
+        no_protocol = self.split("://")[-1]
+        return [
+            self._new("/".join(no_protocol.split("/")[:i]))
+            for i in range(no_protocol.count("/"))
+        ][::-1]

     @property
     def name(self) -> str:
         """Final part of the path."""
-        return self.
+        return self.split("/")[-1]

     @property
     def stem(self) -> str:
         """File name without the suffix"""
-        return self.
+        return self.split("/")[-1].replace(self.suffix, "")

     @property
     def parts(self) -> tuple[str]:
-
+        no_protocol = self.split("://")[-1]
+        return tuple(no_protocol.split("/"))

     @property
     def suffix(self) -> str:
         """Final file path suffix."""
-
+        name = self.name
+        if "." not in name:
+            return ""
+        return "." + (name).split(".")[-1]

     @property
     def suffixes(self) -> list[str]:
         """File path suffixes, if multiple."""
-
+        name = self.name
+        if "." not in name:
+            return []
+        return ["." + suff for suff in (name).split(".")[1:]]

     @property
     def index_column_names(self) -> list[str]:
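Since the path properties above now work purely on the string, a hedged sketch of the values they are expected to yield for an example path; the path is made up and the expected values are inferred from the implementations shown:

# Hypothetical example; expected values are inferred from the string-based properties above.
from daplapath import Path

p = Path("gs://ssb-areal-data-delt-kart-prod/analyse_data/fil_p2023_v1.parquet")
p.name     # "fil_p2023_v1.parquet"  (text after the final "/")
p.suffix   # ".parquet"              ("" when the name contains no ".")
p.stem     # "fil_p2023_v1"          (name with the suffix removed)
p.parts    # protocol stripped, then split on "/"
p.parent   # everything before the final "/", protocol kept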
@@ -685,9 +763,14 @@ class Path(str, _PathBase):
         try:
             info = self._info
         except AttributeError:
-            info = self.
+            info = self.fs.info(self)
             self._info = info
-
+        try:
+            return _get_timestamps(info["updated"])
+        except Exception as e:
+            if not self.exists() or self.is_dir():
+                return None
+            raise e

     @property
     def type(self) -> str:
@@ -699,7 +782,7 @@ class Path(str, _PathBase):
         try:
             info = self._info
         except AttributeError:
-            info = self.
+            info = self.fs.info(self)
             self._info = info
         return info["size"]

@@ -764,29 +847,29 @@ class Path(str, _PathBase):
         return self.isdir()

     def with_suffix(self, suffix: str):
-        return self._new(self.
+        return self._new(self.replace(self.suffix, suffix))

     def with_name(self, new_name: str):
-        return self._new(self.
+        return self._new(self.replace(self.name, new_name))

-    def with_stem(self,
-        return self._new(self.
+    def with_stem(self, new_stem: str):
+        return self._new(self.replace(self.stem, new_stem))

     @property
-    def
-        if self.
-            self.
-        return self.
+    def fs(self):
+        if self._fs is None:
+            self._fs = self._fs_constructor()
+        return self._fs

-    @
-    def
-        self.
-        return self.
+    @fs.setter
+    def fs(self, val):
+        self._fs = val
+        return self._fs

     def __truediv__(self, other: str | os.PathLike | PurePath) -> "Path":
         """Append a string or Path to the path with a forward slash.

-        Example
+        Example:
         -------
         >>> folder = 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2023'
         >>> file_path = folder / "ABAS_kommune_flate_p2023_v1.parquet"
@@ -801,13 +884,12 @@ class Path(str, _PathBase):
         return self._new(f"{self}/{as_str(other)}")

     def __getattribute__(self, name):
-        """
+        """Stackoverflow hack to ensure we return Path when using string methods.

         It works for all but the string magigmethods, importantly __add__.
         """
-
         # skip magic methods
-        if name not in dir(str) or name.startswith("__") and name.endswith("__"):
+        if name not in dir(str) or (name.startswith("__") and name.endswith("__")):
             return super().__getattribute__(name)

         def method(self, *args, **kwargs):
@@ -826,35 +908,35 @@ class Path(str, _PathBase):
             return method.__get__(self)

     def __getattr__(self, attr: str) -> Any:
-        """Get
+        """Get fs attribute."""
         error_message = f"{self.__class__.__name__} has no attribute '{attr}'"
         if attr.startswith("_"):
             raise AttributeError(error_message)
-        if attr not in self.
+        if attr not in self._fs_attrs:
             raise AttributeError(error_message)
-        return functools.partial(getattr(self.
+        return functools.partial(getattr(self.fs, attr), self)

     def __fspath__(self) -> str:
         return str(self)

     def __dir__(self) -> list[str]:
-        return list(sorted({x for x in dir(Path)} | self.
+        return list(sorted({x for x in dir(Path)} | self._fs_attrs))

     def _iterable_constructor(self, info: list[dict], **kwargs) -> "PathSeries":
         series: pd.Series = _get_paths_and_index(info).apply(self.__class__)
         for path in series:
-            path.
+            path._fs = self._fs
         return self._iterable_type(series, **kwargs)

     def _new(self, new_path: str | Path) -> "Path":
-        return self.__class__(new_path, self.
+        return self.__class__(new_path, self.fs)

     def _cp_or_mv(self, destination: "Path | str", attr: str) -> "Path":
-        func: Callable = getattr(self.
+        func: Callable = getattr(self.fs, attr)
         try:
             func(self, destination)
         except FileNotFoundError:
-            destination = self.__class__(destination)
+            destination = self.__class__(destination, self.fs)
             sources = list(self.glob("**").files)
             destinations = [path.replace(self, destination) for path in sources]
             with ThreadPoolExecutor() as executor:
@@ -915,7 +997,7 @@ class PathSeries(pd.Series, _PathBase):
     names: Series
         The names of the file paths.

-    Methods
+    Methods:
     -------
     tree():
         con
@@ -955,18 +1037,24 @@ class PathSeries(pd.Series, _PathBase):
             data is not None
             and len(data)
             and not (
-
-
-
-
+                (
+                    isinstance(data, pd.Series)
+                    and len(data.index.names) == len(self._index_names)
+                )
+                or (
+                    isinstance(index, pd.MultiIndex)
+                    and len(index.names) == len(self._index_names)
+                )
                 # dict with e.g. tuple keys, turned into MultiIndex
-                or
-
+                or (
+                    is_dict_like(data)
+                    and all(len(key) == len(self._index_names) for key in data.keys())
+                )
             )
         )
         if should_construct_index:
-
-            data = _get_paths_and_index([
+            fs = kwargs.get("fs", self._fs_constructor())
+            data = _get_paths_and_index([fs.info(path) for path in data])

         super().__init__(data, index=index, **kwargs)

@@ -1332,7 +1420,7 @@ def _pathseries_constructor_with_fallback(
     max_parts: int | None = 2,
     path_series_type: type | None = None,
     **kwargs,
-) ->
+) -> PathSeries | pd.Series:
     path_series_type = path_series_type or PathSeries

     kwargs["name"] = kwargs.pop("name", "path")
@@ -1366,7 +1454,7 @@ def _pathseries_constructor_with_fallback(
     return series


-def _dataframe_constructor(data=None, index=None, **kwargs) ->
+def _dataframe_constructor(data=None, index=None, **kwargs) -> pd.DataFrame:
     data.name = "path"
     return pd.DataFrame(data, index=index, **kwargs)

@@ -1390,16 +1478,15 @@ def split_path_and_make_copyable_html(
         split: Text pattern to split the path on. Defaults to "/".
         display_prefix: The text to display instead of the parent directory. Defaults to ".../".

-    Returns
+    Returns:
     -------
    A string that holds the HTML and JavaScript code to be passed to IPython.display.display.
     """
-
-
-function copyToClipboard(text, event) {{
+    copy_to_clipboard_js = """<script>
+function copyToClipboard(text, event) {
     event.preventDefault();
     navigator.clipboard.writeText(text)
-        .then(() => {
+        .then(() => {
             const alertBox = document.createElement('div');
             const selection = window.getSelection();

@@ -1412,14 +1499,14 @@ function copyToClipboard(text, event) {{
             alertBox.innerHTML = 'Copied to clipboard';
             document.body.appendChild(alertBox);

-            setTimeout(function() {
+            setTimeout(function() {
                 alertBox.style.display = 'none';
-            }
-        }
-        .catch(err => {
+            }, 1500); // 1.5 seconds
+        })
+        .catch(err => {
             console.error('Could not copy text: ', err);
-        }
-    }
+        });
+    }
 </script>"""

     if split is not None:
@@ -1634,28 +1721,28 @@ def get_schema(file) -> pyarrow.Schema:
     # try:
     #     return ds.dataset(file).schema
     # except (TypeError, FileNotFoundError) as e:
-    if not hasattr(file, "
+    if not hasattr(file, "fs"):
         raise e

-
+    fs = file.fs

     def _get_schema(path):
         try:
             return pq.read_schema(path)
         except FileNotFoundError as e:
             try:
-                with
+                with fs.open(path, "rb") as f:
                     return pq.read_schema(f)
             except Exception as e2:
                 raise e2.__class__(f"{e2}. {path}") from e

-    child_paths =
+    child_paths = fs.glob(file + "/**/*.parquet")
     if not len(child_paths):
         raise e.__class__(f"{e}: {file}") from e

     with ThreadPoolExecutor() as executor:
         schemas: list[pyarrow.Schema] = list(
-            executor.map(_get_schema,
+            executor.map(_get_schema, fs.glob(file + "/**/*.parquet"))
         )
         if not schemas:
             raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
@@ -1734,15 +1821,14 @@ def sort_by_period(paths: Iterable[str]) -> Iterable[str]:
     except ValueError:
         # select last period
         periods = [pd.Timestamp(next(iter(reversed(path.periods)))) for path in paths]
-        combined = list(zip(periods, range(len(paths)),
+        combined = list(zip(periods, paths, list(range(len(paths))), strict=True))
         combined.sort()
-        indices: list[int] = [x[
+        indices: list[int] = [x[2] for x in combined]
     try:
         return paths.iloc[indices]
     except AttributeError:
-        return paths.__class__([x[
+        return paths.__class__([x[1] for x in combined])


 np_str_contains: Callable = np.vectorize(str.__contains__)
-np_str_endswith: Callable = np.vectorize(str.endswith)
 np_str_matches: Callable = np.vectorize(lambda txt, pat: bool(re.search(pat, txt)))
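A brief hedged sketch of how these module-level vectorized helpers build the boolean masks used in versions() and periods(); the file names and the regex are made up for illustration:

# Illustration only; file names and the pattern are hypothetical.
import numpy as np
from daplapath.path import np_str_contains, np_str_matches

arr = np.array(["fil_p2023_v1.parquet", "fil_p2023_v2.parquet", "annen_fil.parquet"])
mask = np_str_contains(arr, "fil_p2023")   # elementwise str.__contains__
mask &= np_str_matches(arr, r"_v\d+\.")    # elementwise regex search
arr[mask]                                  # the two versioned files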
{daplapath-2.1.3 → daplapath-2.1.4}/LICENSE.md: File without changes
{daplapath-2.1.3 → daplapath-2.1.4}/README.md: File without changes