lamindb_setup-1.8.3-py3-none-any.whl → lamindb_setup-1.9.1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (39)
  1. lamindb_setup/__init__.py +107 -107
  2. lamindb_setup/_cache.py +87 -87
  3. lamindb_setup/_check_setup.py +166 -166
  4. lamindb_setup/_connect_instance.py +328 -342
  5. lamindb_setup/_delete.py +141 -141
  6. lamindb_setup/_disconnect.py +32 -32
  7. lamindb_setup/_init_instance.py +440 -440
  8. lamindb_setup/_migrate.py +266 -259
  9. lamindb_setup/_register_instance.py +35 -35
  10. lamindb_setup/_schema_metadata.py +441 -441
  11. lamindb_setup/_set_managed_storage.py +70 -70
  12. lamindb_setup/_setup_user.py +133 -133
  13. lamindb_setup/core/__init__.py +21 -21
  14. lamindb_setup/core/_aws_options.py +223 -211
  15. lamindb_setup/core/_hub_client.py +248 -243
  16. lamindb_setup/core/_hub_core.py +665 -663
  17. lamindb_setup/core/_hub_crud.py +227 -227
  18. lamindb_setup/core/_private_django_api.py +83 -83
  19. lamindb_setup/core/_settings.py +377 -364
  20. lamindb_setup/core/_settings_instance.py +569 -568
  21. lamindb_setup/core/_settings_load.py +141 -141
  22. lamindb_setup/core/_settings_save.py +95 -95
  23. lamindb_setup/core/_settings_storage.py +429 -429
  24. lamindb_setup/core/_settings_store.py +91 -91
  25. lamindb_setup/core/_settings_user.py +55 -55
  26. lamindb_setup/core/_setup_bionty_sources.py +44 -44
  27. lamindb_setup/core/cloud_sqlite_locker.py +240 -240
  28. lamindb_setup/core/django.py +305 -291
  29. lamindb_setup/core/exceptions.py +1 -1
  30. lamindb_setup/core/hashing.py +134 -134
  31. lamindb_setup/core/types.py +1 -1
  32. lamindb_setup/core/upath.py +1013 -1009
  33. lamindb_setup/errors.py +70 -70
  34. lamindb_setup/types.py +20 -20
  35. {lamindb_setup-1.8.3.dist-info → lamindb_setup-1.9.1.dist-info}/METADATA +1 -1
  36. lamindb_setup-1.9.1.dist-info/RECORD +50 -0
  37. lamindb_setup-1.8.3.dist-info/RECORD +0 -50
  38. {lamindb_setup-1.8.3.dist-info → lamindb_setup-1.9.1.dist-info}/LICENSE +0 -0
  39. {lamindb_setup-1.8.3.dist-info → lamindb_setup-1.9.1.dist-info}/WHEEL +0 -0
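The hunk that follows matches the line counts of item 32 (lamindb_setup/core/upath.py, 1009 → 1013 lines). As a hedged sketch — the wheels/ directory and file names below are assumptions, not part of this diff — the same wheel-to-wheel comparison can be reproduced locally along these lines:

# Assumes both wheels were fetched first, e.g.:
#   pip download lamindb_setup==1.8.3 --no-deps -d wheels/
#   pip download lamindb_setup==1.9.1 --no-deps -d wheels/
import difflib
import zipfile

OLD = "wheels/lamindb_setup-1.8.3-py3-none-any.whl"
NEW = "wheels/lamindb_setup-1.9.1-py3-none-any.whl"
MEMBER = "lamindb_setup/core/upath.py"


def read_member(wheel: str, member: str) -> list[str]:
    # a wheel is a zip archive, so module sources can be read directly
    with zipfile.ZipFile(wheel) as zf:
        return zf.read(member).decode("utf-8").splitlines(keepends=True)


print(
    "".join(
        difflib.unified_diff(
            read_member(OLD, MEMBER),
            read_member(NEW, MEMBER),
            fromfile=f"1.8.3/{MEMBER}",
            tofile=f"1.9.1/{MEMBER}",
        )
    )
)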
@@ -1,1009 +1,1013 @@
1
- # we are not documenting UPath here because it's documented at lamindb.UPath
2
- """Paths & file systems."""
3
-
4
- from __future__ import annotations
5
-
6
- import os
7
- import warnings
8
- from collections import defaultdict
9
- from datetime import datetime, timezone
10
- from functools import partial
11
- from itertools import islice
12
- from pathlib import Path, PosixPath, PurePosixPath, WindowsPath
13
- from typing import TYPE_CHECKING, Any, Literal
14
- from urllib.parse import parse_qs, urlsplit
15
-
16
- import fsspec
17
- from lamin_utils import logger
18
- from upath import UPath
19
- from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
20
- from upath.implementations.local import LocalPath
21
- from upath.registry import register_implementation
22
-
23
- from lamindb_setup.errors import StorageNotEmpty
24
-
25
- from ._aws_options import HOSTED_BUCKETS, get_aws_options_manager
26
- from ._deprecated import deprecated
27
- from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list, hash_string
28
-
29
- if TYPE_CHECKING:
30
- from lamindb_setup.types import UPathStr
31
-
32
- LocalPathClasses = (PosixPath, WindowsPath, LocalPath)
33
-
34
- # also see https://gist.github.com/securifera/e7eed730cbe1ce43d0c29d7cd2d582f4
35
- # ".gz" is not listed here as it typically occurs with another suffix
36
- # the complete list is at lamindb.core.storage._suffixes
37
- VALID_SIMPLE_SUFFIXES = {
38
- #
39
- # without readers
40
- #
41
- ".fasta",
42
- ".fastq",
43
- ".jpg",
44
- ".mtx",
45
- ".obo",
46
- ".pdf",
47
- ".png",
48
- ".tar",
49
- ".tiff",
50
- ".txt",
51
- ".tsv",
52
- ".zip",
53
- ".xml",
54
- ".qs", # https://cran.r-project.org/web/packages/qs/vignettes/vignette.html
55
- ".rds",
56
- ".pt",
57
- ".pth",
58
- ".ckpt",
59
- ".state_dict",
60
- ".keras",
61
- ".pb",
62
- ".pbtxt",
63
- ".savedmodel",
64
- ".pkl",
65
- ".pickle",
66
- ".bin",
67
- ".safetensors",
68
- ".model",
69
- ".mlmodel",
70
- ".mar",
71
- #
72
- # with readers (see below)
73
- #
74
- ".h5ad",
75
- ".parquet",
76
- ".csv",
77
- ".fcs",
78
- ".xslx",
79
- ".zarr",
80
- ".json",
81
- }
82
- # below gets updated within lamindb because it's frequently changing
83
- VALID_COMPOSITE_SUFFIXES = {".anndata.zarr"}
84
-
85
- TRAILING_SEP = (os.sep, os.altsep) if os.altsep is not None else os.sep
86
-
87
-
88
- def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
89
- def process_digits(suffix: str):
90
- if suffix[1:].isdigit(): # :1 to skip the dot
91
- return "" # digits are no valid suffixes
92
- else:
93
- return suffix
94
-
95
- if len(path.suffixes) <= 1:
96
- return process_digits(path.suffix)
97
-
98
- total_suffix = "".join(path.suffixes)
99
- if total_suffix in VALID_SIMPLE_SUFFIXES:
100
- return total_suffix
101
- elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
102
- # below seems slow but OK for now
103
- for suffix in VALID_COMPOSITE_SUFFIXES:
104
- if total_suffix.endswith(suffix):
105
- break
106
- return suffix
107
- else:
108
- print_hint = True
109
- arg_name = "file" if arg_name is None else arg_name # for the warning
110
- msg = f"{arg_name} has more than one suffix (path.suffixes), "
111
- # first check the 2nd-to-last suffix because it might be followed by .gz
112
- # or another compression-related suffix
113
- # Alex thought about adding logic along the lines of path.suffixes[-1]
114
- # in COMPRESSION_SUFFIXES to detect something like .random.gz and then
115
- # add ".random.gz" but concluded it's too dangerous it's safer to just
116
- # use ".gz" in such a case
117
- if path.suffixes[-2] in VALID_SIMPLE_SUFFIXES:
118
- suffix = "".join(path.suffixes[-2:])
119
- msg += f"inferring: '{suffix}'"
120
- # do not print a warning for things like .tar.gz, .fastq.gz
121
- if path.suffixes[-1] == ".gz":
122
- print_hint = False
123
- else:
124
- suffix = path.suffixes[-1] # this is equivalent to path.suffix
125
- msg += (
126
- f"using only last suffix: '{suffix}' - if you want your composite"
127
- " suffix to be recognized add it to"
128
- " lamindb.core.storage.VALID_SIMPLE_SUFFIXES.add()"
129
- )
130
- if print_hint:
131
- logger.hint(msg)
132
- return process_digits(suffix)
133
-
134
-
135
- def infer_filesystem(path: UPathStr):
136
- import fsspec # improve cold start
137
-
138
- path_str = str(path)
139
-
140
- if isinstance(path, UPath):
141
- fs = path.fs
142
- else:
143
- protocol = fsspec.utils.get_protocol(path_str)
144
- if protocol == "s3":
145
- fs_kwargs = {"cache_regions": True}
146
- else:
147
- fs_kwargs = {}
148
- fs = fsspec.filesystem(protocol, **fs_kwargs)
149
-
150
- return fs, path_str
151
-
152
-
153
- # this is needed to avoid CreateBucket permission
154
- class S3FSMap(fsspec.FSMap):
155
- def __setitem__(self, key, value):
156
- """Store value in key."""
157
- key = self._key_to_str(key)
158
- self.fs.pipe_file(key, fsspec.mapping.maybe_convert(value))
159
-
160
-
161
- def create_mapper(
162
- fs,
163
- url="",
164
- check=False,
165
- create=False,
166
- missing_exceptions=None,
167
- ):
168
- if fsspec.utils.get_protocol(url) == "s3":
169
- return S3FSMap(
170
- url, fs, check=check, create=False, missing_exceptions=missing_exceptions
171
- )
172
- else:
173
- return fsspec.FSMap(
174
- url, fs, check=check, create=create, missing_exceptions=missing_exceptions
175
- )
176
-
177
-
178
- def print_hook(size: int, value: int, objectname: str, action: str):
179
- if size == 0:
180
- progress_in_percent = 100.0
181
- else:
182
- progress_in_percent = (value / size) * 100
183
- out = f"... {action} {objectname}:" f" {min(progress_in_percent, 100):4.1f}%"
184
- if "NBPRJ_TEST_NBPATH" not in os.environ:
185
- end = "\n" if progress_in_percent >= 100 else "\r"
186
- print(out, end=end)
187
-
188
-
189
- class ProgressCallback(fsspec.callbacks.Callback):
190
- def __init__(
191
- self,
192
- objectname: str,
193
- action: Literal["uploading", "downloading", "synchronizing"],
194
- adjust_size: bool = False,
195
- ):
196
- assert action in {"uploading", "downloading", "synchronizing"}
197
-
198
- super().__init__()
199
-
200
- self.action = action
201
- print_progress = partial(print_hook, objectname=objectname, action=action)
202
- self.hooks = {"print_progress": print_progress}
203
-
204
- self.adjust_size = adjust_size
205
-
206
- def absolute_update(self, value):
207
- pass
208
-
209
- def relative_update(self, inc=1):
210
- pass
211
-
212
- def update_relative_value(self, inc=1):
213
- if inc != 0:
214
- self.value += inc
215
- self.call()
216
- else:
217
- # this is specific to http filesystem
218
- # for some reason the last update is 0 always
219
- # sometimes the reported result is less that 100%
220
- # here 100% is forced manually in this case
221
- if self.value < 1.0 and self.value >= 0.999:
222
- self.value = self.size
223
- self.call()
224
-
225
- def branch(self, path_1, path_2, kwargs):
226
- if self.adjust_size:
227
- if Path(path_2 if self.action != "uploading" else path_1).is_dir():
228
- self.size -= 1
229
- kwargs["callback"] = ChildProgressCallback(self)
230
-
231
- def branched(self, path_1, path_2, **kwargs):
232
- self.branch(path_1, path_2, kwargs)
233
- return kwargs["callback"]
234
-
235
- def wrap(self, iterable):
236
- if self.adjust_size:
237
- paths = []
238
- for lpath, rpath in iterable:
239
- paths.append((lpath, rpath))
240
- if Path(lpath).is_dir():
241
- self.size -= 1
242
- self.adjust_size = False
243
- return paths
244
- else:
245
- return iterable
246
-
247
- @classmethod
248
- def requires_progress(
249
- cls,
250
- maybe_callback: fsspec.callbacks.Callback | None,
251
- print_progress: bool,
252
- objectname: str,
253
- action: Literal["uploading", "downloading", "synchronizing"],
254
- **kwargs,
255
- ):
256
- if maybe_callback is None:
257
- if print_progress:
258
- return cls(objectname, action, **kwargs)
259
- else:
260
- return fsspec.callbacks.NoOpCallback()
261
- return maybe_callback
262
-
263
-
264
- class ChildProgressCallback(fsspec.callbacks.Callback):
265
- def __init__(self, parent: ProgressCallback):
266
- super().__init__()
267
-
268
- self.parent = parent
269
-
270
- def parent_update(self, inc=1):
271
- self.parent.update_relative_value(inc)
272
-
273
- def relative_update(self, inc=1):
274
- if self.size != 0:
275
- self.parent_update(inc / self.size)
276
- else:
277
- self.parent_update(1)
278
-
279
-
280
- def download_to(self, local_path: UPathStr, print_progress: bool = True, **kwargs):
281
- """Download from self (a destination in the cloud) to the local path."""
282
- if "recursive" not in kwargs:
283
- kwargs["recursive"] = True
284
- if print_progress and "callback" not in kwargs:
285
- callback = ProgressCallback(
286
- PurePosixPath(local_path).name, "downloading", adjust_size=True
287
- )
288
- kwargs["callback"] = callback
289
-
290
- cloud_path_str = str(self)
291
- local_path_str = str(local_path)
292
- # needed due to https://github.com/fsspec/filesystem_spec/issues/1766
293
- # otherwise fsspec calls fs._ls_real where it reads the body and parses links
294
- # so the file is downloaded 2 times
295
- # upath doesn't call fs.ls to infer type, so it is safe to call
296
- if self.protocol in {"http", "https"} and self.stat().as_info()["type"] == "file":
297
- self.fs.use_listings_cache = True
298
- self.fs.dircache[cloud_path_str] = []
299
-
300
- self.fs.download(cloud_path_str, local_path_str, **kwargs)
301
-
302
-
303
- def upload_from(
304
- self,
305
- local_path: UPathStr,
306
- create_folder: bool | None = None,
307
- print_progress: bool = True,
308
- **kwargs,
309
- ) -> UPath:
310
- """Upload from the local path to `self` (a destination in the cloud).
311
-
312
- If the local path is a directory, recursively upload its contents.
313
-
314
- Args:
315
- local_path: A local path of a file or directory.
316
- create_folder: Only applies if `local_path` is a directory and then
317
- defaults to `True`. If `True`, make a new folder in the destination
318
- using the directory name of `local_path`. If `False`, upload the
319
- contents of the directory to to the root-level of the destination.
320
- print_progress: Print progress.
321
-
322
- Returns:
323
- The destination path.
324
- """
325
- local_path = Path(local_path)
326
- local_path_is_dir = local_path.is_dir()
327
- if create_folder is None:
328
- create_folder = local_path_is_dir
329
- if create_folder and not local_path_is_dir:
330
- raise ValueError("create_folder can only be True if local_path is a directory")
331
-
332
- if print_progress and "callback" not in kwargs:
333
- callback = ProgressCallback(local_path.name, "uploading")
334
- kwargs["callback"] = callback
335
-
336
- source: str | list[str] = local_path.as_posix()
337
- destination: str | list[str] = self.as_posix()
338
- if local_path_is_dir:
339
- size: int = 0
340
- files: list[str] = []
341
- for file in (path for path in local_path.rglob("*") if path.is_file()):
342
- size += file.stat().st_size
343
- files.append(file.as_posix())
344
- # see https://github.com/fsspec/s3fs/issues/897
345
- # here we reduce batch_size for folders bigger than 8 GiB
346
- # to avoid the problem in the issue
347
- # the default batch size for this case is 128
348
- if "batch_size" not in kwargs and size >= 8 * 2**30:
349
- kwargs["batch_size"] = 64
350
-
351
- if not create_folder:
352
- source = files
353
- destination = fsspec.utils.other_paths(
354
- files, self.as_posix(), exists=False, flatten=False
355
- )
356
-
357
- # the below lines are to avoid s3fs triggering create_bucket in upload if
358
- # dirs are present, it allows to avoid the permission error
359
- if self.protocol == "s3" and local_path_is_dir and create_folder:
360
- bucket = self.drive
361
- if bucket not in self.fs.dircache:
362
- self.fs.dircache[bucket] = [{}]
363
- assert isinstance(destination, str)
364
- if not destination.endswith(TRAILING_SEP): # type: ignore
365
- destination += "/"
366
- cleanup_cache = True
367
- else:
368
- cleanup_cache = False
369
- else:
370
- cleanup_cache = False
371
-
372
- self.fs.upload(source, destination, recursive=create_folder, **kwargs)
373
-
374
- if cleanup_cache:
375
- # normally this is invalidated after the upload but still better to check
376
- if bucket in self.fs.dircache:
377
- del self.fs.dircache[bucket]
378
-
379
- if local_path_is_dir and create_folder:
380
- return self / local_path.name
381
- else:
382
- return self
383
-
384
-
385
- def synchronize_to(
386
- origin: UPath,
387
- destination: Path,
388
- error_no_origin: bool = True,
389
- print_progress: bool = False,
390
- just_check: bool = False,
391
- **kwargs,
392
- ) -> bool:
393
- """Sync to a local destination path."""
394
- destination = destination.resolve()
395
- protocol = origin.protocol
396
- try:
397
- cloud_info = origin.stat().as_info()
398
- exists = True
399
- is_dir = cloud_info["type"] == "directory"
400
- except FileNotFoundError:
401
- exists = False
402
-
403
- if not exists:
404
- warn_or_error = f"The original path {origin} does not exist anymore."
405
- if destination.exists():
406
- warn_or_error += (
407
- f"\nHowever, the local path {destination} still exists, you might want"
408
- " to reupload the object back."
409
- )
410
- logger.warning(warn_or_error)
411
- elif error_no_origin:
412
- warn_or_error += "\nIt is not possible to synchronize."
413
- raise FileNotFoundError(warn_or_error)
414
- return False
415
-
416
- use_size: bool = False
417
- # use casting to int to avoid problems when the local filesystem
418
- # discards fractional parts of timestamps
419
- if protocol == "s3":
420
- get_modified = lambda file_stat: int(file_stat["LastModified"].timestamp())
421
- elif protocol == "gs":
422
- get_modified = lambda file_stat: int(file_stat["mtime"].timestamp())
423
- elif protocol == "hf":
424
- get_modified = lambda file_stat: int(file_stat["last_commit"].date.timestamp())
425
- else: # http etc
426
- use_size = True
427
- get_modified = lambda file_stat: file_stat["size"]
428
-
429
- if use_size:
430
- is_sync_needed = lambda cloud_size, local_stat: cloud_size != local_stat.st_size
431
- else:
432
- # no need to cast local_stat.st_mtime to int
433
- # because if it has the fractional part and cloud_mtime doesn't
434
- # and they have the same integer part then cloud_mtime can't be bigger
435
- is_sync_needed = (
436
- lambda cloud_mtime, local_stat: cloud_mtime > local_stat.st_mtime
437
- )
438
-
439
- local_paths: list[Path] = []
440
- cloud_stats: dict[str, int]
441
- if is_dir:
442
- cloud_stats = {
443
- file: get_modified(stat)
444
- for file, stat in origin.fs.find(origin.as_posix(), detail=True).items()
445
- }
446
- for cloud_path in cloud_stats:
447
- file_key = PurePosixPath(cloud_path).relative_to(origin.path).as_posix()
448
- local_paths.append(destination / file_key)
449
- else:
450
- cloud_stats = {origin.path: get_modified(cloud_info)}
451
- local_paths.append(destination)
452
-
453
- local_paths_all: dict[Path, os.stat_result] = {}
454
- if destination.exists():
455
- if is_dir:
456
- local_paths_all = {
457
- path: path.stat() for path in destination.rglob("*") if path.is_file()
458
- }
459
- if not use_size:
460
- # cast to int to remove the fractional parts
461
- # there is a problem when a fractional part is allowed on one filesystem
462
- # but not on the other
463
- # so just normalize both to int
464
- cloud_mts_max: int = max(cloud_stats.values())
465
- local_mts_max: int = int(
466
- max(stat.st_mtime for stat in local_paths_all.values())
467
- )
468
- if local_mts_max > cloud_mts_max:
469
- return False
470
- elif local_mts_max == cloud_mts_max:
471
- if len(local_paths_all) == len(cloud_stats):
472
- return False
473
- elif just_check:
474
- return True
475
- else:
476
- local_paths_all = {destination: destination.stat()}
477
-
478
- cloud_files_sync = []
479
- local_files_sync = []
480
- for i, (cloud_file, cloud_stat) in enumerate(cloud_stats.items()):
481
- local_path = local_paths[i]
482
- if local_path not in local_paths_all or is_sync_needed(
483
- cloud_stat, local_paths_all[local_path]
484
- ):
485
- cloud_files_sync.append(cloud_file)
486
- local_files_sync.append(local_path.as_posix())
487
- else:
488
- cloud_files_sync = list(cloud_stats.keys())
489
- local_files_sync = [local_path.as_posix() for local_path in local_paths]
490
-
491
- if cloud_files_sync:
492
- if just_check:
493
- return True
494
-
495
- callback = ProgressCallback.requires_progress(
496
- maybe_callback=kwargs.pop("callback", None),
497
- print_progress=print_progress,
498
- objectname=destination.name,
499
- action="synchronizing",
500
- adjust_size=False,
501
- )
502
- origin.fs.download(
503
- cloud_files_sync,
504
- local_files_sync,
505
- recursive=False,
506
- callback=callback,
507
- **kwargs,
508
- )
509
- if not use_size:
510
- for i, cloud_file in enumerate(cloud_files_sync):
511
- cloud_mtime = cloud_stats[cloud_file]
512
- os.utime(local_files_sync[i], times=(cloud_mtime, cloud_mtime))
513
- else:
514
- return False
515
-
516
- if is_dir and local_paths_all:
517
- for path in (path for path in local_paths_all if path not in local_paths):
518
- path.unlink()
519
- parent = path.parent
520
- if next(parent.iterdir(), None) is None:
521
- parent.rmdir()
522
-
523
- return True
524
-
525
-
526
- def modified(self) -> datetime | None:
527
- """Return modified time stamp."""
528
- mtime = self.fs.modified(str(self))
529
- if mtime.tzinfo is None:
530
- mtime = mtime.replace(tzinfo=timezone.utc)
531
- return mtime.astimezone().replace(tzinfo=None)
532
-
533
-
534
- def compute_file_tree(
535
- path: UPath,
536
- *,
537
- level: int = -1,
538
- only_dirs: bool = False,
539
- n_max_files_per_dir_and_type: int = 100,
540
- n_max_files: int = 1000,
541
- include_paths: set[Any] | None = None,
542
- skip_suffixes: list[str] | None = None,
543
- ) -> tuple[str, int]:
544
- # .exists() helps to separate files from folders for gcsfs
545
- # otherwise sometimes it has is_dir() True and is_file() True
546
- if path.protocol == "gs" and not path.exists():
547
- raise FileNotFoundError
548
-
549
- space = " "
550
- branch = ""
551
- tee = "├── "
552
- last = "└── "
553
- if skip_suffixes is None:
554
- skip_suffixes_tuple = ()
555
- else:
556
- skip_suffixes_tuple = tuple(skip_suffixes) # type: ignore
557
- n_files = 0
558
- n_directories = 0
559
-
560
- # by default only including registered files
561
- # need a flag and a proper implementation
562
- suffixes = set()
563
- include_dirs = set()
564
- if include_paths is not None:
565
- include_dirs = {d for p in include_paths for d in p.parents}
566
- else:
567
- include_paths = set()
568
-
569
- def inner(dir_path: Path, prefix: str = "", level: int = -1):
570
- nonlocal n_files, n_directories, suffixes
571
- if level == 0:
572
- return
573
- stripped_dir_path = dir_path.as_posix().rstrip("/")
574
- # do not iterate through zarr directories
575
- if stripped_dir_path.endswith(skip_suffixes_tuple):
576
- return
577
- # this is needed so that the passed folder is not listed
578
- contents = [
579
- i
580
- for i in dir_path.iterdir()
581
- if i.as_posix().rstrip("/") != stripped_dir_path
582
- ]
583
- if only_dirs:
584
- contents = [d for d in contents if d.is_dir()]
585
- pointers = [tee] * (len(contents) - 1) + [last]
586
- n_files_per_dir_and_type = defaultdict(lambda: 0) # type: ignore
587
- # TODO: pass strict=False to zip with python > 3.9
588
- for pointer, child_path in zip(pointers, contents, strict=False): # type: ignore
589
- if child_path.is_dir():
590
- if include_dirs and child_path not in include_dirs:
591
- continue
592
- yield prefix + pointer + child_path.name + "/"
593
- n_directories += 1
594
- n_files_per_dir_and_type = defaultdict(lambda: 0)
595
- extension = branch if pointer == tee else space
596
- yield from inner(child_path, prefix=prefix + extension, level=level - 1)
597
- elif not only_dirs:
598
- if include_paths and child_path not in include_paths:
599
- continue
600
- suffix = extract_suffix_from_path(child_path)
601
- suffixes.add(suffix)
602
- n_files_per_dir_and_type[suffix] += 1
603
- n_files += 1
604
- if n_files_per_dir_and_type[suffix] == n_max_files_per_dir_and_type:
605
- yield prefix + "..."
606
- elif n_files_per_dir_and_type[suffix] > n_max_files_per_dir_and_type:
607
- continue
608
- else:
609
- yield prefix + pointer + child_path.name
610
-
611
- folder_tree = ""
612
- iterator = inner(path, level=level)
613
- for line in islice(iterator, n_max_files):
614
- folder_tree += f"\n{line}"
615
- if next(iterator, None):
616
- folder_tree += f"\n... only showing {n_max_files} out of {n_files} files"
617
- directory_info = "directory" if n_directories == 1 else "directories"
618
- display_suffixes = ", ".join([f"{suffix!r}" for suffix in suffixes])
619
- suffix_message = f" with suffixes {display_suffixes}" if n_files > 0 else ""
620
- message = (
621
- f"{n_directories} sub-{directory_info} &"
622
- f" {n_files} files{suffix_message}\n{path.resolve()}{folder_tree}"
623
- )
624
- return message, n_files
625
-
626
-
627
- # adapted from: https://stackoverflow.com/questions/9727673
628
- def view_tree(
629
- path: Path,
630
- *,
631
- level: int = 2,
632
- only_dirs: bool = False,
633
- n_max_files_per_dir_and_type: int = 100,
634
- n_max_files: int = 1000,
635
- include_paths: set[Any] | None = None,
636
- skip_suffixes: list[str] | None = None,
637
- ) -> None:
638
- """Print a visual tree structure of files & directories.
639
-
640
- Args:
641
- level: If `1`, only iterate through one level, if `2` iterate through 2
642
- levels, if `-1` iterate through entire hierarchy.
643
- only_dirs: Only iterate through directories.
644
- n_max_files: Display limit. Will only show this many files. Doesn't affect count.
645
- include_paths: Restrict to these paths.
646
- skip_suffixes: Skip directories with these suffixes.
647
-
648
- Examples:
649
- >>> dir_path = ln.core.datasets.generate_cell_ranger_files(
650
- >>> "sample_001", ln.settings.storage
651
- >>> )
652
- >>> ln.UPath(dir_path).view_tree()
653
- 3 subdirectories, 15 files
654
- sample_001
655
- ├── web_summary.html
656
- ├── metrics_summary.csv
657
- ├── molecule_info.h5
658
- ├── filtered_feature_bc_matrix
659
- ├── features.tsv.gz
660
- ├── barcodes.tsv.gz
661
- │ └── matrix.mtx.gz
662
- ├── analysis
663
- └── analysis.csv
664
- ├── raw_feature_bc_matrix
665
- ├── features.tsv.gz
666
- ├── barcodes.tsv.gz
667
- │ └── matrix.mtx.gz
668
- ├── possorted_genome_bam.bam.bai
669
- ├── cloupe.cloupe
670
- ├── possorted_genome_bam.bam
671
- ├── filtered_feature_bc_matrix.h5
672
- └── raw_feature_bc_matrix.h5
673
- """
674
- message, _ = compute_file_tree(
675
- path,
676
- level=level,
677
- only_dirs=only_dirs,
678
- n_max_files=n_max_files,
679
- n_max_files_per_dir_and_type=n_max_files_per_dir_and_type,
680
- include_paths=include_paths,
681
- skip_suffixes=skip_suffixes,
682
- )
683
- logger.print(message)
684
-
685
-
686
- def to_url(upath):
687
- """Public storage URL.
688
-
689
- Generates a public URL for an object in an S3 bucket using fsspec's UPath,
690
- considering the bucket's region.
691
-
692
- Args:
693
- - upath: A UPath object representing an S3 path.
694
-
695
- Returns:
696
- - A string containing the public URL to the S3 object.
697
- """
698
- if upath.protocol != "s3":
699
- raise ValueError("The provided UPath must be an S3 path.")
700
- key = "/".join(upath.parts[1:])
701
- bucket = upath.drive
702
- region = get_storage_region(upath)
703
- if region == "us-east-1":
704
- return f"https://{bucket}.s3.amazonaws.com/{key}"
705
- else:
706
- return f"https://{bucket}.s3-{region}.amazonaws.com/{key}"
707
-
708
-
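A minimal usage sketch of to_url(); the bucket and key are placeholders, and because the region lookup goes through get_storage_region(), real calls need S3 access. The function is also attached as UPath.to_url further down in the module.

from lamindb_setup.core.upath import UPath, to_url

# placeholder bucket/key; to_url() raises ValueError for non-S3 paths
upath = UPath("s3://my-bucket/reports/summary.pdf")
print(to_url(upath))          # equivalent: upath.to_url()
# us-east-1 buckets map to https://<bucket>.s3.amazonaws.com/<key>,
# other regions to https://<bucket>.s3-<region>.amazonaws.com/<key>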
709
- # Why aren't we subclassing?
710
- #
711
- # The problem is that UPath defines a type system of paths
712
- # Its __new__ method returns instances of different subclasses rather than a
713
- # UPath object
714
- # If we create a custom subclass naively, subclasses of the parent UPath won't
715
- # be subclasses of our custom subclass
716
- # This makes life really hard in type checks involving local to cloud
717
- # comparisons, etc.
718
- # Hence, we extend the existing UPath and amend the docs
719
- # Some of this might end up in the original UPath implementation over time,
720
- # we'll see.
721
-
722
-
723
- # add custom functions
724
- UPath.modified = property(modified)
725
- UPath.synchronize = deprecated("synchronize_to")(synchronize_to)
726
- UPath.synchronize_to = synchronize_to
727
- UPath.upload_from = upload_from
728
- UPath.to_url = to_url
729
- UPath.download_to = download_to
730
- UPath.view_tree = view_tree
731
- # unfortunately, we also have to do this for the subclasses
732
- Path.view_tree = view_tree # type: ignore
733
-
734
- UPath.glob.__doc__ = Path.glob.__doc__
735
- UPath.rglob.__doc__ = Path.rglob.__doc__
736
- UPath.stat.__doc__ = Path.stat.__doc__
737
- UPath.iterdir.__doc__ = Path.iterdir.__doc__
738
- UPath.resolve.__doc__ = Path.resolve.__doc__
739
- UPath.relative_to.__doc__ = Path.relative_to.__doc__
740
- UPath.exists.__doc__ = Path.exists.__doc__
741
- UPath.is_dir.__doc__ = Path.is_dir.__doc__
742
- UPath.is_file.__doc__ = Path.is_file.__doc__
743
- UPath.unlink.__doc__ = Path.unlink.__doc__
744
- UPath.rename.__doc__ = """Move file, see fsspec.AbstractFileSystem.mv.
745
-
746
- >>> upath = Upath("s3://my-bucket/my-file")
747
- >>> upath.rename(UPath("s3://my-bucket/my-file-renamed"))
748
- >>> upath.rename("my-file-renamed")
749
-
750
- >>> upath = Upath("local-folder/my-file")
751
- >>> upath.rename("local-folder/my-file-renamed")
752
- """
753
- UPath.__doc__ = """Paths: low-level key-value access to files/objects.
754
-
755
- Paths are based on keys that offer the typical access patterns of file systems
756
- and object stores.
757
-
758
- >>> upath = UPath("s3://my-bucket/my-folder")
759
- >>> upath.exists()
760
-
761
- Args:
762
- pathlike: A string or Path to a local/cloud file/directory/folder.
763
- """
764
-
765
- logger.debug("upath.UPath has been patched")
766
-
767
- # suppress the warning from upath about hf (huggingface) filesystem
768
- # not being explicitly implemented in upath
769
- warnings.filterwarnings(
770
- "ignore", module="upath", message=".*'hf' filesystem not explicitly implemented.*"
771
- )
772
-
773
-
774
- # split query params from path string
775
- def _split_path_query(url: str) -> tuple[str, dict]:
776
- split_result = urlsplit(url)
777
- query = parse_qs(split_result.query)
778
- path = split_result._replace(query="").geturl()
779
- return path, query
780
-
781
-
782
- class S3QueryPath(S3Path):
783
- @classmethod
784
- def _transform_init_args(cls, args, protocol, storage_options):
785
- args, protocol, storage_options = super()._transform_init_args(
786
- args, protocol, storage_options
787
- )
788
- arg0 = args[0]
789
- path, query = _split_path_query(str(arg0))
790
- for param, param_values in query.items():
791
- if len(param_values) > 1:
792
- raise ValueError(f"Multiple values for {param} query parameter")
793
- else:
794
- param_value = param_values[0]
795
- if param in storage_options and param_value != storage_options[param]:
796
- raise ValueError(
797
- f"Incompatible {param} in query and storage_options"
798
- )
799
- storage_options.setdefault(param, param_value)
800
- if hasattr(arg0, "storage_options"):
801
- storage_options = {**arg0.storage_options, **storage_options}
802
-
803
- return (path, *args[1:]), protocol, storage_options
804
-
805
-
806
- register_implementation("s3", S3QueryPath, clobber=True)
807
-
808
-
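With S3QueryPath registered for the s3 protocol, query parameters embedded in an s3 URL are lifted into storage_options. A small sketch, using a placeholder MinIO-style endpoint:

from lamindb_setup.core.upath import UPath

upath = UPath("s3://my-bucket/data/file.parquet?endpoint_url=http://localhost:9000")
print(upath.storage_options["endpoint_url"])   # http://localhost:9000
# the query string is stripped from the stored path; supplying a conflicting
# value for the same option via storage_options raises a ValueError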
809
- def get_storage_region(path: UPathStr) -> str | None:
810
- upath = UPath(path)
811
-
812
- if upath.protocol != "s3":
813
- return None
814
-
815
- bucket = upath.drive
816
-
817
- if bucket == "scverse-spatial-eu-central-1":
818
- return "eu-central-1"
819
- elif f"s3://{bucket}" in HOSTED_BUCKETS:
820
- return bucket.replace("lamin-", "")
821
-
822
- from botocore.exceptions import ClientError
823
-
824
- if isinstance(path, str):
825
- import botocore.session
826
- from botocore.config import Config
827
-
828
- path_part = path.replace("s3://", "")
829
- # check for endpoint_url in the path string
830
- if "?" in path_part:
831
- path_part, query = _split_path_query(path_part)
832
- endpoint_url = query.get("endpoint_url", [None])[0]
833
- else:
834
- endpoint_url = None
835
- session = botocore.session.get_session()
836
- credentials = session.get_credentials()
837
- if credentials is None or credentials.access_key is None:
838
- config = Config(signature_version=botocore.session.UNSIGNED)
839
- else:
840
- config = None
841
- s3_client = session.create_client(
842
- "s3", endpoint_url=endpoint_url, config=config
843
- )
844
- try:
845
- response = s3_client.head_bucket(Bucket=bucket)
846
- except ClientError as exc:
847
- response = getattr(exc, "response", {})
848
- if response.get("Error", {}).get("Code") == "404":
849
- raise exc
850
- else:
851
- upath = get_aws_options_manager()._path_inject_options(upath, {})
852
- try:
853
- response = upath.fs.call_s3("head_bucket", Bucket=bucket)
854
- except Exception as exc:
855
- cause = getattr(exc, "__cause__", None)
856
- if not isinstance(cause, ClientError):
857
- raise exc
858
- response = getattr(cause, "response", {})
859
- if response.get("Error", {}).get("Code") == "404":
860
- raise exc
861
-
862
- region = (
863
- response.get("ResponseMetadata", {})
864
- .get("HTTPHeaders", {})
865
- .get("x-amz-bucket-region", None)
866
- )
867
- return region
868
-
869
-
870
- def create_path(path: UPathStr, access_token: str | None = None) -> UPath:
871
- upath = UPath(path).expanduser()
872
-
873
- if upath.protocol == "s3":
874
- # add managed credentials and other options for AWS s3 paths
875
- return get_aws_options_manager().enrich_path(upath, access_token)
876
-
877
- if upath.protocol in {"http", "https"}:
878
- # this is needed because by default aiohttp drops a connection after 5 min
879
- # so it is impossible to download large files
880
- storage_options = {}
881
- client_kwargs = upath.storage_options.get("client_kwargs", {})
882
- if "timeout" not in client_kwargs:
883
- from aiohttp import ClientTimeout
884
-
885
- client_kwargs = {
886
- **client_kwargs,
887
- "timeout": ClientTimeout(sock_connect=30, sock_read=30),
888
- }
889
- storage_options["client_kwargs"] = client_kwargs
890
- # see download_to for the reason
891
- if "use_listings_cache" not in upath.storage_options:
892
- storage_options["use_listings_cache"] = True
893
- if len(storage_options) > 0:
894
- return UPath(upath, **storage_options)
895
- return upath
896
-
897
-
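A hedged sketch of what create_path() adds for http(s) URLs (the URL is a placeholder and aiohttp is assumed to be installed); s3 paths are instead routed through get_aws_options_manager().enrich_path():

from lamindb_setup.core.upath import create_path

http_path = create_path("https://example.com/large-file.zip")
# a longer aiohttp timeout so large downloads don't get cut off after 5 min
print(http_path.storage_options["client_kwargs"]["timeout"])
# listings cache enabled, see the note in download_to()
print(http_path.storage_options["use_listings_cache"])   # True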
898
- def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
899
- size = stat["size"]
900
- hash, hash_type = None, None
901
- # gs, use md5Hash instead of etag for now
902
- if "md5Hash" in stat:
903
- # gs hash is already in base64
904
- hash = stat["md5Hash"].strip('"=')
905
- hash_type = "md5"
906
- # hf
907
- elif "blob_id" in stat:
908
- hash = b16_to_b64(stat["blob_id"])
909
- hash_type = "sha1"
910
- # s3
911
- # StorageClass is checked to be sure that it is indeed s3
912
- # because http also has ETag
913
- elif "ETag" in stat:
914
- etag = stat["ETag"]
915
- if "mimetype" in stat:
916
- # http
917
- hash = hash_string(etag.strip('"'))
918
- hash_type = "md5-etag"
919
- else:
920
- # s3
921
- # small files
922
- if "-" not in etag:
923
- # only store hash for non-multipart uploads
924
- # we can't rapidly validate multi-part uploaded files client-side
925
- # we can add more logic later down-the-road
926
- hash = b16_to_b64(etag)
927
- hash_type = "md5"
928
- else:
929
- stripped_etag, suffix = etag.split("-")
930
- suffix = suffix.strip('"')
931
- hash = b16_to_b64(stripped_etag)
932
- hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
933
- if hash is not None:
934
- hash = hash[:HASH_LENGTH]
935
- return size, hash, hash_type
936
-
937
-
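A small sketch of the ETag-to-hash mapping in get_stat_file_cloud(); the stat dicts are hand-written stand-ins for what UPath(...).stat().as_info() returns on S3 (quotes around the ETags omitted for brevity):

from lamindb_setup.core.upath import get_stat_file_cloud

# single-part upload: the ETag is a plain MD5, reported as hash_type "md5"
size, hash_, hash_type = get_stat_file_cloud(
    {"size": 1024, "ETag": "9E107D9D372BB6826BD81D3542A419D6"}
)
print(size, hash_type)   # 1024 md5

# multipart upload: "<hex-digest>-<n_parts>" becomes hash_type "md5-<n_parts>"
_, _, hash_type = get_stat_file_cloud(
    {"size": 20 * 2**20, "ETag": "D41D8CD98F00B204E9800998ECF8427E-3"}
)
print(hash_type)         # md5-3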
938
- def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
939
- objects = path.fs.find(path.as_posix(), detail=True)
940
- hash, hash_type = None, None
941
- compute_list_hash = True
942
- if path.protocol == "s3":
943
- accessor = "ETag"
944
- elif path.protocol == "gs":
945
- accessor = "md5Hash"
946
- elif path.protocol == "hf":
947
- accessor = "blob_id"
948
- else:
949
- compute_list_hash = False
950
- sizes = []
951
- hashes = []
952
- for object in objects.values():
953
- sizes.append(object["size"])
954
- if compute_list_hash:
955
- hashes.append(object[accessor].strip('"='))
956
- size = sum(sizes)
957
- n_files = len(sizes)
958
- if compute_list_hash:
959
- hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
960
- return size, hash, hash_type, n_files
961
-
962
-
963
- # is as fast as boto3: https://lamin.ai/laminlabs/lamin-site-assets/transform/krGp3hT1f78N5zKv
964
- def check_storage_is_empty(
965
- root: UPathStr, *, raise_error: bool = True, account_for_sqlite_file: bool = False
966
- ) -> int:
967
- from ._settings_storage import STORAGE_UID_FILE_KEY
968
-
969
- root_upath = UPath(root)
970
- root_string = root_upath.as_posix() # type: ignore
971
- n_offset_objects = 1 # because of storage_uid.txt file, see mark_storage_root()
972
- # if the storage_uid.txt was somehow deleted, we restore a dummy version of it
973
- # because we need it to count files in an empty directory on S3 (otherwise permission error)
974
- if not (root_upath / STORAGE_UID_FILE_KEY).exists():
975
- try:
976
- (root_upath / STORAGE_UID_FILE_KEY).write_text(
977
- "was deleted, restored during delete"
978
- )
979
- except FileNotFoundError:
980
- # this can happen if the root is a local non-existing path
981
- pass
982
- if account_for_sqlite_file:
983
- n_offset_objects += 1 # the SQLite file is in the ".lamindb" directory
984
- if root_string.startswith(HOSTED_BUCKETS):
985
- # in hosted buckets, count across entire root
986
- directory_string = root_string
987
- else:
988
- # in any other storage location, only count in .lamindb
989
- if not root_string.endswith("/"):
990
- root_string += "/"
991
- directory_string = root_string + ".lamindb"
992
- objects = root_upath.fs.find(directory_string)
993
- n_files = len(objects)
994
- n_diff = n_files - n_offset_objects
995
- ask_for_deletion = (
996
- "delete them prior to deleting the storage location"
997
- if raise_error
998
- else "consider deleting them"
999
- )
1000
- message = (
1001
- f"'{directory_string}' contains {n_files - n_offset_objects} objects"
1002
- f" - {ask_for_deletion}"
1003
- )
1004
- if n_diff > 0:
1005
- if raise_error:
1006
- raise StorageNotEmpty(message) from None
1007
- else:
1008
- logger.warning(message)
1009
- return n_diff
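A usage sketch of check_storage_is_empty() with a placeholder root: it counts objects (under .lamindb for non-hosted locations, across the whole root for hosted lamin- buckets), offsets the storage_uid.txt marker, and either warns or raises StorageNotEmpty.

from lamindb_setup.core.upath import check_storage_is_empty
from lamindb_setup.errors import StorageNotEmpty

try:
    n_extra = check_storage_is_empty("s3://my-bucket/my-instance")
    print(f"storage location is empty ({n_extra} extra objects)")
except StorageNotEmpty as exc:
    print(f"refusing to delete: {exc}")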
1
+ # we are not documenting UPath here because it's documented at lamindb.UPath
2
+ """Paths & file systems."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import os
7
+ import warnings
8
+ from collections import defaultdict
9
+ from datetime import datetime, timezone
10
+ from functools import partial
11
+ from itertools import islice
12
+ from pathlib import Path, PosixPath, PurePosixPath, WindowsPath
13
+ from typing import TYPE_CHECKING, Any, Literal
14
+ from urllib.parse import parse_qs, urlsplit
15
+
16
+ import fsspec
17
+ from lamin_utils import logger
18
+ from upath import UPath
19
+ from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
20
+ from upath.implementations.local import LocalPath
21
+ from upath.registry import register_implementation
22
+
23
+ from lamindb_setup.errors import StorageNotEmpty
24
+
25
+ from ._aws_options import HOSTED_BUCKETS, get_aws_options_manager
26
+ from ._deprecated import deprecated
27
+ from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list, hash_string
28
+
29
+ if TYPE_CHECKING:
30
+ from lamindb_setup.types import UPathStr
31
+
32
+ LocalPathClasses = (PosixPath, WindowsPath, LocalPath)
33
+
34
+ # also see https://gist.github.com/securifera/e7eed730cbe1ce43d0c29d7cd2d582f4
35
+ # ".gz" is not listed here as it typically occurs with another suffix
36
+ # the complete list is at lamindb.core.storage._suffixes
37
+ VALID_SIMPLE_SUFFIXES = {
38
+ #
39
+ # without readers
40
+ #
41
+ ".fasta",
42
+ ".fastq",
43
+ ".jpg",
44
+ ".mtx",
45
+ ".obo",
46
+ ".pdf",
47
+ ".png",
48
+ ".tar",
49
+ ".tiff",
50
+ ".txt",
51
+ ".tsv",
52
+ ".zip",
53
+ ".xml",
54
+ ".qs", # https://cran.r-project.org/web/packages/qs/vignettes/vignette.html
55
+ ".rds",
56
+ ".pt",
57
+ ".pth",
58
+ ".ckpt",
59
+ ".state_dict",
60
+ ".keras",
61
+ ".pb",
62
+ ".pbtxt",
63
+ ".savedmodel",
64
+ ".pkl",
65
+ ".pickle",
66
+ ".bin",
67
+ ".safetensors",
68
+ ".model",
69
+ ".mlmodel",
70
+ ".mar",
71
+ #
72
+ # with readers (see below)
73
+ #
74
+ ".h5ad",
75
+ ".parquet",
76
+ ".csv",
77
+ ".fcs",
78
+ ".xslx",
79
+ ".zarr",
80
+ ".json",
81
+ }
82
+ # below gets updated within lamindb because it's frequently changing
83
+ VALID_COMPOSITE_SUFFIXES = {".anndata.zarr"}
84
+
85
+ TRAILING_SEP = (os.sep, os.altsep) if os.altsep is not None else os.sep
86
+
87
+
88
+ def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
89
+ def process_digits(suffix: str):
90
+ if suffix[1:].isdigit(): # :1 to skip the dot
91
+ return "" # digits are no valid suffixes
92
+ else:
93
+ return suffix
94
+
95
+ if len(path.suffixes) <= 1:
96
+ return process_digits(path.suffix)
97
+
98
+ total_suffix = "".join(path.suffixes)
99
+ if total_suffix in VALID_SIMPLE_SUFFIXES:
100
+ return total_suffix
101
+ elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
102
+ # below seems slow but OK for now
103
+ for suffix in VALID_COMPOSITE_SUFFIXES:
104
+ if total_suffix.endswith(suffix):
105
+ break
106
+ return suffix
107
+ else:
108
+ print_hint = True
109
+ arg_name = "file" if arg_name is None else arg_name # for the warning
110
+ msg = f"{arg_name} has more than one suffix (path.suffixes), "
111
+ # first check the 2nd-to-last suffix because it might be followed by .gz
112
+ # or another compression-related suffix
113
+ # Alex thought about adding logic along the lines of path.suffixes[-1]
114
+ # in COMPRESSION_SUFFIXES to detect something like .random.gz and then
115
+ # add ".random.gz" but concluded it's too dangerous it's safer to just
116
+ # use ".gz" in such a case
117
+ if path.suffixes[-2] in VALID_SIMPLE_SUFFIXES:
118
+ suffix = "".join(path.suffixes[-2:])
119
+ msg += f"inferring: '{suffix}'"
120
+ # do not print a warning for things like .tar.gz, .fastq.gz
121
+ if path.suffixes[-1] == ".gz":
122
+ print_hint = False
123
+ else:
124
+ suffix = path.suffixes[-1] # this is equivalent to path.suffix
125
+ msg += (
126
+ f"using only last suffix: '{suffix}' - if you want your composite"
127
+ " suffix to be recognized add it to"
128
+ " lamindb.core.storage.VALID_SIMPLE_SUFFIXES.add()"
129
+ )
130
+ if print_hint:
131
+ logger.hint(msg)
132
+ return process_digits(suffix)
133
+
134
+
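A hedged sketch of how the suffix logic above resolves a few representative paths (plain pathlib paths, no storage access needed):

from pathlib import PurePosixPath
from lamindb_setup.core.upath import extract_suffix_from_path

assert extract_suffix_from_path(PurePosixPath("data/matrix.h5ad")) == ".h5ad"
# composite suffixes listed in VALID_COMPOSITE_SUFFIXES are kept whole
assert extract_suffix_from_path(PurePosixPath("store/obj.anndata.zarr")) == ".anndata.zarr"
# a valid 2nd-to-last suffix followed by .gz is inferred as the pair
assert extract_suffix_from_path(PurePosixPath("reads/sample.fastq.gz")) == ".fastq.gz"
# purely numeric "suffixes" are discarded
assert extract_suffix_from_path(PurePosixPath("archive/backup.001")) == ""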
135
+ def infer_filesystem(path: UPathStr):
136
+ import fsspec # improve cold start
137
+
138
+ path_str = str(path)
139
+
140
+ if isinstance(path, UPath):
141
+ fs = path.fs
142
+ else:
143
+ protocol = fsspec.utils.get_protocol(path_str)
144
+ if protocol == "s3":
145
+ fs_kwargs = {"cache_regions": True}
146
+ else:
147
+ fs_kwargs = {}
148
+ fs = fsspec.filesystem(protocol, **fs_kwargs)
149
+
150
+ return fs, path_str
151
+
152
+
153
+ # this is needed to avoid CreateBucket permission
154
+ class S3FSMap(fsspec.FSMap):
155
+ def __setitem__(self, key, value):
156
+ """Store value in key."""
157
+ key = self._key_to_str(key)
158
+ self.fs.pipe_file(key, fsspec.mapping.maybe_convert(value))
159
+
160
+
161
+ def create_mapper(
162
+ fs,
163
+ url="",
164
+ check=False,
165
+ create=False,
166
+ missing_exceptions=None,
167
+ ):
168
+ if fsspec.utils.get_protocol(url) == "s3":
169
+ return S3FSMap(
170
+ url, fs, check=check, create=False, missing_exceptions=missing_exceptions
171
+ )
172
+ else:
173
+ return fsspec.FSMap(
174
+ url, fs, check=check, create=create, missing_exceptions=missing_exceptions
175
+ )
176
+
177
+
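A small local sketch of infer_filesystem() plus create_mapper(); for s3:// URLs the same call returns the S3FSMap above, which pipes writes directly and therefore never triggers CreateBucket. The /tmp path is a placeholder.

from lamindb_setup.core.upath import create_mapper, infer_filesystem

fs, path_str = infer_filesystem("/tmp/example-store")  # local -> fsspec "file" filesystem
mapper = create_mapper(fs, path_str, create=True)      # plain fsspec.FSMap for non-s3 protocols
mapper[".zattrs"] = b"{}"                              # key-value writes go through the mapper
print(sorted(mapper))                                  # ['.zattrs']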
178
+ def print_hook(size: int, value: int, objectname: str, action: str):
179
+ if size == 0:
180
+ progress_in_percent = 100.0
181
+ else:
182
+ progress_in_percent = (value / size) * 100
183
+ out = f"... {action} {objectname}:" f" {min(progress_in_percent, 100):4.1f}%"
184
+ if "NBPRJ_TEST_NBPATH" not in os.environ:
185
+ end = "\n" if progress_in_percent >= 100 else "\r"
186
+ print(out, end=end)
187
+
188
+
189
+ class ProgressCallback(fsspec.callbacks.Callback):
190
+ def __init__(
191
+ self,
192
+ objectname: str,
193
+ action: Literal["uploading", "downloading", "synchronizing"],
194
+ adjust_size: bool = False,
195
+ ):
196
+ assert action in {"uploading", "downloading", "synchronizing"}
197
+
198
+ super().__init__()
199
+
200
+ self.action = action
201
+ print_progress = partial(print_hook, objectname=objectname, action=action)
202
+ self.hooks = {"print_progress": print_progress}
203
+
204
+ self.adjust_size = adjust_size
205
+
206
+ def absolute_update(self, value):
207
+ pass
208
+
209
+ def relative_update(self, inc=1):
210
+ pass
211
+
212
+ def update_relative_value(self, inc=1):
213
+ if inc != 0:
214
+ self.value += inc
215
+ self.call()
216
+ else:
217
+ # this is specific to http filesystem
218
+ # for some reason the last update is 0 always
219
+ # sometimes the reported result is less that 100%
220
+ # here 100% is forced manually in this case
221
+ if self.value < 1.0 and self.value >= 0.999:
222
+ self.value = self.size
223
+ self.call()
224
+
225
+ def branch(self, path_1, path_2, kwargs):
226
+ if self.adjust_size:
227
+ if Path(path_2 if self.action != "uploading" else path_1).is_dir():
228
+ self.size -= 1
229
+ kwargs["callback"] = ChildProgressCallback(self)
230
+
231
+ def branched(self, path_1, path_2, **kwargs):
232
+ self.branch(path_1, path_2, kwargs)
233
+ return kwargs["callback"]
234
+
235
+ def wrap(self, iterable):
236
+ if self.adjust_size:
237
+ paths = []
238
+ for lpath, rpath in iterable:
239
+ paths.append((lpath, rpath))
240
+ if Path(lpath).is_dir():
241
+ self.size -= 1
242
+ self.adjust_size = False
243
+ return paths
244
+ else:
245
+ return iterable
246
+
247
+ @classmethod
248
+ def requires_progress(
249
+ cls,
250
+ maybe_callback: fsspec.callbacks.Callback | None,
251
+ print_progress: bool,
252
+ objectname: str,
253
+ action: Literal["uploading", "downloading", "synchronizing"],
254
+ **kwargs,
255
+ ):
256
+ if maybe_callback is None:
257
+ if print_progress:
258
+ return cls(objectname, action, **kwargs)
259
+ else:
260
+ return fsspec.callbacks.NoOpCallback()
261
+ return maybe_callback
262
+
263
+
264
+ class ChildProgressCallback(fsspec.callbacks.Callback):
265
+ def __init__(self, parent: ProgressCallback):
266
+ super().__init__()
267
+
268
+ self.parent = parent
269
+
270
+ def parent_update(self, inc=1):
271
+ self.parent.update_relative_value(inc)
272
+
273
+ def relative_update(self, inc=1):
274
+ if self.size != 0:
275
+ self.parent_update(inc / self.size)
276
+ else:
277
+ self.parent_update(1)
278
+
279
+
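The callback classes above are wired together via requires_progress(), which only builds a ProgressCallback when printing is requested and no callback was supplied; a brief sketch:

import fsspec
from lamindb_setup.core.upath import ProgressCallback

cb = ProgressCallback.requires_progress(
    maybe_callback=None, print_progress=True,
    objectname="matrix.h5ad", action="downloading",
)
assert isinstance(cb, ProgressCallback)

noop = ProgressCallback.requires_progress(
    maybe_callback=None, print_progress=False,
    objectname="matrix.h5ad", action="downloading",
)
assert isinstance(noop, fsspec.callbacks.NoOpCallback)

own = fsspec.callbacks.Callback()
assert ProgressCallback.requires_progress(
    maybe_callback=own, print_progress=True,
    objectname="matrix.h5ad", action="downloading",
) is own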
280
+ def download_to(self, local_path: UPathStr, print_progress: bool = True, **kwargs):
281
+ """Download from self (a destination in the cloud) to the local path."""
282
+ if "recursive" not in kwargs:
283
+ kwargs["recursive"] = True
284
+ if print_progress and "callback" not in kwargs:
285
+ callback = ProgressCallback(
286
+ PurePosixPath(local_path).name, "downloading", adjust_size=True
287
+ )
288
+ kwargs["callback"] = callback
289
+
290
+ cloud_path_str = str(self)
291
+ local_path_str = str(local_path)
292
+ # needed due to https://github.com/fsspec/filesystem_spec/issues/1766
293
+ # otherwise fsspec calls fs._ls_real where it reads the body and parses links
294
+ # so the file is downloaded 2 times
295
+ # upath doesn't call fs.ls to infer type, so it is safe to call
296
+ if self.protocol in {"http", "https"} and self.stat().as_info()["type"] == "file":
297
+ self.fs.use_listings_cache = True
298
+ self.fs.dircache[cloud_path_str] = []
299
+
300
+ self.fs.download(cloud_path_str, local_path_str, **kwargs)
301
+
302
+
303
+ def upload_from(
304
+ self,
305
+ local_path: UPathStr,
306
+ create_folder: bool | None = None,
307
+ print_progress: bool = True,
308
+ **kwargs,
309
+ ) -> UPath:
310
+ """Upload from the local path to `self` (a destination in the cloud).
311
+
312
+ If the local path is a directory, recursively upload its contents.
313
+
314
+ Args:
315
+ local_path: A local path of a file or directory.
316
+ create_folder: Only applies if `local_path` is a directory and then
317
+ defaults to `True`. If `True`, make a new folder in the destination
318
+ using the directory name of `local_path`. If `False`, upload the
319
+ contents of the directory to to the root-level of the destination.
320
+ print_progress: Print progress.
321
+
322
+ Returns:
323
+ The destination path.
324
+ """
325
+ local_path = Path(local_path)
326
+ local_path_is_dir = local_path.is_dir()
327
+ if create_folder is None:
328
+ create_folder = local_path_is_dir
329
+ if create_folder and not local_path_is_dir:
330
+ raise ValueError("create_folder can only be True if local_path is a directory")
331
+
332
+ if print_progress and "callback" not in kwargs:
333
+ callback = ProgressCallback(local_path.name, "uploading")
334
+ kwargs["callback"] = callback
335
+
336
+ source: str | list[str] = local_path.as_posix()
337
+ destination: str | list[str] = self.as_posix()
338
+ if local_path_is_dir:
339
+ size: int = 0
340
+ files: list[str] = []
341
+ for file in (path for path in local_path.rglob("*") if path.is_file()):
342
+ size += file.stat().st_size
343
+ files.append(file.as_posix())
344
+ # see https://github.com/fsspec/s3fs/issues/897
345
+ # here we reduce batch_size for folders bigger than 8 GiB
346
+ # to avoid the problem in the issue
347
+ # the default batch size for this case is 128
348
+ if "batch_size" not in kwargs and size >= 8 * 2**30:
349
+ kwargs["batch_size"] = 64
350
+
351
+ if not create_folder:
352
+ source = files
353
+ destination = fsspec.utils.other_paths(
354
+ files, self.as_posix(), exists=False, flatten=False
355
+ )
356
+
357
+ # the below lines are to avoid s3fs triggering create_bucket in upload if
358
+ # dirs are present, it allows to avoid the permission error
359
+ if self.protocol == "s3" and local_path_is_dir and create_folder:
360
+ bucket = self.drive
361
+ if bucket not in self.fs.dircache:
362
+ self.fs.dircache[bucket] = [{}]
363
+ assert isinstance(destination, str)
364
+ if not destination.endswith(TRAILING_SEP): # type: ignore
365
+ destination += "/"
366
+ cleanup_cache = True
367
+ else:
368
+ cleanup_cache = False
369
+ else:
370
+ cleanup_cache = False
371
+
372
+ self.fs.upload(source, destination, recursive=create_folder, **kwargs)
373
+
374
+ if cleanup_cache:
375
+ # normally this is invalidated after the upload but still better to check
376
+ if bucket in self.fs.dircache:
377
+ del self.fs.dircache[bucket]
378
+
379
+ if local_path_is_dir and create_folder:
380
+ return self / local_path.name
381
+ else:
382
+ return self
383
+
384
+
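A hedged usage sketch of the two transfer helpers; the bucket and local paths are placeholders, and both functions are attached to UPath later in this module.

from lamindb_setup.core.upath import create_path

folder = create_path("s3://my-bucket/my-folder")   # placeholder bucket
folder.download_to("/tmp/my-folder")               # recursive download with progress

# uploading a directory creates s3://my-bucket/my-folder/results/ by default;
# create_folder=False spreads its contents directly into my-folder/ instead
dest = folder.upload_from("/tmp/results")
print(dest)  # s3://my-bucket/my-folder/results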
385
+ def synchronize_to(
386
+ origin: UPath,
387
+ destination: Path,
388
+ error_no_origin: bool = True,
389
+ print_progress: bool = False,
390
+ just_check: bool = False,
391
+ **kwargs,
392
+ ) -> bool:
393
+ """Sync to a local destination path."""
394
+ destination = destination.resolve()
395
+ protocol = origin.protocol
396
+ stat_kwargs = {"expand_info": True} if protocol == "hf" else {}
397
+ origin_str = str(origin)
398
+ try:
399
+ cloud_info = origin.fs.stat(origin_str, **stat_kwargs)
400
+ exists = True
401
+ is_dir = cloud_info["type"] == "directory"
402
+ except FileNotFoundError:
403
+ exists = False
404
+
405
+ if not exists:
406
+ warn_or_error = f"The original path {origin} does not exist anymore."
407
+ if destination.exists():
408
+ warn_or_error += (
409
+ f"\nHowever, the local path {destination} still exists, you might want"
410
+ " to reupload the object back."
411
+ )
412
+ logger.warning(warn_or_error)
413
+ elif error_no_origin:
414
+ warn_or_error += "\nIt is not possible to synchronize."
415
+ raise FileNotFoundError(warn_or_error)
416
+ return False
417
+
418
+ use_size: bool = False
419
+ # use casting to int to avoid problems when the local filesystem
420
+ # discards fractional parts of timestamps
421
+ if protocol == "s3":
422
+ get_modified = lambda file_stat: int(file_stat["LastModified"].timestamp())
423
+ elif protocol == "gs":
424
+ get_modified = lambda file_stat: int(file_stat["mtime"].timestamp())
425
+ elif protocol == "hf":
426
+ get_modified = lambda file_stat: int(file_stat["last_commit"].date.timestamp())
427
+ else: # http etc
428
+ use_size = True
429
+ get_modified = lambda file_stat: file_stat["size"]
430
+
431
+ if use_size:
432
+ is_sync_needed = lambda cloud_size, local_stat: cloud_size != local_stat.st_size
433
+ else:
434
+ # no need to cast local_stat.st_mtime to int
435
+ # because if it has the fractional part and cloud_mtime doesn't
436
+ # and they have the same integer part then cloud_mtime can't be bigger
437
+ is_sync_needed = (
438
+ lambda cloud_mtime, local_stat: cloud_mtime > local_stat.st_mtime
439
+ )
440
+
441
+ local_paths: list[Path] = []
442
+ cloud_stats: dict[str, int]
443
+ if is_dir:
444
+ cloud_stats = {
445
+ file: get_modified(stat)
446
+ for file, stat in origin.fs.find(
447
+ origin_str, detail=True, **stat_kwargs
448
+ ).items()
449
+ }
450
+ for cloud_path in cloud_stats:
451
+ file_key = PurePosixPath(cloud_path).relative_to(origin.path).as_posix()
452
+ local_paths.append(destination / file_key)
453
+ else:
454
+ cloud_stats = {origin.path: get_modified(cloud_info)}
455
+ local_paths.append(destination)
456
+
457
+ local_paths_all: dict[Path, os.stat_result] = {}
458
+ if destination.exists():
459
+ if is_dir:
460
+ local_paths_all = {
461
+ path: path.stat() for path in destination.rglob("*") if path.is_file()
462
+ }
463
+ if not use_size:
464
+ # cast to int to remove the fractional parts
465
+ # there is a problem when a fractional part is allowed on one filesystem
466
+ # but not on the other
467
+ # so just normalize both to int
468
+ cloud_mts_max: int = max(cloud_stats.values())
469
+ local_mts_max: int = int(
470
+ max(stat.st_mtime for stat in local_paths_all.values())
471
+ )
472
+ if local_mts_max > cloud_mts_max:
473
+ return False
474
+ elif local_mts_max == cloud_mts_max:
475
+ if len(local_paths_all) == len(cloud_stats):
476
+ return False
477
+ elif just_check:
478
+ return True
479
+ else:
480
+ local_paths_all = {destination: destination.stat()}
481
+
482
+ cloud_files_sync = []
483
+ local_files_sync = []
484
+ for i, (cloud_file, cloud_stat) in enumerate(cloud_stats.items()):
485
+ local_path = local_paths[i]
486
+ if local_path not in local_paths_all or is_sync_needed(
487
+ cloud_stat, local_paths_all[local_path]
488
+ ):
489
+ cloud_files_sync.append(cloud_file)
490
+ local_files_sync.append(local_path.as_posix())
491
+ else:
492
+ cloud_files_sync = list(cloud_stats.keys())
493
+ local_files_sync = [local_path.as_posix() for local_path in local_paths]
494
+
495
+ if cloud_files_sync:
496
+ if just_check:
497
+ return True
498
+
499
+ callback = ProgressCallback.requires_progress(
500
+ maybe_callback=kwargs.pop("callback", None),
501
+ print_progress=print_progress,
502
+ objectname=destination.name,
503
+ action="synchronizing",
504
+ adjust_size=False,
505
+ )
506
+ origin.fs.download(
507
+ cloud_files_sync,
508
+ local_files_sync,
509
+ recursive=False,
510
+ callback=callback,
511
+ **kwargs,
512
+ )
513
+ if not use_size:
514
+ for i, cloud_file in enumerate(cloud_files_sync):
515
+ cloud_mtime = cloud_stats[cloud_file]
516
+ os.utime(local_files_sync[i], times=(cloud_mtime, cloud_mtime))
517
+ else:
518
+ return False
519
+
520
+ if is_dir and local_paths_all:
521
+ for path in (path for path in local_paths_all if path not in local_paths):
522
+ path.unlink()
523
+ parent = path.parent
524
+ if next(parent.iterdir(), None) is None:
525
+ parent.rmdir()
526
+
527
+ return True
528
+
529
+
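This is the visible behavioral change in this hunk: synchronize_to now stats the origin through origin.fs.stat() and adds expand_info=True for Hugging Face (hf) paths, presumably so that last_commit is populated, forwarding the same stat_kwargs to fs.find() for directories. A hedged usage sketch with a placeholder hf repo (synchronize_to is attached to UPath near the end of the module):

from pathlib import Path
from lamindb_setup.core.upath import UPath

origin = UPath("hf://datasets/some-org/some-dataset/data/train.parquet")
destination = Path("/tmp/cache/train.parquet")

# just_check=True answers "would a download happen?" without transferring anything
if origin.synchronize_to(destination, just_check=True):
    origin.synchronize_to(destination, print_progress=True)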
530
+ def modified(self) -> datetime | None:
531
+ """Return modified time stamp."""
532
+ mtime = self.fs.modified(str(self))
533
+ if mtime.tzinfo is None:
534
+ mtime = mtime.replace(tzinfo=timezone.utc)
535
+ return mtime.astimezone().replace(tzinfo=None)
536
+
537
+
538
+ def compute_file_tree(
539
+ path: UPath,
540
+ *,
541
+ level: int = -1,
542
+ only_dirs: bool = False,
543
+ n_max_files_per_dir_and_type: int = 100,
544
+ n_max_files: int = 1000,
545
+ include_paths: set[Any] | None = None,
546
+ skip_suffixes: list[str] | None = None,
547
+ ) -> tuple[str, int]:
548
+ # .exists() helps to separate files from folders for gcsfs
549
+ # otherwise sometimes it has is_dir() True and is_file() True
550
+ if path.protocol == "gs" and not path.exists():
551
+ raise FileNotFoundError
552
+
553
+ space = " "
554
+ branch = "│ "
555
+ tee = "├── "
556
+ last = "└── "
557
+ if skip_suffixes is None:
558
+ skip_suffixes_tuple = ()
559
+ else:
560
+ skip_suffixes_tuple = tuple(skip_suffixes) # type: ignore
561
+ n_files = 0
562
+ n_directories = 0
563
+
564
+ # by default only including registered files
565
+ # need a flag and a proper implementation
566
+ suffixes = set()
567
+ include_dirs = set()
568
+ if include_paths is not None:
569
+ include_dirs = {d for p in include_paths for d in p.parents}
570
+ else:
571
+ include_paths = set()
572
+
573
+ def inner(dir_path: Path, prefix: str = "", level: int = -1):
574
+ nonlocal n_files, n_directories, suffixes
575
+ if level == 0:
576
+ return
577
+ stripped_dir_path = dir_path.as_posix().rstrip("/")
578
+ # do not iterate through zarr directories
579
+ if stripped_dir_path.endswith(skip_suffixes_tuple):
580
+ return
581
+ # this is needed so that the passed folder is not listed
582
+ contents = [
583
+ i
584
+ for i in dir_path.iterdir()
585
+ if i.as_posix().rstrip("/") != stripped_dir_path
586
+ ]
587
+ if only_dirs:
588
+ contents = [d for d in contents if d.is_dir()]
589
+ pointers = [tee] * (len(contents) - 1) + [last]
590
+ n_files_per_dir_and_type = defaultdict(lambda: 0) # type: ignore
591
+ # TODO: pass strict=False to zip with python > 3.9
592
+ for pointer, child_path in zip(pointers, contents, strict=False): # type: ignore
593
+ if child_path.is_dir():
594
+ if include_dirs and child_path not in include_dirs:
595
+ continue
596
+ yield prefix + pointer + child_path.name + "/"
597
+ n_directories += 1
598
+ n_files_per_dir_and_type = defaultdict(lambda: 0)
599
+ extension = branch if pointer == tee else space
600
+ yield from inner(child_path, prefix=prefix + extension, level=level - 1)
601
+ elif not only_dirs:
602
+ if include_paths and child_path not in include_paths:
603
+ continue
604
+ suffix = extract_suffix_from_path(child_path)
605
+ suffixes.add(suffix)
606
+ n_files_per_dir_and_type[suffix] += 1
607
+ n_files += 1
608
+ if n_files_per_dir_and_type[suffix] == n_max_files_per_dir_and_type:
609
+ yield prefix + "..."
610
+ elif n_files_per_dir_and_type[suffix] > n_max_files_per_dir_and_type:
611
+ continue
612
+ else:
613
+ yield prefix + pointer + child_path.name
614
+
615
+ folder_tree = ""
616
+ iterator = inner(path, level=level)
617
+ for line in islice(iterator, n_max_files):
618
+ folder_tree += f"\n{line}"
619
+ if next(iterator, None):
620
+ folder_tree += f"\n... only showing {n_max_files} out of {n_files} files"
621
+ directory_info = "directory" if n_directories == 1 else "directories"
622
+ display_suffixes = ", ".join([f"{suffix!r}" for suffix in suffixes])
623
+ suffix_message = f" with suffixes {display_suffixes}" if n_files > 0 else ""
624
+ message = (
625
+ f"{n_directories} sub-{directory_info} &"
626
+ f" {n_files} files{suffix_message}\n{path.resolve()}{folder_tree}"
627
+ )
628
+ return message, n_files
629
+
630
+
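# Illustrative usage sketch for compute_file_tree on a throwaway local folder
# (runs offline); the directory layout below is made up for demonstration only.
import tempfile
from pathlib import Path

from lamindb_setup.core.upath import UPath, compute_file_tree

tmp = Path(tempfile.mkdtemp())
(tmp / "data").mkdir()
(tmp / "data" / "table.csv").write_text("a,b\n1,2\n")
(tmp / "readme.md").write_text("# demo\n")

message, n_files = compute_file_tree(UPath(tmp), level=-1)
print(message)  # tree rendering with sub-directory, file & suffix summary
print(n_files)  # 2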
631
+ # adapted from: https://stackoverflow.com/questions/9727673
632
+ def view_tree(
633
+ path: Path,
634
+ *,
635
+ level: int = 2,
636
+ only_dirs: bool = False,
637
+ n_max_files_per_dir_and_type: int = 100,
638
+ n_max_files: int = 1000,
639
+ include_paths: set[Any] | None = None,
640
+ skip_suffixes: list[str] | None = None,
641
+ ) -> None:
642
+ """Print a visual tree structure of files & directories.
643
+
644
+ Args:
645
+ level: If `1`, only iterate through one level; if `2`, iterate through 2
646
+ levels; if `-1`, iterate through the entire hierarchy.
647
+ only_dirs: Only iterate through directories.
648
+ n_max_files: Display limit. Will only show this many files. Doesn't affect count.
649
+ include_paths: Restrict to these paths.
650
+ skip_suffixes: Skip directories with these suffixes.
651
+
652
+ Examples:
653
+ >>> dir_path = ln.core.datasets.generate_cell_ranger_files(
654
+ >>> "sample_001", ln.settings.storage
655
+ >>> )
656
+ >>> ln.UPath(dir_path).view_tree()
657
+ 3 subdirectories, 15 files
658
+ sample_001
659
+ ├── web_summary.html
660
+ ├── metrics_summary.csv
661
+ ├── molecule_info.h5
662
+ ├── filtered_feature_bc_matrix
663
+ │   ├── features.tsv.gz
664
+ │   ├── barcodes.tsv.gz
665
+ │   └── matrix.mtx.gz
666
+ ├── analysis
667
+ │   └── analysis.csv
668
+ ├── raw_feature_bc_matrix
669
+ │   ├── features.tsv.gz
670
+ │   ├── barcodes.tsv.gz
671
+ │   └── matrix.mtx.gz
672
+ ├── possorted_genome_bam.bam.bai
673
+ ├── cloupe.cloupe
674
+ ├── possorted_genome_bam.bam
675
+ ├── filtered_feature_bc_matrix.h5
676
+ └── raw_feature_bc_matrix.h5
677
+ """
678
+ message, _ = compute_file_tree(
679
+ path,
680
+ level=level,
681
+ only_dirs=only_dirs,
682
+ n_max_files=n_max_files,
683
+ n_max_files_per_dir_and_type=n_max_files_per_dir_and_type,
684
+ include_paths=include_paths,
685
+ skip_suffixes=skip_suffixes,
686
+ )
687
+ logger.print(message)
688
+
689
+
690
+ def to_url(upath):
691
+ """Public storage URL.
692
+
693
+ Generates a public URL for an object in an S3 bucket using fsspec's UPath,
694
+ considering the bucket's region.
695
+
696
+ Args:
697
+ - upath: A UPath object representing an S3 path.
698
+
699
+ Returns:
700
+ - A string containing the public URL to the S3 object.
701
+ """
702
+ if upath.protocol != "s3":
703
+ raise ValueError("The provided UPath must be an S3 path.")
704
+ key = "/".join(upath.parts[1:])
705
+ bucket = upath.drive
706
+ region = get_storage_region(upath)
707
+ if region == "us-east-1":
708
+ return f"https://{bucket}.s3.amazonaws.com/{key}"
709
+ else:
710
+ return f"https://{bucket}.s3-{region}.amazonaws.com/{key}"
711
+
712
+
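# Illustrative usage sketch for to_url (also attached as UPath.to_url below); it
# needs network access to resolve the bucket region via get_storage_region, and
# the bucket/key here are placeholders.
from lamindb_setup.core.upath import UPath, to_url

url = to_url(UPath("s3://my-bucket/my-object"))
print(url)  # e.g. https://my-bucket.s3-<region>.amazonaws.com/my-object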
713
+ # Why aren't we subclassing?
714
+ #
715
+ # The problem is that UPath defines a type system of paths
716
+ # Its __new__ method returns instances of different subclasses rather than a
717
+ # UPath object
718
+ # If we create a custom subclass naively, subclasses of the parent UPath won't
719
+ # be subclasses of our custom subclass
720
+ # This makes life really hard in type checks involving local to cloud
721
+ # comparisons, etc.
722
+ # Hence, we extend the existing UPath and amend the docs
723
+ # Some of this might end up in the original UPath implementation over time,
724
+ # we'll see.
725
+
726
+
727
+ # add custom functions
728
+ UPath.modified = property(modified)
729
+ UPath.synchronize = deprecated("synchronize_to")(synchronize_to)
730
+ UPath.synchronize_to = synchronize_to
731
+ UPath.upload_from = upload_from
732
+ UPath.to_url = to_url
733
+ UPath.download_to = download_to
734
+ UPath.view_tree = view_tree
735
+ # unfortunately, we also have to do this for the subclasses
736
+ Path.view_tree = view_tree # type: ignore
737
+
738
+ UPath.glob.__doc__ = Path.glob.__doc__
739
+ UPath.rglob.__doc__ = Path.rglob.__doc__
740
+ UPath.stat.__doc__ = Path.stat.__doc__
741
+ UPath.iterdir.__doc__ = Path.iterdir.__doc__
742
+ UPath.resolve.__doc__ = Path.resolve.__doc__
743
+ UPath.relative_to.__doc__ = Path.relative_to.__doc__
744
+ UPath.exists.__doc__ = Path.exists.__doc__
745
+ UPath.is_dir.__doc__ = Path.is_dir.__doc__
746
+ UPath.is_file.__doc__ = Path.is_file.__doc__
747
+ UPath.unlink.__doc__ = Path.unlink.__doc__
748
+ UPath.rename.__doc__ = """Move file, see fsspec.AbstractFileSystem.mv.
749
+
750
+ >>> upath = Upath("s3://my-bucket/my-file")
751
+ >>> upath.rename(UPath("s3://my-bucket/my-file-renamed"))
752
+ >>> upath.rename("my-file-renamed")
753
+
754
+ >>> upath = Upath("local-folder/my-file")
755
+ >>> upath.rename("local-folder/my-file-renamed")
756
+ """
757
+ UPath.__doc__ = """Paths: low-level key-value access to files/objects.
758
+
759
+ Paths are based on keys that offer the typical access patterns of file systems
760
+ and object stores.
761
+
762
+ >>> upath = UPath("s3://my-bucket/my-folder")
763
+ >>> upath.exists()
764
+
765
+ Args:
766
+ pathlike: A string or Path to a local/cloud file/directory/folder.
767
+ """
768
+
769
+ logger.debug("upath.UPath has been patched")
770
+
771
+ # suppress the warning from upath about hf (huggingface) filesystem
772
+ # not being explicitly implemented in upath
773
+ warnings.filterwarnings(
774
+ "ignore", module="upath", message=".*'hf' filesystem not explicitly implemented.*"
775
+ )
776
+
777
+
778
+ # split query params from path string
779
+ def _split_path_query(url: str) -> tuple[str, dict]:
780
+ split_result = urlsplit(url)
781
+ query = parse_qs(split_result.query)
782
+ path = split_result._replace(query="").geturl()
783
+ return path, query
784
+
785
+
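# Illustrative check of _split_path_query (runs offline): query parameters are
# parsed into a dict and stripped from the returned path string.
from lamindb_setup.core.upath import _split_path_query

path, query = _split_path_query("s3://my-bucket/key?endpoint_url=http://localhost:9000")
assert path == "s3://my-bucket/key"
assert query == {"endpoint_url": ["http://localhost:9000"]}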
786
+ class S3QueryPath(S3Path):
787
+ @classmethod
788
+ def _transform_init_args(cls, args, protocol, storage_options):
789
+ args, protocol, storage_options = super()._transform_init_args(
790
+ args, protocol, storage_options
791
+ )
792
+ arg0 = args[0]
793
+ path, query = _split_path_query(str(arg0))
794
+ for param, param_values in query.items():
795
+ if len(param_values) > 1:
796
+ raise ValueError(f"Multiple values for {param} query parameter")
797
+ else:
798
+ param_value = param_values[0]
799
+ if param in storage_options and param_value != storage_options[param]:
800
+ raise ValueError(
801
+ f"Incompatible {param} in query and storage_options"
802
+ )
803
+ storage_options.setdefault(param, param_value)
804
+ if hasattr(arg0, "storage_options"):
805
+ storage_options = {**arg0.storage_options, **storage_options}
806
+
807
+ return (path, *args[1:]), protocol, storage_options
808
+
809
+
810
+ register_implementation("s3", S3QueryPath, clobber=True)
811
+
812
+
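# Illustrative check of the query-parameter handling that the registration above
# enables (constructing the path runs offline and does not contact S3; s3fs must
# be installed to actually access the filesystem). The endpoint is a placeholder.
from lamindb_setup.core.upath import UPath

upath = UPath("s3://my-bucket/data.parquet?endpoint_url=http://localhost:9000")
print(upath)  # s3://my-bucket/data.parquet - the query is moved into storage_options
assert upath.storage_options["endpoint_url"] == "http://localhost:9000"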
813
+ def get_storage_region(path: UPathStr) -> str | None:
814
+ upath = UPath(path)
815
+
816
+ if upath.protocol != "s3":
817
+ return None
818
+
819
+ bucket = upath.drive
820
+
821
+ if bucket == "scverse-spatial-eu-central-1":
822
+ return "eu-central-1"
823
+ elif f"s3://{bucket}" in HOSTED_BUCKETS:
824
+ return bucket.replace("lamin-", "")
825
+
826
+ from botocore.exceptions import ClientError
827
+
828
+ if isinstance(path, str):
829
+ import botocore.session
830
+ from botocore.config import Config
831
+
832
+ path_part = path.replace("s3://", "")
833
+ # check for endpoint_url in the path string
834
+ if "?" in path_part:
835
+ path_part, query = _split_path_query(path_part)
836
+ endpoint_url = query.get("endpoint_url", [None])[0]
837
+ else:
838
+ endpoint_url = None
839
+ session = botocore.session.get_session()
840
+ credentials = session.get_credentials()
841
+ if credentials is None or credentials.access_key is None:
842
+ config = Config(signature_version=botocore.session.UNSIGNED)
843
+ else:
844
+ config = None
845
+ s3_client = session.create_client(
846
+ "s3", endpoint_url=endpoint_url, config=config
847
+ )
848
+ try:
849
+ response = s3_client.head_bucket(Bucket=bucket)
850
+ except ClientError as exc:
851
+ response = getattr(exc, "response", {})
852
+ if response.get("Error", {}).get("Code") == "404":
853
+ raise exc
854
+ else:
855
+ upath = get_aws_options_manager()._path_inject_options(upath, {})
856
+ try:
857
+ response = upath.fs.call_s3("head_bucket", Bucket=bucket)
858
+ except Exception as exc:
859
+ cause = getattr(exc, "__cause__", None)
860
+ if not isinstance(cause, ClientError):
861
+ raise exc
862
+ response = getattr(cause, "response", {})
863
+ if response.get("Error", {}).get("Code") == "404":
864
+ raise exc
865
+
866
+ region = (
867
+ response.get("ResponseMetadata", {})
868
+ .get("HTTPHeaders", {})
869
+ .get("x-amz-bucket-region", None)
870
+ )
871
+ return region
872
+
873
+
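# Illustrative usage sketch for get_storage_region: non-s3 paths return None
# without any lookup; for s3 the region is read from a head_bucket response,
# which requires network access. The bucket name is a placeholder.
from lamindb_setup.core.upath import get_storage_region

assert get_storage_region("/some/local/path") is None
print(get_storage_region("s3://my-bucket"))  # e.g. "us-east-1"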
874
+ def create_path(path: UPathStr, access_token: str | None = None) -> UPath:
875
+ upath = UPath(path).expanduser()
876
+
877
+ if upath.protocol == "s3":
878
+ # add managed credentials and other options for AWS s3 paths
879
+ return get_aws_options_manager().enrich_path(upath, access_token)
880
+
881
+ if upath.protocol in {"http", "https"}:
882
+ # this is needed because by default aiohttp drops a connection after 5 min
883
+ # so it is impossible to download large files
884
+ storage_options = {}
885
+ client_kwargs = upath.storage_options.get("client_kwargs", {})
886
+ if "timeout" not in client_kwargs:
887
+ from aiohttp import ClientTimeout
888
+
889
+ client_kwargs = {
890
+ **client_kwargs,
891
+ "timeout": ClientTimeout(sock_connect=30, sock_read=30),
892
+ }
893
+ storage_options["client_kwargs"] = client_kwargs
894
+ # see download_to for the reason
895
+ if "use_listings_cache" not in upath.storage_options:
896
+ storage_options["use_listings_cache"] = True
897
+ if len(storage_options) > 0:
898
+ return UPath(upath, **storage_options)
899
+ return upath
900
+
901
+
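# Illustrative check of create_path for an http(s) URL (constructing the path runs
# offline but needs aiohttp, which backs fsspec's http filesystem); the URL is a
# placeholder.
from lamindb_setup.core.upath import create_path

path = create_path("https://example.com/some-file.txt")
print(path.storage_options["use_listings_cache"])        # True
print(path.storage_options["client_kwargs"]["timeout"])  # ClientTimeout with sock_connect=30, sock_read=30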
902
+ def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
903
+ size = stat["size"]
904
+ hash, hash_type = None, None
905
+ # gs, use md5Hash instead of etag for now
906
+ if "md5Hash" in stat:
907
+ # gs hash is already in base64
908
+ hash = stat["md5Hash"].strip('"=')
909
+ hash_type = "md5"
910
+ # hf
911
+ elif "blob_id" in stat:
912
+ hash = b16_to_b64(stat["blob_id"])
913
+ hash_type = "sha1"
914
+ # s3
915
+ # both s3 and http responses carry an ETag; the "mimetype" key
916
+ # (present for http) is used below to tell them apart
917
+ elif "ETag" in stat:
918
+ etag = stat["ETag"]
919
+ if "mimetype" in stat:
920
+ # http
921
+ hash = hash_string(etag.strip('"'))
922
+ hash_type = "md5-etag"
923
+ else:
924
+ # s3
925
+ # small files
926
+ if "-" not in etag:
927
+ # only store hash for non-multipart uploads
928
+ # we can't rapidly validate multi-part uploaded files client-side
929
+ # we can add more logic later down-the-road
930
+ hash = b16_to_b64(etag)
931
+ hash_type = "md5"
932
+ else:
933
+ stripped_etag, suffix = etag.split("-")
934
+ suffix = suffix.strip('"')
935
+ hash = b16_to_b64(stripped_etag)
936
+ hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
937
+ if hash is not None:
938
+ hash = hash[:HASH_LENGTH]
939
+ return size, hash, hash_type
940
+
941
+
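# Illustrative check of get_stat_file_cloud with synthetic stat dicts that mimic
# s3fs metadata (runs offline); real stat dicts contain additional keys.
from lamindb_setup.core.upath import get_stat_file_cloud

# single-part S3 upload: the ETag is a plain MD5, so hash_type is "md5"
size, hash_, hash_type = get_stat_file_cloud(
    {"size": 11, "ETag": '"5eb63bbbe01eeed093cb22bb8f5acdc3"'}
)
print(size, hash_type)  # 11 md5

# multipart S3 upload: the ETag carries a "-<n_parts>" suffix, so hash_type is "md5-<n_parts>"
_, _, hash_type = get_stat_file_cloud(
    {"size": 10_000_000, "ETag": '"5eb63bbbe01eeed093cb22bb8f5acdc3-2"'}
)
print(hash_type)  # md5-2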
942
+ def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
943
+ objects = path.fs.find(path.as_posix(), detail=True)
944
+ hash, hash_type = None, None
945
+ compute_list_hash = True
946
+ if path.protocol == "s3":
947
+ accessor = "ETag"
948
+ elif path.protocol == "gs":
949
+ accessor = "md5Hash"
950
+ elif path.protocol == "hf":
951
+ accessor = "blob_id"
952
+ else:
953
+ compute_list_hash = False
954
+ sizes = []
955
+ hashes = []
956
+ for object in objects.values():
957
+ sizes.append(object["size"])
958
+ if compute_list_hash:
959
+ hashes.append(object[accessor].strip('"='))
960
+ size = sum(sizes)
961
+ n_files = len(sizes)
962
+ if compute_list_hash:
963
+ hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
964
+ return size, hash, hash_type, n_files
965
+
966
+
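# Illustrative usage sketch for get_stat_dir_cloud (needs network access and s3fs;
# the bucket/prefix are placeholders).
from lamindb_setup.core.upath import UPath, get_stat_dir_cloud

folder = UPath("s3://my-bucket/my-folder")
size, hash_, hash_type, n_files = get_stat_dir_cloud(folder)
print(size, hash_type, n_files)  # total bytes, "md5-d", number of objects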
967
+ # is as fast as boto3: https://lamin.ai/laminlabs/lamin-site-assets/transform/krGp3hT1f78N5zKv
968
+ def check_storage_is_empty(
969
+ root: UPathStr, *, raise_error: bool = True, account_for_sqlite_file: bool = False
970
+ ) -> int:
971
+ from ._settings_storage import STORAGE_UID_FILE_KEY
972
+
973
+ root_upath = UPath(root)
974
+ root_string = root_upath.as_posix() # type: ignore
975
+ n_offset_objects = 1 # because of storage_uid.txt file, see mark_storage_root()
976
+ # if the storage_uid.txt was somehow deleted, we restore a dummy version of it
977
+ # because we need it to count files in an empty directory on S3 (otherwise permission error)
978
+ if not (root_upath / STORAGE_UID_FILE_KEY).exists():
979
+ try:
980
+ (root_upath / STORAGE_UID_FILE_KEY).write_text(
981
+ "was deleted, restored during delete"
982
+ )
983
+ except FileNotFoundError:
984
+ # this can happen if the root is a local non-existing path
985
+ pass
986
+ if account_for_sqlite_file:
987
+ n_offset_objects += 1 # the SQLite file is in the ".lamindb" directory
988
+ if root_string.startswith(HOSTED_BUCKETS):
989
+ # in hosted buckets, count across entire root
990
+ directory_string = root_string
991
+ else:
992
+ # in any other storage location, only count in .lamindb
993
+ if not root_string.endswith("/"):
994
+ root_string += "/"
995
+ directory_string = root_string + ".lamindb"
996
+ objects = root_upath.fs.find(directory_string)
997
+ if account_for_sqlite_file:
998
+ # ignore exclusion dir for cloud sqlite
999
+ objects = [o for o in objects if "/.lamindb/_exclusion/" not in o]
1000
+ n_files = len(objects)
1001
+ n_diff = n_files - n_offset_objects
1002
+ ask_for_deletion = (
1003
+ "delete them prior to deleting the storage location"
1004
+ if raise_error
1005
+ else "consider deleting them"
1006
+ )
1007
+ message = f"'{directory_string}' contains {n_diff} objects" f" - {ask_for_deletion}"
1008
+ if n_diff > 0:
1009
+ if raise_error:
1010
+ raise StorageNotEmpty(message) from None
1011
+ else:
1012
+ logger.warning(message)
1013
+ return n_diff
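# Illustrative usage sketch for check_storage_is_empty on a local storage root
# (the path is a placeholder). With raise_error=False it only warns and returns
# the number of objects found under ".lamindb" beyond the marker file.
from lamindb_setup.core.upath import check_storage_is_empty

n_extra = check_storage_is_empty("/tmp/my-storage-root", raise_error=False)
print(n_extra)  # 0 if only the storage-uid marker file is present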