lamindb_setup 1.19.0__py3-none-any.whl → 1.19.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. lamindb_setup/__init__.py +1 -1
  2. lamindb_setup/_cache.py +87 -87
  3. lamindb_setup/_check.py +7 -7
  4. lamindb_setup/_check_setup.py +131 -131
  5. lamindb_setup/_connect_instance.py +443 -441
  6. lamindb_setup/_delete.py +155 -155
  7. lamindb_setup/_disconnect.py +38 -38
  8. lamindb_setup/_django.py +39 -39
  9. lamindb_setup/_entry_points.py +19 -19
  10. lamindb_setup/_init_instance.py +423 -423
  11. lamindb_setup/_migrate.py +331 -331
  12. lamindb_setup/_register_instance.py +32 -32
  13. lamindb_setup/_schema.py +27 -27
  14. lamindb_setup/_schema_metadata.py +451 -451
  15. lamindb_setup/_set_managed_storage.py +81 -81
  16. lamindb_setup/_setup_user.py +198 -198
  17. lamindb_setup/_silence_loggers.py +46 -46
  18. lamindb_setup/core/__init__.py +25 -34
  19. lamindb_setup/core/_aws_options.py +276 -276
  20. lamindb_setup/core/_aws_storage.py +57 -57
  21. lamindb_setup/core/_clone.py +50 -50
  22. lamindb_setup/core/_deprecated.py +62 -62
  23. lamindb_setup/core/_docs.py +14 -14
  24. lamindb_setup/core/_hub_client.py +288 -288
  25. lamindb_setup/core/_hub_crud.py +247 -247
  26. lamindb_setup/core/_hub_utils.py +100 -100
  27. lamindb_setup/core/_private_django_api.py +80 -80
  28. lamindb_setup/core/_settings.py +440 -434
  29. lamindb_setup/core/_settings_instance.py +22 -1
  30. lamindb_setup/core/_settings_load.py +162 -162
  31. lamindb_setup/core/_settings_save.py +108 -108
  32. lamindb_setup/core/_settings_storage.py +433 -433
  33. lamindb_setup/core/_settings_store.py +162 -162
  34. lamindb_setup/core/_settings_user.py +55 -55
  35. lamindb_setup/core/_setup_bionty_sources.py +44 -44
  36. lamindb_setup/core/cloud_sqlite_locker.py +240 -240
  37. lamindb_setup/core/django.py +414 -413
  38. lamindb_setup/core/exceptions.py +1 -1
  39. lamindb_setup/core/hashing.py +134 -134
  40. lamindb_setup/core/types.py +1 -1
  41. lamindb_setup/core/upath.py +1031 -1028
  42. lamindb_setup/errors.py +72 -72
  43. lamindb_setup/io.py +423 -423
  44. lamindb_setup/types.py +17 -17
  45. {lamindb_setup-1.19.0.dist-info → lamindb_setup-1.19.1.dist-info}/METADATA +3 -2
  46. lamindb_setup-1.19.1.dist-info/RECORD +51 -0
  47. {lamindb_setup-1.19.0.dist-info → lamindb_setup-1.19.1.dist-info}/WHEEL +1 -1
  48. {lamindb_setup-1.19.0.dist-info → lamindb_setup-1.19.1.dist-info/licenses}/LICENSE +201 -201
  49. lamindb_setup-1.19.0.dist-info/RECORD +0 -51
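The remainder of the page shows the per-file diff; the hunk below covers lamindb_setup/core/upath.py (+1031 −1028 in the list above). As a minimal sketch that is not part of either package version: assuming both wheels have been downloaded locally (e.g. with `pip download lamindb_setup==1.19.0 --no-deps` and likewise for 1.19.1; the filenames in the snippet are assumptions), a comparable file-level diff can be reproduced with the Python standard library, since wheels are plain zip archives:

import difflib
import zipfile

OLD_WHEEL = "lamindb_setup-1.19.0-py3-none-any.whl"  # assumed local filename
NEW_WHEEL = "lamindb_setup-1.19.1-py3-none-any.whl"  # assumed local filename


def read_member(wheel_path: str, member: str) -> list[str]:
    # read one file out of the wheel archive as a list of lines
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)


old_lines = read_member(OLD_WHEEL, "lamindb_setup/core/upath.py")
new_lines = read_member(NEW_WHEEL, "lamindb_setup/core/upath.py")

# unified_diff yields the familiar "@@ -start,count +start,count @@" hunks
for line in difflib.unified_diff(
    old_lines, new_lines, fromfile="1.19.0/upath.py", tofile="1.19.1/upath.py"
):
    print(line, end="")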
lamindb_setup/core/upath.py
@@ -1,1028 +1,1031 @@
1
- # we are not documenting UPath here because it's documented at lamindb.UPath
2
- """Paths & file systems."""
3
-
4
- from __future__ import annotations
5
-
6
- import math
7
- import os
8
- import warnings
9
- from collections import defaultdict
10
- from datetime import datetime, timezone
11
- from functools import partial
12
- from itertools import islice
13
- from pathlib import Path, PosixPath, PurePosixPath, WindowsPath
14
- from typing import TYPE_CHECKING, Any, Literal
15
- from urllib.parse import parse_qs, urlsplit
16
-
17
- import fsspec
18
- from lamin_utils import logger
19
- from upath import UPath
20
- from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
21
- from upath.implementations.local import LocalPath
22
- from upath.registry import register_implementation
23
-
24
- from lamindb_setup.errors import StorageNotEmpty
25
-
26
- from ._aws_options import HOSTED_BUCKETS, get_aws_options_manager
27
- from ._deprecated import deprecated
28
- from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list, hash_string
29
-
30
- if TYPE_CHECKING:
31
- from lamindb_setup.types import UPathStr
32
-
33
- LocalPathClasses = (PosixPath, WindowsPath, LocalPath)
34
-
35
- # also see https://gist.github.com/securifera/e7eed730cbe1ce43d0c29d7cd2d582f4
36
- # ".gz" is not listed here as it typically occurs with another suffix
37
- # the complete list is at lamindb.core.storage._suffixes
38
- VALID_SIMPLE_SUFFIXES = {
39
- #
40
- # without readers
41
- #
42
- ".fasta",
43
- ".fastq",
44
- ".jpg",
45
- ".mtx",
46
- ".obo",
47
- ".pdf",
48
- ".png",
49
- ".tar",
50
- ".tiff",
51
- ".txt",
52
- ".tsv",
53
- ".zip",
54
- ".xml",
55
- ".qs", # https://cran.r-project.org/web/packages/qs/vignettes/vignette.html
56
- ".rds",
57
- ".pt",
58
- ".pth",
59
- ".ckpt",
60
- ".state_dict",
61
- ".keras",
62
- ".pb",
63
- ".pbtxt",
64
- ".savedmodel",
65
- ".pkl",
66
- ".pickle",
67
- ".bin",
68
- ".safetensors",
69
- ".model",
70
- ".mlmodel",
71
- ".mar",
72
- #
73
- # with readers (see below)
74
- #
75
- ".h5ad",
76
- ".parquet",
77
- ".csv",
78
- ".fcs",
79
- ".xslx",
80
- ".zarr",
81
- ".json",
82
- }
83
- # below gets updated within lamindb because it's frequently changing
84
- VALID_COMPOSITE_SUFFIXES = {".anndata.zarr"}
85
-
86
- TRAILING_SEP = (os.sep, os.altsep) if os.altsep is not None else os.sep
87
-
88
-
89
- def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
90
- def process_digits(suffix: str):
91
- if suffix[1:].isdigit(): # :1 to skip the dot
92
- return "" # digits are no valid suffixes
93
- else:
94
- return suffix
95
-
96
- suffixes = path.suffixes
97
-
98
- if len(suffixes) <= 1:
99
- return process_digits(path.suffix)
100
-
101
- total_suffix = "".join(suffixes)
102
- if total_suffix in VALID_SIMPLE_SUFFIXES:
103
- return total_suffix
104
- elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
105
- # below seems slow but OK for now
106
- for suffix in VALID_COMPOSITE_SUFFIXES:
107
- if total_suffix.endswith(suffix):
108
- break
109
- return suffix
110
- else:
111
- print_hint = True
112
- arg_name = "file" if arg_name is None else arg_name # for the warning
113
- msg = f"{arg_name} has more than one suffix (path.suffixes), "
114
- # first check the 2nd-to-last suffix because it might be followed by .gz
115
- # or another compression-related suffix
116
- # Alex thought about adding logic along the lines of path.suffixes[-1]
117
- # in COMPRESSION_SUFFIXES to detect something like .random.gz and then
118
- # add ".random.gz" but concluded it's too dangerous it's safer to just
119
- # use ".gz" in such a case
120
- if suffixes[-2] in VALID_SIMPLE_SUFFIXES:
121
- suffix = "".join(suffixes[-2:])
122
- # if the suffix preceding the compression suffixes is a valid suffix,
123
- # we account for it; otherwise we don't.
124
- # i.e. we should have .h5ad.tar.gz or .csv.tar.gz, not just .tar.gz
125
- if (
126
- suffix == ".tar.gz"
127
- and len(suffixes) > 2
128
- and (suffix_3 := suffixes[-3]) in VALID_SIMPLE_SUFFIXES
129
- ):
130
- suffix = suffix_3 + suffix
131
- # do not print a warning for things like .tar.gz, .fastq.gz
132
- if suffixes[-1] == ".gz":
133
- print_hint = False
134
- else:
135
- msg += f"inferring: '{suffix}'"
136
- else:
137
- suffix = suffixes[-1] # this is equivalent to path.suffix
138
- msg += (
139
- f"using only last suffix: '{suffix}' - if you want your composite"
140
- " suffix to be recognized add it to"
141
- " lamindb.core.storage.VALID_SIMPLE_SUFFIXES.add()"
142
- )
143
- if print_hint:
144
- logger.hint(msg)
145
- return process_digits(suffix)
146
-
147
-
148
- def infer_filesystem(path: UPathStr):
149
- import fsspec # improve cold start
150
-
151
- path_str = str(path)
152
-
153
- if isinstance(path, UPath):
154
- fs = path.fs
155
- else:
156
- protocol = fsspec.utils.get_protocol(path_str)
157
- if protocol == "s3":
158
- fs_kwargs = {"cache_regions": True}
159
- else:
160
- fs_kwargs = {}
161
- fs = fsspec.filesystem(protocol, **fs_kwargs)
162
-
163
- return fs, path_str
164
-
165
-
166
- # this is needed to avoid CreateBucket permission
167
- class S3FSMap(fsspec.FSMap):
168
- def __setitem__(self, key, value):
169
- """Store value in key."""
170
- key = self._key_to_str(key)
171
- self.fs.pipe_file(key, fsspec.mapping.maybe_convert(value))
172
-
173
-
174
- def create_mapper(
175
- fs,
176
- url="",
177
- check=False,
178
- create=False,
179
- missing_exceptions=None,
180
- ):
181
- if fsspec.utils.get_protocol(url) == "s3":
182
- return S3FSMap(
183
- url, fs, check=check, create=False, missing_exceptions=missing_exceptions
184
- )
185
- else:
186
- return fsspec.FSMap(
187
- url, fs, check=check, create=create, missing_exceptions=missing_exceptions
188
- )
189
-
190
-
191
- def print_hook(size: int, value: int, objectname: str, action: str):
192
- if size == 0:
193
- progress_in_percent = 100.0
194
- else:
195
- progress_in_percent = (value / size) * 100
196
- out = f"... {action} {objectname}: {min(progress_in_percent, 100):4.1f}%"
197
- if "NBPRJ_TEST_NBPATH" not in os.environ:
198
- end = "\n" if progress_in_percent >= 100 else "\r"
199
- print(out, end=end)
200
-
201
-
202
- class ProgressCallback(fsspec.callbacks.Callback):
203
- def __init__(
204
- self,
205
- objectname: str,
206
- action: Literal["uploading", "downloading", "synchronizing"],
207
- adjust_size: bool = False,
208
- ):
209
- assert action in {"uploading", "downloading", "synchronizing"}
210
-
211
- super().__init__()
212
-
213
- self.action = action
214
- print_progress = partial(print_hook, objectname=objectname, action=action)
215
- self.hooks = {"print_progress": print_progress}
216
-
217
- self.adjust_size = adjust_size
218
-
219
- def absolute_update(self, value):
220
- pass
221
-
222
- def relative_update(self, inc=1):
223
- pass
224
-
225
- def update_relative_value(self, inc=1):
226
- if inc != 0:
227
- self.value += inc
228
- self.call()
229
- else:
230
- # this is specific to http filesystem
231
- # for some reason the last update is 0 always
232
- # sometimes the reported result is less than 100%
233
- # here 100% is forced manually in this case
234
- if self.value < 1.0 and self.value >= 0.999:
235
- self.value = self.size
236
- self.call()
237
-
238
- def branch(self, path_1, path_2, kwargs):
239
- if self.adjust_size:
240
- if Path(path_2 if self.action != "uploading" else path_1).is_dir():
241
- self.size -= 1
242
- kwargs["callback"] = ChildProgressCallback(self)
243
-
244
- def branched(self, path_1, path_2, **kwargs):
245
- self.branch(path_1, path_2, kwargs)
246
- return kwargs["callback"]
247
-
248
- def wrap(self, iterable):
249
- if self.adjust_size:
250
- paths = []
251
- for lpath, rpath in iterable:
252
- paths.append((lpath, rpath))
253
- if Path(lpath).is_dir():
254
- self.size -= 1
255
- self.adjust_size = False
256
- return paths
257
- else:
258
- return iterable
259
-
260
- @classmethod
261
- def requires_progress(
262
- cls,
263
- maybe_callback: fsspec.callbacks.Callback | None,
264
- print_progress: bool,
265
- objectname: str,
266
- action: Literal["uploading", "downloading", "synchronizing"],
267
- **kwargs,
268
- ):
269
- if maybe_callback is None:
270
- if print_progress:
271
- return cls(objectname, action, **kwargs)
272
- else:
273
- return fsspec.callbacks.NoOpCallback()
274
- return maybe_callback
275
-
276
-
277
- class ChildProgressCallback(fsspec.callbacks.Callback):
278
- def __init__(self, parent: ProgressCallback):
279
- super().__init__()
280
-
281
- self.parent = parent
282
-
283
- def parent_update(self, inc=1):
284
- self.parent.update_relative_value(inc)
285
-
286
- def relative_update(self, inc=1):
287
- if self.size != 0:
288
- self.parent_update(inc / self.size)
289
- else:
290
- self.parent_update(1)
291
-
292
-
293
- def download_to(self, local_path: UPathStr, print_progress: bool = True, **kwargs):
294
- """Download from self (a destination in the cloud) to the local path."""
295
- if "recursive" not in kwargs:
296
- kwargs["recursive"] = True
297
- if print_progress and "callback" not in kwargs:
298
- callback = ProgressCallback(
299
- PurePosixPath(local_path).name, "downloading", adjust_size=True
300
- )
301
- kwargs["callback"] = callback
302
-
303
- cloud_path_str = str(self)
304
- local_path_str = str(local_path)
305
- # needed due to https://github.com/fsspec/filesystem_spec/issues/1766
306
- # otherwise fsspec calls fs._ls_real where it reads the body and parses links
307
- # so the file is downloaded 2 times
308
- # upath doesn't call fs.ls to infer type, so it is safe to call
309
- if self.protocol in {"http", "https"} and self.stat().as_info()["type"] == "file":
310
- self.fs.use_listings_cache = True
311
- self.fs.dircache[cloud_path_str] = []
312
-
313
- self.fs.download(cloud_path_str, local_path_str, **kwargs)
314
-
315
-
316
- def upload_from(
317
- self,
318
- local_path: UPathStr,
319
- create_folder: bool | None = None,
320
- print_progress: bool = True,
321
- **kwargs,
322
- ) -> UPath:
323
- """Upload from the local path to `self` (a destination in the cloud).
324
-
325
- If the local path is a directory, recursively upload its contents.
326
-
327
- Args:
328
- local_path: A local path of a file or directory.
329
- create_folder: Only applies if `local_path` is a directory and then
330
- defaults to `True`. If `True`, make a new folder in the destination
331
- using the directory name of `local_path`. If `False`, upload the
332
- contents of the directory to the root-level of the destination.
333
- print_progress: Print progress.
334
-
335
- Returns:
336
- The destination path.
337
- """
338
- local_path = Path(local_path)
339
- local_path_is_dir = local_path.is_dir()
340
- if create_folder is None:
341
- create_folder = local_path_is_dir
342
- if create_folder and not local_path_is_dir:
343
- raise ValueError("create_folder can only be True if local_path is a directory")
344
-
345
- if print_progress and "callback" not in kwargs:
346
- callback = ProgressCallback(local_path.name, "uploading")
347
- kwargs["callback"] = callback
348
-
349
- protocol = self.protocol
350
- cleanup_cache = False
351
- source: str | list[str] = local_path.as_posix()
352
- destination: str | list[str] = self.as_posix()
353
- if local_path_is_dir:
354
- if not create_folder:
355
- source = [
356
- path.as_posix() for path in local_path.rglob("*") if path.is_file()
357
- ]
358
- destination = fsspec.utils.other_paths(
359
- source, self.as_posix(), exists=False, flatten=False
360
- )
361
- elif protocol == "s3" and (bucket := self.drive) not in self.fs.dircache:
362
- # the below lines are to avoid s3fs triggering create_bucket in upload if
363
- # dirs are present, which avoids the permission error
364
- self.fs.dircache[bucket] = [{}]
365
- assert isinstance(destination, str)
366
- if not destination.endswith(TRAILING_SEP):
367
- destination += "/"
368
- cleanup_cache = True
369
- elif protocol == "s3" and "chunksize" not in kwargs:
370
- size = local_path.stat().st_size
371
- MiB = 1024**2
372
- DEFAULT_CHUNKSIZE = 50 * MiB # as in s3fs
373
- if size / DEFAULT_CHUNKSIZE > 10000: # should be no more than 10k parts for s3
374
- raw = math.ceil(size / 10000)
375
- step = 5 * MiB
376
- rounded = math.ceil(raw / step) * step
377
- kwargs["chunksize"] = rounded
378
-
379
- self.fs.upload(source, destination, recursive=create_folder, **kwargs)
380
-
381
- if cleanup_cache:
382
- # normally this is invalidated after the upload but still better to check
383
- if bucket in self.fs.dircache:
384
- del self.fs.dircache[bucket]
385
-
386
- if local_path_is_dir and create_folder:
387
- return self / local_path.name
388
- else:
389
- return self
390
-
391
-
392
- def synchronize_to(
393
- origin: UPath,
394
- destination: Path,
395
- error_no_origin: bool = True,
396
- print_progress: bool = False,
397
- just_check: bool = False,
398
- **kwargs,
399
- ) -> bool:
400
- """Sync to a local destination path."""
401
- destination = destination.resolve()
402
- protocol = origin.protocol
403
- stat_kwargs = {"expand_info": True} if protocol == "hf" else {}
404
- origin_str = str(origin)
405
- try:
406
- cloud_info = origin.fs.stat(origin_str, **stat_kwargs)
407
- exists = True
408
- is_dir = cloud_info["type"] == "directory"
409
- except FileNotFoundError:
410
- exists = False
411
-
412
- if not exists:
413
- warn_or_error = f"The original path {origin} does not exist anymore."
414
- if destination.exists():
415
- warn_or_error += (
416
- f"\nHowever, the local path {destination} still exists, you might want"
417
- " to reupload the object back."
418
- )
419
- logger.warning(warn_or_error)
420
- elif error_no_origin:
421
- warn_or_error += "\nIt is not possible to synchronize."
422
- raise FileNotFoundError(warn_or_error)
423
- return False
424
-
425
- use_size: bool = False
426
- # use casting to int to avoid problems when the local filesystem
427
- # discards fractional parts of timestamps
428
- if protocol == "s3":
429
- get_modified = lambda file_stat: int(file_stat["LastModified"].timestamp())
430
- elif protocol == "gs":
431
- get_modified = lambda file_stat: int(file_stat["mtime"].timestamp())
432
- elif protocol == "hf":
433
- get_modified = lambda file_stat: int(file_stat["last_commit"].date.timestamp())
434
- else: # http etc
435
- use_size = True
436
- get_modified = lambda file_stat: file_stat["size"]
437
-
438
- if use_size:
439
- is_sync_needed = lambda cloud_size, local_stat: cloud_size != local_stat.st_size
440
- else:
441
- # no need to cast local_stat.st_mtime to int
442
- # because if it has the fractional part and cloud_mtime doesn't
443
- # and they have the same integer part then cloud_mtime can't be bigger
444
- is_sync_needed = (
445
- lambda cloud_mtime, local_stat: cloud_mtime > local_stat.st_mtime
446
- )
447
-
448
- local_paths: list[Path] = []
449
- cloud_stats: dict[str, int]
450
- if is_dir:
451
- cloud_stats = {
452
- file: get_modified(stat)
453
- for file, stat in origin.fs.find(
454
- origin_str, detail=True, **stat_kwargs
455
- ).items()
456
- }
457
- for cloud_path in cloud_stats:
458
- file_key = PurePosixPath(cloud_path).relative_to(origin.path).as_posix()
459
- local_paths.append(destination / file_key)
460
- else:
461
- cloud_stats = {origin.path: get_modified(cloud_info)}
462
- local_paths.append(destination)
463
-
464
- local_paths_all: dict[Path, os.stat_result] = {}
465
- if destination.exists():
466
- if is_dir:
467
- local_paths_all = {
468
- path: path.stat() for path in destination.rglob("*") if path.is_file()
469
- }
470
- if not use_size:
471
- # cast to int to remove the fractional parts
472
- # there is a problem when a fractional part is allowed on one filesystem
473
- # but not on the other
474
- # so just normalize both to int
475
- cloud_mts_max: int = max(cloud_stats.values())
476
- local_mts_max: int = int(
477
- max(stat.st_mtime for stat in local_paths_all.values())
478
- )
479
- if local_mts_max > cloud_mts_max:
480
- return False
481
- elif local_mts_max == cloud_mts_max:
482
- if len(local_paths_all) == len(cloud_stats):
483
- return False
484
- elif just_check:
485
- return True
486
- else:
487
- local_paths_all = {destination: destination.stat()}
488
-
489
- cloud_files_sync = []
490
- local_files_sync = []
491
- for i, (cloud_file, cloud_stat) in enumerate(cloud_stats.items()):
492
- local_path = local_paths[i]
493
- if local_path not in local_paths_all or is_sync_needed(
494
- cloud_stat, local_paths_all[local_path]
495
- ):
496
- cloud_files_sync.append(cloud_file)
497
- local_files_sync.append(local_path.as_posix())
498
- else:
499
- cloud_files_sync = list(cloud_stats.keys())
500
- local_files_sync = [local_path.as_posix() for local_path in local_paths]
501
-
502
- if cloud_files_sync:
503
- if just_check:
504
- return True
505
-
506
- callback = ProgressCallback.requires_progress(
507
- maybe_callback=kwargs.pop("callback", None),
508
- print_progress=print_progress,
509
- objectname=destination.name,
510
- action="synchronizing",
511
- adjust_size=False,
512
- )
513
- origin.fs.download(
514
- cloud_files_sync,
515
- local_files_sync,
516
- recursive=False,
517
- callback=callback,
518
- **kwargs,
519
- )
520
- if not use_size:
521
- for i, cloud_file in enumerate(cloud_files_sync):
522
- cloud_mtime = cloud_stats[cloud_file]
523
- os.utime(local_files_sync[i], times=(cloud_mtime, cloud_mtime))
524
- else:
525
- return False
526
-
527
- if is_dir and local_paths_all:
528
- for path in (path for path in local_paths_all if path not in local_paths):
529
- path.unlink()
530
- parent = path.parent
531
- if next(parent.iterdir(), None) is None:
532
- parent.rmdir()
533
-
534
- return True
535
-
536
-
537
- def modified(self) -> datetime | None:
538
- """Return modified time stamp."""
539
- mtime = self.fs.modified(str(self))
540
- if mtime.tzinfo is None:
541
- mtime = mtime.replace(tzinfo=timezone.utc)
542
- return mtime.astimezone().replace(tzinfo=None)
543
-
544
-
545
- def compute_file_tree(
546
- path: UPath,
547
- *,
548
- level: int = -1,
549
- only_dirs: bool = False,
550
- n_max_files_per_dir_and_type: int = 100,
551
- n_max_files: int = 1000,
552
- include_paths: set[Any] | None = None,
553
- skip_suffixes: list[str] | None = None,
554
- ) -> tuple[str, int]:
555
- # .exists() helps to separate files from folders for gcsfs
556
- # otherwise sometimes it has is_dir() True and is_file() True
557
- if path.protocol == "gs" and not path.exists():
558
- raise FileNotFoundError
559
-
560
- space = " "
561
- branch = "│ "
562
- tee = "├── "
563
- last = "└── "
564
- if skip_suffixes is None:
565
- skip_suffixes_tuple = ()
566
- else:
567
- skip_suffixes_tuple = tuple(skip_suffixes) # type: ignore
568
- n_files = 0
569
- n_directories = 0
570
-
571
- # by default only including registered files
572
- # need a flag and a proper implementation
573
- suffixes = set()
574
- include_dirs = set()
575
- if include_paths is not None:
576
- include_dirs = {d for p in include_paths for d in p.parents}
577
- else:
578
- include_paths = set()
579
-
580
- def inner(dir_path: Path, prefix: str = "", level: int = -1):
581
- nonlocal n_files, n_directories, suffixes
582
- if level == 0:
583
- return
584
- stripped_dir_path = dir_path.as_posix().rstrip("/")
585
- # do not iterate through zarr directories
586
- if stripped_dir_path.endswith(skip_suffixes_tuple):
587
- return
588
- # this is needed so that the passed folder is not listed
589
- contents = [
590
- i
591
- for i in dir_path.iterdir()
592
- if i.as_posix().rstrip("/") != stripped_dir_path
593
- ]
594
- if only_dirs:
595
- contents = [d for d in contents if d.is_dir()]
596
- pointers = [tee] * (len(contents) - 1) + [last]
597
- n_files_per_dir_and_type = defaultdict(lambda: 0) # type: ignore
598
- # TODO: pass strict=False to zip with python > 3.9
599
- for pointer, child_path in zip(pointers, contents, strict=False): # type: ignore
600
- if child_path.is_dir():
601
- if include_dirs and child_path not in include_dirs:
602
- continue
603
- yield prefix + pointer + child_path.name + "/"
604
- n_directories += 1
605
- n_files_per_dir_and_type = defaultdict(lambda: 0)
606
- extension = branch if pointer == tee else space
607
- yield from inner(child_path, prefix=prefix + extension, level=level - 1)
608
- elif not only_dirs:
609
- if include_paths and child_path not in include_paths:
610
- continue
611
- suffix = extract_suffix_from_path(child_path)
612
- suffixes.add(suffix)
613
- n_files_per_dir_and_type[suffix] += 1
614
- n_files += 1
615
- if n_files_per_dir_and_type[suffix] == n_max_files_per_dir_and_type:
616
- yield prefix + "..."
617
- elif n_files_per_dir_and_type[suffix] > n_max_files_per_dir_and_type:
618
- continue
619
- else:
620
- yield prefix + pointer + child_path.name
621
-
622
- folder_tree = ""
623
- iterator = inner(path, level=level)
624
- for line in islice(iterator, n_max_files):
625
- folder_tree += f"\n{line}"
626
- if next(iterator, None):
627
- folder_tree += f"\n... only showing {n_max_files} out of {n_files} files"
628
- directory_info = "directory" if n_directories == 1 else "directories"
629
- display_suffixes = ", ".join([f"{suffix!r}" for suffix in suffixes])
630
- suffix_message = f" with suffixes {display_suffixes}" if n_files > 0 else ""
631
- message = (
632
- f"{n_directories} sub-{directory_info} &"
633
- f" {n_files} files{suffix_message}\n{path.resolve()}{folder_tree}"
634
- )
635
- return message, n_files
636
-
637
-
638
- # adapted from: https://stackoverflow.com/questions/9727673
639
- def view_tree(
640
- path: Path,
641
- *,
642
- level: int = 2,
643
- only_dirs: bool = False,
644
- n_max_files_per_dir_and_type: int = 100,
645
- n_max_files: int = 1000,
646
- include_paths: set[Any] | None = None,
647
- skip_suffixes: list[str] | None = None,
648
- ) -> None:
649
- """Print a visual tree structure of files & directories.
650
-
651
- Args:
652
- level: If `1`, only iterate through one level, if `2` iterate through 2
653
- levels, if `-1` iterate through entire hierarchy.
654
- only_dirs: Only iterate through directories.
655
- n_max_files: Display limit. Will only show this many files. Doesn't affect count.
656
- include_paths: Restrict to these paths.
657
- skip_suffixes: Skip directories with these suffixes.
658
-
659
- Examples:
660
- >>> dir_path = ln.examples.datasets.generate_cell_ranger_files(
661
- >>> "sample_001", ln.settings.storage
662
- >>> )
663
- >>> ln.UPath(dir_path).view_tree()
664
- 3 subdirectories, 15 files
665
- sample_001
666
- ├── web_summary.html
667
- ├── metrics_summary.csv
668
- ├── molecule_info.h5
669
- ├── filtered_feature_bc_matrix
670
- │ ├── features.tsv.gz
671
- │ ├── barcodes.tsv.gz
672
- │ └── matrix.mtx.gz
673
- ├── analysis
674
- │ └── analysis.csv
675
- ├── raw_feature_bc_matrix
676
- │ ├── features.tsv.gz
677
- │ ├── barcodes.tsv.gz
678
- │ └── matrix.mtx.gz
679
- ├── possorted_genome_bam.bam.bai
680
- ├── cloupe.cloupe
681
- ├── possorted_genome_bam.bam
682
- ├── filtered_feature_bc_matrix.h5
683
- └── raw_feature_bc_matrix.h5
684
- """
685
- message, _ = compute_file_tree(
686
- path,
687
- level=level,
688
- only_dirs=only_dirs,
689
- n_max_files=n_max_files,
690
- n_max_files_per_dir_and_type=n_max_files_per_dir_and_type,
691
- include_paths=include_paths,
692
- skip_suffixes=skip_suffixes,
693
- )
694
- logger.print(message)
695
-
696
-
697
- def to_url(upath):
698
- """Public storage URL.
699
-
700
- Generates a public URL for an object in an S3 bucket using fsspec's UPath,
701
- considering the bucket's region.
702
-
703
- Args:
704
- - upath: A UPath object representing an S3 path.
705
-
706
- Returns:
707
- - A string containing the public URL to the S3 object.
708
- """
709
- if upath.protocol != "s3":
710
- raise ValueError("The provided UPath must be an S3 path.")
711
- key = "/".join(upath.parts[1:])
712
- bucket = upath.drive
713
- region = get_storage_region(upath)
714
- if region == "us-east-1":
715
- return f"https://{bucket}.s3.amazonaws.com/{key}"
716
- else:
717
- return f"https://{bucket}.s3-{region}.amazonaws.com/{key}"
718
-
719
-
720
- # Why aren't we subclassing?
721
- #
722
- # The problem is that UPath defines a type system of paths
723
- # Its __new__ method returns instances of different subclasses rather than a UPath object
724
- # If we create a custom subclass naively, subclasses of the parent UPath won't
725
- # be subclasses of our custom subclass
726
- # This makes life really hard in type checks involving local to cloud comparisons, etc.
727
- # Hence, we extend the existing UPath and amend the docs
728
- # Some of this might end up in the original UPath implementation over time, we'll see.
729
-
730
-
731
- # add custom functions
732
- UPath.modified = property(modified)
733
- UPath.synchronize = deprecated("synchronize_to")(synchronize_to)
734
- UPath.synchronize_to = synchronize_to
735
- UPath.upload_from = upload_from
736
- UPath.to_url = to_url
737
- UPath.download_to = download_to
738
- UPath.view_tree = view_tree
739
- # unfortunately, we also have to do this for the subclasses
740
- Path.view_tree = view_tree # type: ignore
741
-
742
- UPath.glob.__doc__ = Path.glob.__doc__
743
- UPath.rglob.__doc__ = Path.rglob.__doc__
744
- UPath.stat.__doc__ = Path.stat.__doc__
745
- UPath.iterdir.__doc__ = Path.iterdir.__doc__
746
- UPath.resolve.__doc__ = Path.resolve.__doc__
747
- UPath.relative_to.__doc__ = Path.relative_to.__doc__
748
- UPath.exists.__doc__ = Path.exists.__doc__
749
- UPath.is_dir.__doc__ = Path.is_dir.__doc__
750
- UPath.is_file.__doc__ = Path.is_file.__doc__
751
- UPath.unlink.__doc__ = Path.unlink.__doc__
752
- UPath.rename.__doc__ = """Move file, see `fsspec.AbstractFileSystem.mv`.
753
-
754
- For example::
755
-
756
- upath = UPath("s3://my-bucket/my-file")
757
- upath.rename(UPath("s3://my-bucket/my-file-renamed"))
758
- upath.rename("my-file-renamed")
759
- """
760
- UPath.__doc__ = """Paths: low-level key-value access to files.
761
-
762
- Offers the typical access patterns of file systems and object stores, for instance::
763
-
764
- upath = UPath("s3://my-bucket/my-folder/my-file.txt")
765
- upath.exists() # file exists in storage
766
-
767
- LaminDB exposes `universal_pathlib.UPath` and adds functionality related to authentication and the following methods::
768
-
769
- upath.view_tree() # view a file tree
770
- upath.upload_from("local-file.txt") # upload a local file
771
- upath.download_to("local-file.txt") # download a file
772
- upath.synchronize_to("local-folder/") # synchronize a folder
773
-
774
- Args:
775
- pathlike: A string or `Path` to a local or cloud file/directory/folder.
776
- """
777
-
778
- logger.debug("upath.UPath has been patched")
779
-
780
- # suppress the warning from upath about hf (huggingface) filesystem
781
- # not being explicitly implemented in upath
782
- warnings.filterwarnings(
783
- "ignore", module="upath", message=".*'hf' filesystem not explicitly implemented.*"
784
- )
785
-
786
-
787
- # split query params from path string
788
- def _split_path_query(url: str) -> tuple[str, dict]:
789
- split_result = urlsplit(url)
790
- query = parse_qs(split_result.query)
791
- path = split_result._replace(query="").geturl()
792
- return path, query
793
-
794
-
795
- class S3QueryPath(S3Path):
796
- @classmethod
797
- def _transform_init_args(cls, args, protocol, storage_options):
798
- args, protocol, storage_options = super()._transform_init_args(
799
- args, protocol, storage_options
800
- )
801
- arg0 = args[0]
802
- path, query = _split_path_query(str(arg0))
803
- for param, param_values in query.items():
804
- if len(param_values) > 1:
805
- raise ValueError(f"Multiple values for {param} query parameter")
806
- else:
807
- param_value = param_values[0]
808
- if param in storage_options and param_value != storage_options[param]:
809
- raise ValueError(
810
- f"Incompatible {param} in query and storage_options"
811
- )
812
- storage_options.setdefault(param, param_value)
813
- if hasattr(arg0, "storage_options"):
814
- storage_options = {**arg0.storage_options, **storage_options}
815
-
816
- return (path, *args[1:]), protocol, storage_options
817
-
818
-
819
- register_implementation("s3", S3QueryPath, clobber=True)
820
-
821
-
822
- def get_storage_region(path: UPathStr) -> str | None:
823
- upath = UPath(path)
824
-
825
- if upath.protocol != "s3":
826
- return None
827
-
828
- bucket = upath.drive
829
-
830
- if bucket == "scverse-spatial-eu-central-1":
831
- return "eu-central-1"
832
- elif f"s3://{bucket}" in HOSTED_BUCKETS:
833
- return bucket.replace("lamin-", "")
834
-
835
- from botocore.exceptions import ClientError
836
-
837
- if isinstance(path, str):
838
- import botocore.session
839
- from botocore.config import Config
840
-
841
- path_part = path.replace("s3://", "")
842
- # check for endpoint_url in the path string
843
- if "?" in path_part:
844
- path_part, query = _split_path_query(path_part)
845
- endpoint_url = query.get("endpoint_url", [None])[0]
846
- else:
847
- endpoint_url = None
848
- session = botocore.session.get_session()
849
- credentials = session.get_credentials()
850
- if credentials is None or credentials.access_key is None:
851
- config = Config(signature_version=botocore.session.UNSIGNED)
852
- else:
853
- config = None
854
- s3_client = session.create_client(
855
- "s3", endpoint_url=endpoint_url, config=config
856
- )
857
- try:
858
- response = s3_client.head_bucket(Bucket=bucket)
859
- except ClientError as exc:
860
- response = getattr(exc, "response", {})
861
- if response.get("Error", {}).get("Code") == "404":
862
- raise exc
863
- else:
864
- upath = get_aws_options_manager()._path_inject_options(upath, {})
865
- try:
866
- response = upath.fs.call_s3("head_bucket", Bucket=bucket)
867
- except Exception as exc:
868
- cause = getattr(exc, "__cause__", None)
869
- if not isinstance(cause, ClientError):
870
- raise exc
871
- response = getattr(cause, "response", {})
872
- if response.get("Error", {}).get("Code") == "404":
873
- raise exc
874
-
875
- region = (
876
- response.get("ResponseMetadata", {})
877
- .get("HTTPHeaders", {})
878
- .get("x-amz-bucket-region", None)
879
- )
880
- return region
881
-
882
-
883
- def create_path(path: UPathStr, access_token: str | None = None) -> UPath:
884
- upath = UPath(path).expanduser()
885
-
886
- if upath.protocol == "s3":
887
- # add managed credentials and other options for AWS s3 paths
888
- return get_aws_options_manager().enrich_path(upath, access_token)
889
-
890
- if upath.protocol in {"http", "https"}:
891
- # this is needed because by default aiohttp drops a connection after 5 min
892
- # so it is impossible to download large files
893
- storage_options = {}
894
- client_kwargs = upath.storage_options.get("client_kwargs", {})
895
- if "timeout" not in client_kwargs:
896
- from aiohttp import ClientTimeout
897
-
898
- client_kwargs = {
899
- **client_kwargs,
900
- "timeout": ClientTimeout(sock_connect=30, sock_read=30),
901
- }
902
- storage_options["client_kwargs"] = client_kwargs
903
- # see download_to for the reason
904
- if "use_listings_cache" not in upath.storage_options:
905
- storage_options["use_listings_cache"] = True # type: ignore
906
- if len(storage_options) > 0:
907
- return UPath(upath, **storage_options)
908
- return upath
909
-
910
-
911
- def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
912
- size = stat["size"]
913
- hash, hash_type = None, None
914
- # gs, use md5Hash instead of etag for now
915
- if "md5Hash" in stat:
916
- # gs hash is already in base64
917
- hash = stat["md5Hash"].strip('"=')
918
- hash_type = "md5"
919
- # hf
920
- elif "blob_id" in stat:
921
- hash = b16_to_b64(stat["blob_id"])
922
- hash_type = "sha1"
923
- elif "ETag" in stat:
924
- etag = stat["ETag"]
925
- if "mimetype" in stat or ("url" in stat and stat["url"].startswith("http")):
926
- # http
927
- hash = hash_string(etag.strip('"'))
928
- hash_type = "md5-etag"
929
- else:
930
- # s3
931
- # small files
932
- if "-" not in etag:
933
- # only store hash for non-multipart uploads
934
- # we can't rapidly validate multi-part uploaded files client-side
935
- # we can add more logic later down-the-road
936
- hash = b16_to_b64(etag)
937
- hash_type = "md5"
938
- else:
939
- stripped_etag, suffix = etag.split("-")
940
- suffix = suffix.strip('"')
941
- hash = b16_to_b64(stripped_etag)
942
- hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
943
- if hash is not None:
944
- hash = hash[:HASH_LENGTH]
945
- return size, hash, hash_type
946
-
947
-
948
- def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
949
- objects = path.fs.find(path.as_posix(), detail=True)
950
- hash, hash_type = None, None
951
- compute_list_hash = True
952
- if path.protocol == "s3":
953
- accessor = "ETag"
954
- elif path.protocol == "gs":
955
- accessor = "md5Hash"
956
- elif path.protocol == "hf":
957
- accessor = "blob_id"
958
- else:
959
- compute_list_hash = False
960
- sizes = []
961
- hashes = []
962
- for object in objects.values():
963
- sizes.append(object["size"])
964
- if compute_list_hash:
965
- hashes.append(object[accessor].strip('"='))
966
- size = sum(sizes)
967
- n_files = len(sizes)
968
- if compute_list_hash:
969
- hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
970
- return size, hash, hash_type, n_files
971
-
972
-
973
- # is as fast as boto3: https://lamin.ai/laminlabs/lamin-site-assets/transform/krGp3hT1f78N5zKv
974
- def check_storage_is_empty(
975
- root: UPathStr, *, raise_error: bool = True, account_for_sqlite_file: bool = False
976
- ) -> int:
977
- from ._settings_storage import STORAGE_UID_FILE_KEY
978
-
979
- root_upath = UPath(root)
980
- root_string = root_upath.as_posix() # type: ignore
981
- n_offset_objects = 1 # because of storage_uid.txt file, see mark_storage_root()
982
- # if the storage_uid.txt was somehow deleted, we restore a dummy version of it
983
- # because we need it to count files in an empty directory on S3 (otherwise permission error)
984
- if not (root_upath / STORAGE_UID_FILE_KEY).exists():
985
- try:
986
- (root_upath / STORAGE_UID_FILE_KEY).write_text(
987
- "was deleted, restored during delete"
988
- )
989
- except FileNotFoundError:
990
- # this can happen if the root is a local non-existing path
991
- pass
992
- if account_for_sqlite_file:
993
- n_offset_objects += 1 # the SQLite file is in the ".lamindb" directory
994
- if root_string.startswith(HOSTED_BUCKETS):
995
- # in hosted buckets, count across entire root
996
- directory_string = root_string
997
- else:
998
- # in any other storage location, only count in .lamindb
999
- if not root_string.endswith("/"):
1000
- root_string += "/"
1001
- directory_string = root_string + ".lamindb"
1002
- objects = root_upath.fs.find(directory_string)
1003
- if account_for_sqlite_file:
1004
- # ignore exclusion dir for cloud sqlite
1005
- objects = [o for o in objects if "/.lamindb/_exclusion/" not in o]
1006
- n_files = len(objects)
1007
- n_diff = n_files - n_offset_objects
1008
- if n_diff > 0:
1009
- ask_for_deletion = (
1010
- "delete them prior to deleting the storage location"
1011
- if raise_error
1012
- else "consider deleting them"
1013
- )
1014
- message = f"'{directory_string}' contains {n_diff} objects:\n"
1015
- message += "\n".join(
1016
- [
1017
- o
1018
- for o in objects
1019
- if not o.endswith(".lamindb/storage_uid.txt")
1020
- and not (account_for_sqlite_file and o.endswith(".lamindb/lamin.db"))
1021
- ]
1022
- )
1023
- message += f"\n{ask_for_deletion}"
1024
- if raise_error:
1025
- raise StorageNotEmpty(message) from None
1026
- else:
1027
- logger.warning(message)
1028
- return n_diff
1
+ # we are not documenting UPath here because it's documented at lamindb.UPath
2
+ """Paths & file systems."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import math
7
+ import os
8
+ import warnings
9
+ from collections import defaultdict
10
+ from datetime import datetime, timezone
11
+ from functools import partial
12
+ from itertools import islice
13
+ from pathlib import Path, PosixPath, PurePosixPath, WindowsPath
14
+ from typing import TYPE_CHECKING, Any, Literal
15
+ from urllib.parse import parse_qs, urlsplit
16
+
17
+ import fsspec
18
+ from lamin_utils import logger
19
+ from upath import UPath
20
+ from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
21
+ from upath.implementations.local import LocalPath
22
+ from upath.registry import register_implementation
23
+
24
+ from lamindb_setup.errors import StorageNotEmpty
25
+
26
+ from ._aws_options import HOSTED_BUCKETS, get_aws_options_manager
27
+ from ._deprecated import deprecated
28
+ from .hashing import HASH_LENGTH, b16_to_b64, hash_from_hashes_list, hash_string
29
+
30
+ if TYPE_CHECKING:
31
+ from lamindb_setup.types import UPathStr
32
+
33
+ LocalPathClasses = (PosixPath, WindowsPath, LocalPath)
34
+
35
+ # also see https://gist.github.com/securifera/e7eed730cbe1ce43d0c29d7cd2d582f4
36
+ # ".gz" is not listed here as it typically occurs with another suffix
37
+ # the complete list is at lamindb.core.storage._suffixes
38
+ VALID_SIMPLE_SUFFIXES = {
39
+ #
40
+ # without readers
41
+ #
42
+ ".fasta",
43
+ ".fastq",
44
+ ".jpg",
45
+ ".mtx",
46
+ ".obo",
47
+ ".pdf",
48
+ ".png",
49
+ ".tar",
50
+ ".tiff",
51
+ ".txt",
52
+ ".tsv",
53
+ ".zip",
54
+ ".xml",
55
+ ".qs", # https://cran.r-project.org/web/packages/qs/vignettes/vignette.html
56
+ ".rds",
57
+ ".pt",
58
+ ".pth",
59
+ ".ckpt",
60
+ ".state_dict",
61
+ ".keras",
62
+ ".pb",
63
+ ".pbtxt",
64
+ ".savedmodel",
65
+ ".pkl",
66
+ ".pickle",
67
+ ".bin",
68
+ ".safetensors",
69
+ ".model",
70
+ ".mlmodel",
71
+ ".mar",
72
+ #
73
+ # with readers (see below)
74
+ #
75
+ ".h5ad",
76
+ ".parquet",
77
+ ".csv",
78
+ ".fcs",
79
+ ".xslx",
80
+ ".zarr",
81
+ ".json",
82
+ }
83
+ # below gets updated within lamindb because it's frequently changing
84
+ VALID_COMPOSITE_SUFFIXES = {".anndata.zarr"}
85
+
86
+ TRAILING_SEP = (os.sep, os.altsep) if os.altsep is not None else os.sep
87
+
88
+
89
+ def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
90
+ def process_digits(suffix: str):
91
+ if suffix[1:].isdigit(): # :1 to skip the dot
92
+ return "" # digits are no valid suffixes
93
+ else:
94
+ return suffix
95
+
96
+ suffixes = path.suffixes
97
+
98
+ if len(suffixes) <= 1:
99
+ return process_digits(path.suffix)
100
+
101
+ total_suffix = "".join(suffixes)
102
+ if total_suffix in VALID_SIMPLE_SUFFIXES:
103
+ return total_suffix
104
+ elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
105
+ # below seems slow but OK for now
106
+ for suffix in VALID_COMPOSITE_SUFFIXES:
107
+ if total_suffix.endswith(suffix):
108
+ break
109
+ return suffix
110
+ else:
111
+ print_hint = True
112
+ arg_name = "file" if arg_name is None else arg_name # for the warning
113
+ msg = f"{arg_name} has more than one suffix (path.suffixes), "
114
+ # first check the 2nd-to-last suffix because it might be followed by .gz
115
+ # or another compression-related suffix
116
+ # Alex thought about adding logic along the lines of path.suffixes[-1]
117
+ # in COMPRESSION_SUFFIXES to detect something like .random.gz and then
118
+ # add ".random.gz" but concluded it's too dangerous it's safer to just
119
+ # use ".gz" in such a case
120
+ if suffixes[-2] in VALID_SIMPLE_SUFFIXES:
121
+ suffix = "".join(suffixes[-2:])
122
+ # if the suffix preceding the compression suffixes is a valid suffix,
123
+ # we account for it; otherwise we don't.
124
+ # i.e. we should have .h5ad.tar.gz or .csv.tar.gz, not just .tar.gz
125
+ if (
126
+ suffix == ".tar.gz"
127
+ and len(suffixes) > 2
128
+ and (suffix_3 := suffixes[-3]) in VALID_SIMPLE_SUFFIXES
129
+ ):
130
+ suffix = suffix_3 + suffix
131
+ # do not print a warning for things like .tar.gz, .fastq.gz
132
+ if suffixes[-1] == ".gz":
133
+ print_hint = False
134
+ else:
135
+ msg += f"inferring: '{suffix}'"
136
+ else:
137
+ suffix = suffixes[-1] # this is equivalent to path.suffix
138
+ msg += (
139
+ f"using only last suffix: '{suffix}' - if you want your composite"
140
+ " suffix to be recognized add it to"
141
+ " lamindb.core.storage.VALID_SIMPLE_SUFFIXES.add()"
142
+ )
143
+ if print_hint:
144
+ logger.hint(msg)
145
+ return process_digits(suffix)
146
+
147
+
148
+ def infer_filesystem(path: UPathStr):
149
+ import fsspec # improve cold start
150
+
151
+ path_str = str(path)
152
+
153
+ if isinstance(path, UPath):
154
+ fs = path.fs
155
+ else:
156
+ protocol = fsspec.utils.get_protocol(path_str)
157
+ if protocol == "s3":
158
+ fs_kwargs = {"cache_regions": True}
159
+ else:
160
+ fs_kwargs = {}
161
+ fs = fsspec.filesystem(protocol, **fs_kwargs)
162
+
163
+ return fs, path_str
164
+
165
+
166
+ # this is needed to avoid CreateBucket permission
167
+ class S3FSMap(fsspec.FSMap):
168
+ def __setitem__(self, key, value):
169
+ """Store value in key."""
170
+ key = self._key_to_str(key)
171
+ self.fs.pipe_file(key, fsspec.mapping.maybe_convert(value))
172
+
173
+
174
+ def create_mapper(
175
+ fs,
176
+ url="",
177
+ check=False,
178
+ create=False,
179
+ missing_exceptions=None,
180
+ ):
181
+ if fsspec.utils.get_protocol(url) == "s3":
182
+ return S3FSMap(
183
+ url, fs, check=check, create=False, missing_exceptions=missing_exceptions
184
+ )
185
+ else:
186
+ return fsspec.FSMap(
187
+ url, fs, check=check, create=create, missing_exceptions=missing_exceptions
188
+ )
189
+
190
+
191
+ def print_hook(size: int, value: int, objectname: str, action: str):
192
+ if size == 0:
193
+ progress_in_percent = 100.0
194
+ else:
195
+ progress_in_percent = (value / size) * 100
196
+ out = f"... {action} {objectname}: {min(progress_in_percent, 100):4.1f}%"
197
+ if "NBPRJ_TEST_NBPATH" not in os.environ:
198
+ end = "\n" if progress_in_percent >= 100 else "\r"
199
+ print(out, end=end)
200
+
201
+
202
+ class ProgressCallback(fsspec.callbacks.Callback):
203
+ def __init__(
204
+ self,
205
+ objectname: str,
206
+ action: Literal["uploading", "downloading", "synchronizing"],
207
+ adjust_size: bool = False,
208
+ ):
209
+ assert action in {"uploading", "downloading", "synchronizing"}
210
+
211
+ super().__init__()
212
+
213
+ self.action = action
214
+ print_progress = partial(print_hook, objectname=objectname, action=action)
215
+ self.hooks = {"print_progress": print_progress}
216
+
217
+ self.adjust_size = adjust_size
218
+
219
+ def absolute_update(self, value):
220
+ pass
221
+
222
+ def relative_update(self, inc=1):
223
+ pass
224
+
225
+ def update_relative_value(self, inc=1):
226
+ if inc != 0:
227
+ self.value += inc
228
+ self.call()
229
+ else:
230
+ # this is specific to http filesystem
231
+ # for some reason the last update is 0 always
232
+ # sometimes the reported result is less than 100%
233
+ # here 100% is forced manually in this case
234
+ if self.value < 1.0 and self.value >= 0.999:
235
+ self.value = self.size
236
+ self.call()
237
+
238
+ def branch(self, path_1, path_2, kwargs):
239
+ if self.adjust_size:
240
+ if Path(path_2 if self.action != "uploading" else path_1).is_dir():
241
+ self.size -= 1
242
+ kwargs["callback"] = ChildProgressCallback(self)
243
+
244
+ def branched(self, path_1, path_2, **kwargs):
245
+ self.branch(path_1, path_2, kwargs)
246
+ return kwargs["callback"]
247
+
248
+ def wrap(self, iterable):
249
+ if self.adjust_size:
250
+ paths = []
251
+ for lpath, rpath in iterable:
252
+ paths.append((lpath, rpath))
253
+ if Path(lpath).is_dir():
254
+ self.size -= 1
255
+ self.adjust_size = False
256
+ return paths
257
+ else:
258
+ return iterable
259
+
260
+ @classmethod
261
+ def requires_progress(
262
+ cls,
263
+ maybe_callback: fsspec.callbacks.Callback | None,
264
+ print_progress: bool,
265
+ objectname: str,
266
+ action: Literal["uploading", "downloading", "synchronizing"],
267
+ **kwargs,
268
+ ):
269
+ if maybe_callback is None:
270
+ if print_progress:
271
+ return cls(objectname, action, **kwargs)
272
+ else:
273
+ return fsspec.callbacks.NoOpCallback()
274
+ return maybe_callback
275
+
276
+
277
+ class ChildProgressCallback(fsspec.callbacks.Callback):
278
+ def __init__(self, parent: ProgressCallback):
279
+ super().__init__()
280
+
281
+ self.parent = parent
282
+
283
+ def parent_update(self, inc=1):
284
+ self.parent.update_relative_value(inc)
285
+
286
+ def relative_update(self, inc=1):
287
+ if self.size != 0:
288
+ self.parent_update(inc / self.size)
289
+ else:
290
+ self.parent_update(1)
291
+
292
+
293
+ def download_to(self, local_path: UPathStr, print_progress: bool = True, **kwargs):
294
+ """Download from self (a destination in the cloud) to the local path."""
295
+ if "recursive" not in kwargs:
296
+ kwargs["recursive"] = True
297
+ if print_progress and "callback" not in kwargs:
298
+ callback = ProgressCallback(
299
+ PurePosixPath(local_path).name, "downloading", adjust_size=True
300
+ )
301
+ kwargs["callback"] = callback
302
+
303
+ cloud_path_str = str(self)
304
+ local_path_str = str(local_path)
305
+ # needed due to https://github.com/fsspec/filesystem_spec/issues/1766
306
+ # otherwise fsspec calls fs._ls_real where it reads the body and parses links
307
+ # so the file is downloaded 2 times
308
+ # upath doesn't call fs.ls to infer type, so it is safe to call
309
+ if self.protocol in {"http", "https"} and self.stat().as_info()["type"] == "file":
310
+ self.fs.use_listings_cache = True
311
+ self.fs.dircache[cloud_path_str] = []
312
+
313
+ self.fs.download(cloud_path_str, local_path_str, **kwargs)
314
+
315
+
316
+ def upload_from(
317
+ self,
318
+ local_path: UPathStr,
319
+ create_folder: bool | None = None,
320
+ print_progress: bool = True,
321
+ **kwargs,
322
+ ) -> UPath:
323
+ """Upload from the local path to `self` (a destination in the cloud).
324
+
325
+ If the local path is a directory, recursively upload its contents.
326
+
327
+ Args:
328
+ local_path: A local path of a file or directory.
329
+ create_folder: Only applies if `local_path` is a directory and then
330
+ defaults to `True`. If `True`, make a new folder in the destination
331
+ using the directory name of `local_path`. If `False`, upload the
332
+ contents of the directory to the root-level of the destination.
333
+ print_progress: Print progress.
334
+
335
+ Returns:
336
+ The destination path.
337
+ """
338
+ local_path = Path(local_path)
339
+ local_path_is_dir = local_path.is_dir()
340
+ if create_folder is None:
341
+ create_folder = local_path_is_dir
342
+ if create_folder and not local_path_is_dir:
343
+ raise ValueError("create_folder can only be True if local_path is a directory")
344
+
345
+ if print_progress and "callback" not in kwargs:
346
+ callback = ProgressCallback(local_path.name, "uploading")
347
+ kwargs["callback"] = callback
348
+
349
+ protocol = self.protocol
350
+ cleanup_cache = False
351
+ source: str | list[str] = local_path.as_posix()
352
+ destination: str | list[str] = self.as_posix()
353
+ if local_path_is_dir:
354
+ if not create_folder:
355
+ source = [
356
+ path.as_posix() for path in local_path.rglob("*") if path.is_file()
357
+ ]
358
+ destination = fsspec.utils.other_paths(
359
+ source, self.as_posix(), exists=False, flatten=False
360
+ )
361
+ elif protocol == "s3" and (bucket := self.drive) not in self.fs.dircache:
362
+ # the below lines are to avoid s3fs triggering create_bucket in upload if
363
+ # dirs are present, which avoids the permission error
364
+ self.fs.dircache[bucket] = [{}]
365
+ assert isinstance(destination, str)
366
+ if not destination.endswith(TRAILING_SEP):
367
+ destination += "/"
368
+ cleanup_cache = True
369
+ elif protocol == "s3" and "chunksize" not in kwargs:
370
+ size = local_path.stat().st_size
371
+ MiB = 1024**2
372
+ DEFAULT_CHUNKSIZE = 50 * MiB # as in s3fs
373
+ if size / DEFAULT_CHUNKSIZE > 10000: # should be no more than 10k parts for s3
374
+ raw = math.ceil(size / 10000)
375
+ step = 5 * MiB
376
+ rounded = math.ceil(raw / step) * step
377
+ kwargs["chunksize"] = rounded
378
+
379
+ self.fs.upload(source, destination, recursive=create_folder, **kwargs)
380
+
381
+ if cleanup_cache:
382
+ # normally this is invalidated after the upload but still better to check
383
+ if bucket in self.fs.dircache:
384
+ del self.fs.dircache[bucket]
385
+
386
+ if local_path_is_dir and create_folder:
387
+ return self / local_path.name
388
+ else:
389
+ return self
390
+
391
+
392
+ def synchronize_to(
393
+ origin: UPath,
394
+ destination: Path,
395
+ error_no_origin: bool = True,
396
+ print_progress: bool = False,
397
+ just_check: bool = False,
398
+ **kwargs,
399
+ ) -> bool:
400
+ """Sync to a local destination path."""
401
+ destination = destination.resolve()
402
+ protocol = origin.protocol
403
+ stat_kwargs = {"expand_info": True} if protocol == "hf" else {}
404
+ origin_str = str(origin)
405
+ try:
406
+ cloud_info = origin.fs.stat(origin_str, **stat_kwargs)
407
+ exists = True
408
+ is_dir = cloud_info["type"] == "directory"
409
+ except FileNotFoundError:
410
+ exists = False
411
+
412
+ if not exists:
413
+ warn_or_error = f"The original path {origin} does not exist anymore."
414
+ if destination.exists():
415
+ warn_or_error += (
416
+ f"\nHowever, the local path {destination} still exists, you might want"
417
+ " to reupload the object back."
418
+ )
419
+ logger.warning(warn_or_error)
420
+ elif error_no_origin:
421
+ warn_or_error += "\nIt is not possible to synchronize."
422
+ raise FileNotFoundError(warn_or_error)
423
+ return False
424
+
425
+ use_size: bool = False
426
+ # use casting to int to avoid problems when the local filesystem
427
+ # discards fractional parts of timestamps
428
+ if protocol == "s3":
429
+ get_modified = lambda file_stat: int(file_stat["LastModified"].timestamp())
430
+ elif protocol == "gs":
431
+ get_modified = lambda file_stat: int(file_stat["mtime"].timestamp())
432
+ elif protocol == "hf":
433
+ get_modified = lambda file_stat: int(file_stat["last_commit"].date.timestamp())
434
+ else: # http etc
435
+ use_size = True
436
+ get_modified = lambda file_stat: file_stat["size"]
437
+
438
+ if use_size:
439
+ is_sync_needed = lambda cloud_size, local_stat: cloud_size != local_stat.st_size
440
+ else:
441
+ # no need to cast local_stat.st_mtime to int
442
+ # because if it has the fractional part and cloud_mtime doesn't
443
+ # and they have the same integer part then cloud_mtime can't be bigger
444
+ is_sync_needed = (
445
+ lambda cloud_mtime, local_stat: cloud_mtime > local_stat.st_mtime
446
+ )
447
+
448
+ local_paths: list[Path] = []
449
+ cloud_stats: dict[str, int]
450
+ if is_dir:
451
+ cloud_stats = {
452
+ file: get_modified(stat)
453
+ for file, stat in origin.fs.find(
454
+ origin_str, detail=True, **stat_kwargs
455
+ ).items()
456
+ }
457
+ for cloud_path in cloud_stats:
458
+ file_key = PurePosixPath(cloud_path).relative_to(origin.path).as_posix()
459
+ local_paths.append(destination / file_key)
460
+ else:
461
+ cloud_stats = {origin.path: get_modified(cloud_info)}
462
+ local_paths.append(destination)
463
+
464
+ local_paths_all: dict[Path, os.stat_result] = {}
465
+ if destination.exists():
466
+ if is_dir:
467
+ local_paths_all = {
468
+ path: path.stat() for path in destination.rglob("*") if path.is_file()
469
+ }
470
+ if not use_size:
471
+ # cast to int to remove the fractional parts
472
+ # there is a problem when a fractional part is allowed on one filesystem
473
+ # but not on the other
474
+ # so just normalize both to int
475
+ cloud_mts_max: int = max(cloud_stats.values())
476
+ local_mts_max: int = int(
477
+ max(stat.st_mtime for stat in local_paths_all.values())
478
+ )
479
+ if local_mts_max > cloud_mts_max:
480
+ return False
481
+ elif local_mts_max == cloud_mts_max:
482
+ if len(local_paths_all) == len(cloud_stats):
483
+ return False
484
+ elif just_check:
485
+ return True
486
+ else:
487
+ local_paths_all = {destination: destination.stat()}
488
+
489
+ cloud_files_sync = []
490
+ local_files_sync = []
491
+ for i, (cloud_file, cloud_stat) in enumerate(cloud_stats.items()):
492
+ local_path = local_paths[i]
493
+ if local_path not in local_paths_all or is_sync_needed(
494
+ cloud_stat, local_paths_all[local_path]
495
+ ):
496
+ cloud_files_sync.append(cloud_file)
497
+ local_files_sync.append(local_path.as_posix())
498
+ else:
499
+ cloud_files_sync = list(cloud_stats.keys())
500
+ local_files_sync = [local_path.as_posix() for local_path in local_paths]
501
+
502
+ if cloud_files_sync:
503
+ if just_check:
504
+ return True
505
+
506
+ callback = ProgressCallback.requires_progress(
507
+ maybe_callback=kwargs.pop("callback", None),
508
+ print_progress=print_progress,
509
+ objectname=destination.name,
510
+ action="synchronizing",
511
+ adjust_size=False,
512
+ )
513
+ origin.fs.download(
514
+ cloud_files_sync,
515
+ local_files_sync,
516
+ recursive=False,
517
+ callback=callback,
518
+ **kwargs,
519
+ )
520
+ if not use_size:
521
+ for i, cloud_file in enumerate(cloud_files_sync):
522
+ cloud_mtime = cloud_stats[cloud_file]
523
+ os.utime(local_files_sync[i], times=(cloud_mtime, cloud_mtime))
524
+ else:
525
+ return False
526
+
527
+ if is_dir and local_paths_all:
528
+ for path in (path for path in local_paths_all if path not in local_paths):
529
+ path.unlink()
530
+ parent = path.parent
531
+ if next(parent.iterdir(), None) is None:
532
+ parent.rmdir()
533
+
534
+ return True
535
+
536
+
537
+ def modified(self) -> datetime | None:
538
+ """Return modified time stamp."""
539
+ mtime = self.fs.modified(str(self))
540
+ if mtime.tzinfo is None:
541
+ mtime = mtime.replace(tzinfo=timezone.utc)
542
+ return mtime.astimezone().replace(tzinfo=None)
543
+
544
+
545
+ def compute_file_tree(
546
+ path: UPath,
547
+ *,
548
+ level: int = -1,
549
+ only_dirs: bool = False,
550
+ n_max_files_per_dir_and_type: int = 100,
551
+ n_max_files: int = 1000,
552
+ include_paths: set[Any] | None = None,
553
+ skip_suffixes: list[str] | None = None,
554
+ ) -> tuple[str, int]:
555
+ # .exists() helps to separate files from folders for gcsfs
556
+ # otherwise a path can sometimes report both is_dir() and is_file() as True
557
+ if path.protocol == "gs" and not path.exists():
558
+ raise FileNotFoundError
559
+
560
+ space = " "
561
+ branch = "│ "
562
+ tee = "├── "
563
+ last = "└── "
564
+ if skip_suffixes is None:
565
+ skip_suffixes_tuple = ()
566
+ else:
567
+ skip_suffixes_tuple = tuple(skip_suffixes) # type: ignore
568
+ n_files = 0
569
+ n_directories = 0
570
+
571
+ # by default, only registered files are included
572
+ # TODO: add a flag and a proper implementation
573
+ suffixes = set()
574
+ include_dirs = set()
575
+ if include_paths is not None:
576
+ include_dirs = {d for p in include_paths for d in p.parents}
577
+ else:
578
+ include_paths = set()
579
+
580
+ def inner(dir_path: Path, prefix: str = "", level: int = -1):
581
+ nonlocal n_files, n_directories, suffixes
582
+ if level == 0:
583
+ return
584
+ stripped_dir_path = dir_path.as_posix().rstrip("/")
585
+ # do not iterate through zarr directories
586
+ if stripped_dir_path.endswith(skip_suffixes_tuple):
587
+ return
588
+ # this is needed so that the passed folder is not listed
589
+ contents = [
590
+ i
591
+ for i in dir_path.iterdir()
592
+ if i.as_posix().rstrip("/") != stripped_dir_path
593
+ ]
594
+ if only_dirs:
595
+ contents = [d for d in contents if d.is_dir()]
596
+ pointers = [tee] * (len(contents) - 1) + [last]
597
+ n_files_per_dir_and_type = defaultdict(lambda: 0) # type: ignore
598
+ # strict=False for zip requires python >= 3.10
599
+ for pointer, child_path in zip(pointers, contents, strict=False): # type: ignore
600
+ if child_path.is_dir():
601
+ if include_dirs and child_path not in include_dirs:
602
+ continue
603
+ yield prefix + pointer + child_path.name + "/"
604
+ n_directories += 1
605
+ n_files_per_dir_and_type = defaultdict(lambda: 0)
606
+ extension = branch if pointer == tee else space
607
+ yield from inner(child_path, prefix=prefix + extension, level=level - 1)
608
+ elif not only_dirs:
609
+ if include_paths and child_path not in include_paths:
610
+ continue
611
+ suffix = extract_suffix_from_path(child_path)
612
+ suffixes.add(suffix)
613
+ n_files_per_dir_and_type[suffix] += 1
614
+ n_files += 1
615
+ if n_files_per_dir_and_type[suffix] == n_max_files_per_dir_and_type:
616
+ yield prefix + "..."
617
+ elif n_files_per_dir_and_type[suffix] > n_max_files_per_dir_and_type:
618
+ continue
619
+ else:
620
+ yield prefix + pointer + child_path.name
621
+
622
+ folder_tree = ""
623
+ iterator = inner(path, level=level)
624
+ for line in islice(iterator, n_max_files):
625
+ folder_tree += f"\n{line}"
626
+ if next(iterator, None):
627
+ folder_tree += f"\n... only showing {n_max_files} out of {n_files} files"
628
+ directory_info = "directory" if n_directories == 1 else "directories"
629
+ display_suffixes = ", ".join([f"{suffix!r}" for suffix in suffixes])
630
+ suffix_message = f" with suffixes {display_suffixes}" if n_files > 0 else ""
631
+ message = (
632
+ f"{n_directories} sub-{directory_info} &"
633
+ f" {n_files} files{suffix_message}\n{path.resolve()}{folder_tree}"
634
+ )
635
+ return message, n_files
636
+
637
+
638
+ # adapted from: https://stackoverflow.com/questions/9727673
639
+ def view_tree(
640
+ path: Path,
641
+ *,
642
+ level: int = 2,
643
+ only_dirs: bool = False,
644
+ n_max_files_per_dir_and_type: int = 100,
645
+ n_max_files: int = 1000,
646
+ include_paths: set[Any] | None = None,
647
+ skip_suffixes: list[str] | None = None,
648
+ ) -> None:
649
+ """Print a visual tree structure of files & directories.
650
+
651
+ Args:
652
+ level: If `1`, only iterate through one level, if `2` iterate through 2
653
+ levels, if `-1` iterate through entire hierarchy.
654
+ only_dirs: Only iterate through directories.
655
+ n_max_files_per_dir_and_type: Show at most this many files per directory and file type.
+ n_max_files: Display limit; show at most this many files in total. Doesn't affect the count.
656
+ include_paths: Restrict to these paths.
657
+ skip_suffixes: Skip directories with these suffixes.
658
+
659
+ Examples:
660
+ >>> dir_path = ln.examples.datasets.generate_cell_ranger_files(
661
+ ... "sample_001", ln.settings.storage
662
+ ... )
663
+ >>> ln.UPath(dir_path).view_tree()
664
+ 3 subdirectories, 15 files
665
+ sample_001
666
+ ├── web_summary.html
667
+ ├── metrics_summary.csv
668
+ ├── molecule_info.h5
669
+ ├── filtered_feature_bc_matrix
670
+ │ ├── features.tsv.gz
671
+ │ ├── barcodes.tsv.gz
672
+ │ └── matrix.mtx.gz
673
+ ├── analysis
674
+ │ └── analysis.csv
675
+ ├── raw_feature_bc_matrix
676
+ │ ├── features.tsv.gz
677
+ │ ├── barcodes.tsv.gz
678
+ │ └── matrix.mtx.gz
679
+ ├── possorted_genome_bam.bam.bai
680
+ ├── cloupe.cloupe
681
+ ├── possorted_genome_bam.bam
682
+ ├── filtered_feature_bc_matrix.h5
683
+ └── raw_feature_bc_matrix.h5
684
+ """
685
+ message, _ = compute_file_tree(
686
+ path,
687
+ level=level,
688
+ only_dirs=only_dirs,
689
+ n_max_files=n_max_files,
690
+ n_max_files_per_dir_and_type=n_max_files_per_dir_and_type,
691
+ include_paths=include_paths,
692
+ skip_suffixes=skip_suffixes,
693
+ )
694
+ logger.print(message)
695
+
696
+
697
+ def to_url(upath):
698
+ """Public storage URL.
699
+
700
+ Generates a public URL for an object in an S3 bucket using fsspec's UPath,
701
+ considering the bucket's region.
702
+
703
+ Args:
704
+ upath: A UPath object representing an S3 path.
705
+
706
+ Returns:
707
+ A string containing the public URL to the S3 object.
708
+ """
709
+ if upath.protocol != "s3":
710
+ raise ValueError("The provided UPath must be an S3 path.")
711
+ key = "/".join(upath.parts[1:])
712
+ bucket = upath.drive
713
+ region = get_storage_region(upath)
714
+ if region == "us-east-1":
715
+ return f"https://{bucket}.s3.amazonaws.com/{key}"
716
+ else:
717
+ return f"https://{bucket}.s3-{region}.amazonaws.com/{key}"
718
+
719
+
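# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the package diff): `to_url` maps an s3://
# path to its public HTTPS form; the bucket and key below are hypothetical and
# resolving the region requires access to the bucket.
from lamindb_setup.core.upath import UPath, to_url

path = UPath("s3://my-public-bucket/data/file.txt")
# us-east-1 bucket  -> "https://my-public-bucket.s3.amazonaws.com/data/file.txt"
# any other region  -> "https://my-public-bucket.s3-<region>.amazonaws.com/data/file.txt"
url = to_url(path)  # equivalently path.to_url(), once UPath is patched below
# ---------------------------------------------------------------------------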
720
+ # Why aren't we subclassing?
721
+ #
722
+ # The problem is that UPath defines a type system of paths
723
+ # Its __new__ method returns instances of different subclasses rather than a UPath object
724
+ # If we create a custom subclass naively, subclasses of the parent UPath won't
725
+ # be subclasses of our custom subclass
726
+ # This makes life really hard in type checks involving local to cloud comparisons, etc.
727
+ # Hence, we extend the existing UPath and amend the docs
728
+ # Some of this might end up in the original UPath implementation over time, we'll see.
729
+
730
+
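# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the package diff): what the note above
# means in practice. UPath.__new__ dispatches to protocol-specific classes,
# so a naive custom subclass never appears in the ancestry of the objects
# that UPath actually returns.
from upath import UPath
from upath.implementations.cloud import S3Path


class MyPath(UPath):  # hypothetical naive subclass
    pass


p = UPath("s3://my-bucket/key")  # hypothetical bucket
assert isinstance(p, S3Path) and isinstance(p, UPath)
# the returned object is an S3Path, not a MyPath, so isinstance checks against
# the custom subclass fail -- hence methods are patched onto UPath instead
assert not isinstance(p, MyPath)
# ---------------------------------------------------------------------------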
731
+ # add custom functions
732
+ UPath.modified = property(modified)
733
+ UPath.synchronize = deprecated("synchronize_to")(synchronize_to)
734
+ UPath.synchronize_to = synchronize_to
735
+ UPath.upload_from = upload_from
736
+ UPath.to_url = to_url
737
+ UPath.download_to = download_to
738
+ UPath.view_tree = view_tree
739
+ # unfortunately, we also have to do this for the subclasses
740
+ Path.view_tree = view_tree # type: ignore
741
+
742
+ UPath.glob.__doc__ = Path.glob.__doc__
743
+ UPath.rglob.__doc__ = Path.rglob.__doc__
744
+ UPath.stat.__doc__ = Path.stat.__doc__
745
+ UPath.iterdir.__doc__ = Path.iterdir.__doc__
746
+ UPath.resolve.__doc__ = Path.resolve.__doc__
747
+ UPath.relative_to.__doc__ = Path.relative_to.__doc__
748
+ UPath.exists.__doc__ = Path.exists.__doc__
749
+ UPath.is_dir.__doc__ = Path.is_dir.__doc__
750
+ UPath.is_file.__doc__ = Path.is_file.__doc__
751
+ UPath.unlink.__doc__ = Path.unlink.__doc__
752
+ UPath.rename.__doc__ = """Move file, see `fsspec.AbstractFileSystem.mv`.
753
+
754
+ For example::
755
+
756
+ upath = UPath("s3://my-bucket/my-file")
757
+ upath.rename(UPath("s3://my-bucket/my-file-renamed"))
758
+ upath.rename("my-file-renamed")
759
+ """
760
+ UPath.__doc__ = """Paths: low-level key-value access to files.
761
+
762
+ Offers the typical access patterns of file systems and object stores, for instance::
763
+
764
+ upath = UPath("s3://my-bucket/my-folder/my-file.txt")
765
+ upath.exists() # file exists in storage
766
+
767
+ LaminDB exposes `universal_pathlib.UPath` and adds functionality related to authentication and the following methods::
768
+
769
+ upath.view_tree() # view a file tree
770
+ upath.upload_from("local-file.txt") # upload a local file
771
+ upath.download_to("local-file.txt") # download a file
772
+ upath.synchronize_to("local-folder/") # synchronize a folder
773
+
774
+ Args:
775
+ pathlike: A string or `Path` pointing to a local or cloud file or folder.
776
+ """
777
+
778
+ logger.debug("upath.UPath has been patched")
779
+
780
+ # suppress the warning from upath about hf (huggingface) filesystem
781
+ # not being explicitly implemented in upath
782
+ warnings.filterwarnings(
783
+ "ignore", module="upath", message=".*'hf' filesystem not explicitly implemented.*"
784
+ )
785
+
786
+
787
+ # split query params from path string
788
+ def _split_path_query(url: str) -> tuple[str, dict]:
789
+ split_result = urlsplit(url)
790
+ query = parse_qs(split_result.query)
791
+ path = split_result._replace(query="").geturl()
792
+ return path, query
793
+
794
+
795
+ class S3QueryPath(S3Path):
796
+ @classmethod
797
+ def _transform_init_args(cls, args, protocol, storage_options):
798
+ args, protocol, storage_options = super()._transform_init_args(
799
+ args, protocol, storage_options
800
+ )
801
+ arg0 = args[0]
802
+ path, query = _split_path_query(str(arg0))
803
+ for param, param_values in query.items():
804
+ if len(param_values) > 1:
805
+ raise ValueError(f"Multiple values for {param} query parameter")
806
+ else:
807
+ param_value = param_values[0]
808
+ if param in storage_options and param_value != storage_options[param]:
809
+ raise ValueError(
810
+ f"Incompatible {param} in query and storage_options"
811
+ )
812
+ storage_options.setdefault(param, param_value)
813
+ if hasattr(arg0, "storage_options"):
814
+ storage_options = {**arg0.storage_options, **storage_options}
815
+
816
+ return (path, *args[1:]), protocol, storage_options
817
+
818
+ def is_bucket_versioned(self) -> bool:
819
+ return self.fs.is_bucket_versioned(self.drive)
820
+
821
+
822
+ register_implementation("s3", S3QueryPath, clobber=True)
823
+
824
+
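# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the package diff): with S3QueryPath
# registered for "s3", query parameters are split off the key by
# _split_path_query and moved into storage_options; the endpoint below is
# hypothetical.
from lamindb_setup.core.upath import UPath  # ensures the registration above ran

upath = UPath("s3://my-bucket/key?endpoint_url=http://localhost:9000")
# str(upath)                             -> "s3://my-bucket/key" (query stripped)
# upath.storage_options["endpoint_url"]  -> "http://localhost:9000"
# passing a conflicting endpoint_url both in the query and in storage_options
# raises a ValueError, see _transform_init_args above
# ---------------------------------------------------------------------------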
825
+ def get_storage_region(path: UPathStr) -> str | None:
826
+ upath = UPath(path)
827
+
828
+ if upath.protocol != "s3":
829
+ return None
830
+
831
+ bucket = upath.drive
832
+
833
+ if bucket == "scverse-spatial-eu-central-1":
834
+ return "eu-central-1"
835
+ elif f"s3://{bucket}" in HOSTED_BUCKETS:
836
+ return bucket.replace("lamin-", "")
837
+
838
+ from botocore.exceptions import ClientError
839
+
840
+ if isinstance(path, str):
841
+ import botocore.session
842
+ from botocore.config import Config
843
+
844
+ path_part = path.replace("s3://", "")
845
+ # check for endpoint_url in the path string
846
+ if "?" in path_part:
847
+ path_part, query = _split_path_query(path_part)
848
+ endpoint_url = query.get("endpoint_url", [None])[0]
849
+ else:
850
+ endpoint_url = None
851
+ session = botocore.session.get_session()
852
+ credentials = session.get_credentials()
853
+ if credentials is None or credentials.access_key is None:
854
+ config = Config(signature_version=botocore.session.UNSIGNED)
855
+ else:
856
+ config = None
857
+ s3_client = session.create_client(
858
+ "s3", endpoint_url=endpoint_url, config=config
859
+ )
860
+ try:
861
+ response = s3_client.head_bucket(Bucket=bucket)
862
+ except ClientError as exc:
863
+ response = getattr(exc, "response", {})
864
+ if response.get("Error", {}).get("Code") == "404":
865
+ raise exc
866
+ else:
867
+ upath = get_aws_options_manager()._path_inject_options(upath, {})
868
+ try:
869
+ response = upath.fs.call_s3("head_bucket", Bucket=bucket)
870
+ except Exception as exc:
871
+ cause = getattr(exc, "__cause__", None)
872
+ if not isinstance(cause, ClientError):
873
+ raise exc
874
+ response = getattr(cause, "response", {})
875
+ if response.get("Error", {}).get("Code") == "404":
876
+ raise exc
877
+
878
+ region = (
879
+ response.get("ResponseMetadata", {})
880
+ .get("HTTPHeaders", {})
881
+ .get("x-amz-bucket-region", None)
882
+ )
883
+ return region
884
+
885
+
886
+ def create_path(path: UPathStr, access_token: str | None = None) -> UPath:
887
+ upath = UPath(path).expanduser()
888
+
889
+ if upath.protocol == "s3":
890
+ # add managed credentials and other options for AWS s3 paths
891
+ return get_aws_options_manager().enrich_path(upath, access_token)
892
+
893
+ if upath.protocol in {"http", "https"}:
894
+ # this is needed because by default aiohttp drops a connection after 5 min
895
+ # so it is impossible to download large files
896
+ storage_options = {}
897
+ client_kwargs = upath.storage_options.get("client_kwargs", {})
898
+ if "timeout" not in client_kwargs:
899
+ from aiohttp import ClientTimeout
900
+
901
+ client_kwargs = {
902
+ **client_kwargs,
903
+ "timeout": ClientTimeout(sock_connect=30, sock_read=30),
904
+ }
905
+ storage_options["client_kwargs"] = client_kwargs
906
+ # see download_to for the reason
907
+ if "use_listings_cache" not in upath.storage_options:
908
+ storage_options["use_listings_cache"] = True # type: ignore
909
+ if len(storage_options) > 0:
910
+ return UPath(upath, **storage_options)
911
+ return upath
912
+
913
+
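# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the package diff): the http(s) branch of
# `create_path` above is roughly equivalent to setting the aiohttp timeout and
# listings cache yourself; the URL is hypothetical.
from aiohttp import ClientTimeout
from upath import UPath

url = "https://example.org/large-file.h5ad"
manual = UPath(
    url,
    client_kwargs={"timeout": ClientTimeout(sock_connect=30, sock_read=30)},
    use_listings_cache=True,
)
# create_path(url) injects the same options when they are not already set,
# so that long downloads are not cut off by aiohttp's default timeout
# ---------------------------------------------------------------------------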
914
+ def get_stat_file_cloud(stat: dict) -> tuple[int, str | None, str | None]:
915
+ size = stat["size"]
916
+ hash, hash_type = None, None
917
+ # gs, use md5Hash instead of etag for now
918
+ if "md5Hash" in stat:
919
+ # gs hash is already in base64
920
+ hash = stat["md5Hash"].strip('"=')
921
+ hash_type = "md5"
922
+ # hf
923
+ elif "blob_id" in stat:
924
+ hash = b16_to_b64(stat["blob_id"])
925
+ hash_type = "sha1"
926
+ elif "ETag" in stat:
927
+ etag = stat["ETag"]
928
+ if "mimetype" in stat or ("url" in stat and stat["url"].startswith("http")):
929
+ # http
930
+ hash = hash_string(etag.strip('"'))
931
+ hash_type = "md5-etag"
932
+ else:
933
+ # s3
934
+ # small files
935
+ if "-" not in etag:
936
+ # only store hash for non-multipart uploads
937
+ # we can't rapidly validate multi-part uploaded files client-side
938
+ # we can add more logic later down-the-road
939
+ hash = b16_to_b64(etag)
940
+ hash_type = "md5"
941
+ else:
942
+ stripped_etag, suffix = etag.split("-")
943
+ suffix = suffix.strip('"')
944
+ hash = b16_to_b64(stripped_etag)
945
+ hash_type = f"md5-{suffix}" # this is the S3 chunk-hashing strategy
946
+ if hash is not None:
947
+ hash = hash[:HASH_LENGTH]
948
+ return size, hash, hash_type
949
+
950
+
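# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the package diff): how the ETag branch of
# `get_stat_file_cloud` classifies S3 hashes; the stat dicts are hypothetical
# minimal examples (real fsspec stat dicts carry more keys).
from lamindb_setup.core.upath import get_stat_file_cloud

# single-part upload: the ETag is the plain md5 of the object
single = {"size": 10, "ETag": "9e107d9d372bb6826bd81d3542a419d6"}
size, hash_, hash_type = get_stat_file_cloud(single)  # hash_type -> "md5"

# multipart upload: the ETag is "<md5 of the concatenated part md5s>-<n_parts>",
# so the part count is preserved in the hash type
multi = {"size": 10, "ETag": "9e107d9d372bb6826bd81d3542a419d6-5"}
size, hash_, hash_type = get_stat_file_cloud(multi)  # hash_type -> "md5-5"
# ---------------------------------------------------------------------------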
951
+ def get_stat_dir_cloud(path: UPath) -> tuple[int, str | None, str | None, int]:
952
+ objects = path.fs.find(path.as_posix(), detail=True)
953
+ hash, hash_type = None, None
954
+ compute_list_hash = True
955
+ if path.protocol == "s3":
956
+ accessor = "ETag"
957
+ elif path.protocol == "gs":
958
+ accessor = "md5Hash"
959
+ elif path.protocol == "hf":
960
+ accessor = "blob_id"
961
+ else:
962
+ compute_list_hash = False
963
+ sizes = []
964
+ hashes = []
965
+ for object in objects.values():
966
+ sizes.append(object["size"])
967
+ if compute_list_hash:
968
+ hashes.append(object[accessor].strip('"='))
969
+ size = sum(sizes)
970
+ n_files = len(sizes)
971
+ if compute_list_hash:
972
+ hash, hash_type = hash_from_hashes_list(hashes), "md5-d"
973
+ return size, hash, hash_type, n_files
974
+
975
+
976
+ # is as fast as boto3: https://lamin.ai/laminlabs/lamin-site-assets/transform/krGp3hT1f78N5zKv
977
+ def check_storage_is_empty(
978
+ root: UPathStr, *, raise_error: bool = True, account_for_sqlite_file: bool = False
979
+ ) -> int:
980
+ from ._settings_storage import STORAGE_UID_FILE_KEY
981
+
982
+ root_upath = UPath(root)
983
+ root_string = root_upath.as_posix() # type: ignore
984
+ n_offset_objects = 1 # because of storage_uid.txt file, see mark_storage_root()
985
+ # if the storage_uid.txt was somehow deleted, we restore a dummy version of it
986
+ # because we need it to count files in an empty directory on S3 (otherwise permission error)
987
+ if not (root_upath / STORAGE_UID_FILE_KEY).exists():
988
+ try:
989
+ (root_upath / STORAGE_UID_FILE_KEY).write_text(
990
+ "was deleted, restored during delete"
991
+ )
992
+ except FileNotFoundError:
993
+ # this can happen if the root is a local non-existing path
994
+ pass
995
+ if account_for_sqlite_file:
996
+ n_offset_objects += 1 # the SQLite file is in the ".lamindb" directory
997
+ if root_string.startswith(HOSTED_BUCKETS):
998
+ # in hosted buckets, count across entire root
999
+ directory_string = root_string
1000
+ else:
1001
+ # in any other storage location, only count in .lamindb
1002
+ if not root_string.endswith("/"):
1003
+ root_string += "/"
1004
+ directory_string = root_string + ".lamindb"
1005
+ objects = root_upath.fs.find(directory_string)
1006
+ if account_for_sqlite_file:
1007
+ # ignore exclusion dir for cloud sqlite
1008
+ objects = [o for o in objects if "/.lamindb/_exclusion/" not in o]
1009
+ n_files = len(objects)
1010
+ n_diff = n_files - n_offset_objects
1011
+ if n_diff > 0:
1012
+ ask_for_deletion = (
1013
+ "delete them prior to deleting the storage location"
1014
+ if raise_error
1015
+ else "consider deleting them"
1016
+ )
1017
+ message = f"'{directory_string}' contains {n_diff} objects:\n"
1018
+ message += "\n".join(
1019
+ [
1020
+ o
1021
+ for o in objects
1022
+ if not o.endswith(".lamindb/storage_uid.txt")
1023
+ and not (account_for_sqlite_file and o.endswith(".lamindb/lamin.db"))
1024
+ ]
1025
+ )
1026
+ message += f"\n{ask_for_deletion}"
1027
+ if raise_error:
1028
+ raise StorageNotEmpty(message) from None
1029
+ else:
1030
+ logger.warning(message)
1031
+ return n_diff
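# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the package diff): the object-count
# arithmetic used by `check_storage_is_empty` for a hypothetical non-hosted
# root (so only ".lamindb" is scanned) with a cloud SQLite file.
objects = [
    "my-bucket/.lamindb/storage_uid.txt",  # marker file, always tolerated
    "my-bucket/.lamindb/lamin.db",         # tolerated via account_for_sqlite_file=True
    "my-bucket/.lamindb/abc123.parquet",   # an actual artifact
]
n_offset_objects = 1 + 1  # storage_uid.txt + the SQLite file
n_diff = len(objects) - n_offset_objects
assert n_diff == 1
# n_diff > 0, so check_storage_is_empty would raise StorageNotEmpty
# (or log a warning and return 1 when raise_error=False), listing only the parquet file
# ---------------------------------------------------------------------------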