lamindb_setup 0.77.2__py2.py3-none-any.whl → 0.77.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. lamindb_setup/__init__.py +1 -1
  2. lamindb_setup/_cache.py +34 -34
  3. lamindb_setup/_check.py +7 -7
  4. lamindb_setup/_check_setup.py +79 -79
  5. lamindb_setup/_close.py +35 -35
  6. lamindb_setup/_connect_instance.py +444 -444
  7. lamindb_setup/_delete.py +139 -137
  8. lamindb_setup/_django.py +41 -41
  9. lamindb_setup/_entry_points.py +22 -22
  10. lamindb_setup/_exportdb.py +68 -68
  11. lamindb_setup/_importdb.py +50 -50
  12. lamindb_setup/_init_instance.py +374 -374
  13. lamindb_setup/_migrate.py +239 -239
  14. lamindb_setup/_register_instance.py +36 -36
  15. lamindb_setup/_schema.py +27 -27
  16. lamindb_setup/_schema_metadata.py +411 -411
  17. lamindb_setup/_set_managed_storage.py +55 -55
  18. lamindb_setup/_setup_user.py +137 -137
  19. lamindb_setup/_silence_loggers.py +44 -44
  20. lamindb_setup/core/__init__.py +21 -21
  21. lamindb_setup/core/_aws_credentials.py +151 -151
  22. lamindb_setup/core/_aws_storage.py +48 -48
  23. lamindb_setup/core/_deprecated.py +55 -55
  24. lamindb_setup/core/_docs.py +14 -14
  25. lamindb_setup/core/_hub_core.py +590 -590
  26. lamindb_setup/core/_hub_crud.py +211 -211
  27. lamindb_setup/core/_hub_utils.py +109 -109
  28. lamindb_setup/core/_private_django_api.py +88 -88
  29. lamindb_setup/core/_settings.py +138 -138
  30. lamindb_setup/core/_settings_instance.py +467 -467
  31. lamindb_setup/core/_settings_load.py +105 -105
  32. lamindb_setup/core/_settings_save.py +81 -81
  33. lamindb_setup/core/_settings_storage.py +405 -393
  34. lamindb_setup/core/_settings_store.py +75 -75
  35. lamindb_setup/core/_settings_user.py +53 -53
  36. lamindb_setup/core/_setup_bionty_sources.py +101 -101
  37. lamindb_setup/core/cloud_sqlite_locker.py +232 -232
  38. lamindb_setup/core/django.py +114 -114
  39. lamindb_setup/core/exceptions.py +12 -12
  40. lamindb_setup/core/hashing.py +114 -114
  41. lamindb_setup/core/types.py +19 -19
  42. lamindb_setup/core/upath.py +779 -779
  43. {lamindb_setup-0.77.2.dist-info → lamindb_setup-0.77.3.dist-info}/METADATA +1 -1
  44. lamindb_setup-0.77.3.dist-info/RECORD +47 -0
  45. {lamindb_setup-0.77.2.dist-info → lamindb_setup-0.77.3.dist-info}/WHEEL +1 -1
  46. lamindb_setup-0.77.2.dist-info/RECORD +0 -47
  47. {lamindb_setup-0.77.2.dist-info → lamindb_setup-0.77.3.dist-info}/LICENSE +0 -0
lamindb_setup/core/upath.py
@@ -1,779 +1,779 @@
# we are not documenting UPath here because it's documented at lamindb.UPath
"""Paths & file systems."""

from __future__ import annotations

import os
from collections import defaultdict
from datetime import datetime, timezone
from functools import partial
from itertools import islice
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING, Any, Literal

import fsspec
from lamin_utils import logger
from upath import UPath
from upath.implementations.cloud import CloudPath, S3Path  # keep CloudPath!
from upath.implementations.local import LocalPath, PosixUPath, WindowsUPath

from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
from .hashing import HASH_LENGTH, b16_to_b64, hash_md5s_from_dir

if TYPE_CHECKING:
    from .types import UPathStr

LocalPathClasses = (PosixUPath, WindowsUPath, LocalPath)

# also see https://gist.github.com/securifera/e7eed730cbe1ce43d0c29d7cd2d582f4
# ".gz" is not listed here as it typically occurs with another suffix
# the complete list is at lamindb.core.storage._suffixes
VALID_SIMPLE_SUFFIXES = {
    #
    # without readers
    #
    ".fasta",
    ".fastq",
    ".jpg",
    ".mtx",
    ".obo",
    ".pdf",
    ".png",
    ".tar",
    ".tiff",
    ".txt",
    ".tsv",
    ".zip",
    ".xml",
    #
    # with readers (see below)
    #
    ".h5ad",
    ".parquet",
    ".csv",
    ".fcs",
    ".xlsx",
    ".zarr",
    ".json",
}
# below gets updated within lamindb because it's frequently changing
VALID_COMPOSITE_SUFFIXES = {".anndata.zarr"}

TRAILING_SEP = (os.sep, os.altsep) if os.altsep is not None else os.sep


def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
    def process_digits(suffix: str):
        if suffix[1:].isdigit():  # [1:] skips the dot
            return ""  # digits are not valid suffixes
        else:
            return suffix

    if len(path.suffixes) <= 1:
        return process_digits(path.suffix)

    total_suffix = "".join(path.suffixes)
    if total_suffix in VALID_SIMPLE_SUFFIXES:
        return total_suffix
    elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
        # below seems slow but OK for now
        for suffix in VALID_COMPOSITE_SUFFIXES:
            if total_suffix.endswith(suffix):
                break
        return suffix
    else:
        print_hint = True
        arg_name = "file" if arg_name is None else arg_name  # for the warning
        msg = f"{arg_name} has more than one suffix (path.suffixes), "
        # first check the 2nd-to-last suffix because it might be followed by .gz
        # or another compression-related suffix
        # Alex thought about adding logic along the lines of path.suffixes[-1]
        # in COMPRESSION_SUFFIXES to detect something like .random.gz and then
        # add ".random.gz", but concluded it's too dangerous; it's safer to just
        # use ".gz" in such a case
        if path.suffixes[-2] in VALID_SIMPLE_SUFFIXES:
            suffix = "".join(path.suffixes[-2:])
            msg += f"inferring: '{suffix}'"
            # do not print a warning for things like .tar.gz, .fastq.gz
            if path.suffixes[-1] == ".gz":
                print_hint = False
        else:
            suffix = path.suffixes[-1]  # this is equivalent to path.suffix
            msg += (
                f"using only last suffix: '{suffix}' - if you want your composite"
                " suffix to be recognized add it to"
                " lamindb.core.storage.VALID_SIMPLE_SUFFIXES.add()"
            )
        if print_hint:
            logger.hint(msg)
        return process_digits(suffix)

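# A minimal usage sketch (not part of the package source); the outcomes
# follow directly from the suffix rules above:
assert extract_suffix_from_path(Path("matrix.mtx")) == ".mtx"  # single suffix
assert extract_suffix_from_path(Path("reads.fastq.gz")) == ".fastq.gz"  # known 2nd-to-last suffix kept
assert extract_suffix_from_path(Path("backup.2024")) == ""  # pure digits are not a suffix
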
def infer_filesystem(path: UPathStr):
    import fsspec  # improve cold start

    path_str = str(path)

    if isinstance(path, UPath):
        fs = path.fs
    else:
        protocol = fsspec.utils.get_protocol(path_str)
        if protocol == "s3":
            fs_kwargs = {"cache_regions": True}
        else:
            fs_kwargs = {}
        fs = fsspec.filesystem(protocol, **fs_kwargs)

    return fs, path_str

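# A minimal usage sketch (not part of the package source): the returned
# filesystem matches the protocol inferred from the path string.
fs, path_str = infer_filesystem("data/table.csv")  # plain local path
assert "file" in fs.protocol  # a LocalFileSystem instance
assert path_str == "data/table.csv"
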
# this is needed to avoid requiring the CreateBucket permission
class S3FSMap(fsspec.FSMap):
    def __setitem__(self, key, value):
        """Store value in key."""
        key = self._key_to_str(key)
        self.fs.pipe_file(key, fsspec.mapping.maybe_convert(value))


def create_mapper(
    fs,
    url="",
    check=False,
    create=False,
    missing_exceptions=None,
):
    if fsspec.utils.get_protocol(url) == "s3":
        return S3FSMap(
            url, fs, check=check, create=False, missing_exceptions=missing_exceptions
        )
    else:
        return fsspec.FSMap(
            url, fs, check=check, create=create, missing_exceptions=missing_exceptions
        )

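# A minimal usage sketch (not part of the package source): for s3:// URLs the
# mapper never attempts bucket creation; other protocols get a plain
# fsspec.FSMap with key-value semantics.
import tempfile

local_fs = fsspec.filesystem("file")
store = create_mapper(local_fs, url=tempfile.mkdtemp())  # temporary local dir
store["key"] = b"value"  # writes go through the mapper's __setitem__
assert store["key"] == b"value"
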
def print_hook(size: int, value: int, objectname: str, action: str):
    if size == 0:
        progress_in_percent = 100.0
    else:
        progress_in_percent = (value / size) * 100
    out = f"... {action} {objectname}: {min(progress_in_percent, 100):4.1f}%"
    if "NBPRJ_TEST_NBPATH" not in os.environ:
        end = "\n" if progress_in_percent >= 100 else "\r"
        print(out, end=end)


class ProgressCallback(fsspec.callbacks.Callback):
    def __init__(
        self,
        objectname: str,
        action: Literal["uploading", "downloading", "synchronizing"],
        adjust_size: bool = False,
    ):
        assert action in {"uploading", "downloading", "synchronizing"}

        super().__init__()

        self.action = action
        print_progress = partial(print_hook, objectname=objectname, action=action)
        self.hooks = {"print_progress": print_progress}

        self.adjust_size = adjust_size

    def absolute_update(self, value):
        pass

    def relative_update(self, inc=1):
        pass

    def update_relative_value(self, inc=1):
        self.value += inc
        self.call()

    def branch(self, path_1, path_2, kwargs):
        if self.adjust_size:
            if Path(path_2 if self.action != "uploading" else path_1).is_dir():
                self.size -= 1
        kwargs["callback"] = ChildProgressCallback(self)

    def branched(self, path_1, path_2, **kwargs):
        self.branch(path_1, path_2, kwargs)
        return kwargs["callback"]

    def wrap(self, iterable):
        if self.adjust_size:
            paths = []
            for lpath, rpath in iterable:
                paths.append((lpath, rpath))
                if Path(lpath).is_dir():
                    self.size -= 1
            self.adjust_size = False
            return paths
        else:
            return iterable

    @classmethod
    def requires_progress(
        cls,
        maybe_callback: fsspec.callbacks.Callback | None,
        print_progress: bool,
        objectname: str,
        action: Literal["uploading", "downloading", "synchronizing"],
        **kwargs,
    ):
        if maybe_callback is None:
            if print_progress:
                return cls(objectname, action, **kwargs)
            else:
                return fsspec.callbacks.NoOpCallback()
        return maybe_callback


class ChildProgressCallback(fsspec.callbacks.Callback):
    def __init__(self, parent: ProgressCallback):
        super().__init__()

        self.parent = parent

    def parent_update(self, inc=1):
        self.parent.update_relative_value(inc)

    def relative_update(self, inc=1):
        if self.size != 0:
            self.parent_update(inc / self.size)
        else:
            self.parent_update(1)

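# A minimal sketch (not part of the package source) of the parent/child
# progress arithmetic: a child forwards inc / size, so one fully transferred
# file contributes exactly 1.0 to the parent, whose size can simply be the
# number of files.
parent = ProgressCallback("example-folder", "downloading")
parent.set_size(2)  # two files overall
child = ChildProgressCallback(parent)
child.set_size(100)  # one file of 100 bytes
child.relative_update(50)  # parent.value -> 0.5
child.relative_update(50)  # parent.value -> 1.0, i.e. one of two files done
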
def download_to(self, local_path: UPathStr, print_progress: bool = True, **kwargs):
    """Download from self (a destination in the cloud) to the local path."""
    if "recursive" not in kwargs:
        kwargs["recursive"] = True
    if print_progress and "callback" not in kwargs:
        callback = ProgressCallback(
            PurePosixPath(local_path).name, "downloading", adjust_size=True
        )
        kwargs["callback"] = callback

    self.fs.download(str(self), str(local_path), **kwargs)


def upload_from(
    self,
    local_path: UPathStr,
    create_folder: bool | None = None,
    print_progress: bool = True,
    **kwargs,
) -> UPath:
    """Upload from the local path to `self` (a destination in the cloud).

    If the local path is a directory, recursively upload its contents.

    Args:
        local_path: A local path of a file or directory.
        create_folder: Only applies if `local_path` is a directory and then
            defaults to `True`. If `True`, make a new folder in the destination
            using the directory name of `local_path`. If `False`, upload the
            contents of the directory to the root-level of the destination.
        print_progress: Print progress.

    Returns:
        The destination path.
    """
    local_path = Path(local_path)
    local_path_is_dir = local_path.is_dir()
    if create_folder is None:
        create_folder = local_path_is_dir
    if create_folder and not local_path_is_dir:
        raise ValueError("create_folder can only be True if local_path is a directory")

    if print_progress and "callback" not in kwargs:
        callback = ProgressCallback(local_path.name, "uploading")
        kwargs["callback"] = callback

    if local_path_is_dir and not create_folder:
        source = [f for f in local_path.rglob("*") if f.is_file()]
        destination = [str(self / f.relative_to(local_path)) for f in source]
        source = [str(f) for f in source]  # type: ignore
    else:
        source = str(local_path)  # type: ignore
        destination = str(self)  # type: ignore

    # the lines below prevent s3fs from triggering create_bucket during upload
    # when dirs are present, which avoids a permission error
    # would be easier to just
    if self.protocol == "s3" and local_path_is_dir and create_folder:
        bucket = self._url.netloc
        if bucket not in self.fs.dircache:
            self.fs.dircache[bucket] = [{}]
            if not destination.endswith(TRAILING_SEP):  # type: ignore
                destination += "/"
            cleanup_cache = True
        else:
            cleanup_cache = False
    else:
        cleanup_cache = False

    self.fs.upload(source, destination, recursive=create_folder, **kwargs)

    if cleanup_cache:
        # normally this is invalidated after the upload but still better to check
        if bucket in self.fs.dircache:
            del self.fs.dircache[bucket]

    if local_path_is_dir and create_folder:
        return self / local_path.name
    else:
        return self

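# A minimal usage sketch (hypothetical bucket and folder names; the function
# is attached to UPath further below). Network calls are commented out so the
# sketch stays side-effect free:
destination = UPath("s3://my-bucket/datasets")
# destination.upload_from("./sample_001")  # -> s3://my-bucket/datasets/sample_001
# destination.upload_from("./sample_001", create_folder=False)  # contents land under datasets/
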
def synchronize(
    self,
    objectpath: Path,
    error_no_origin: bool = True,
    print_progress: bool = False,
    callback: fsspec.callbacks.Callback | None = None,
    timestamp: float | None = None,
):
    """Sync to a local destination path."""
    # optimize the number of network requests
    if timestamp is not None:
        is_dir = False
        exists = True
        cloud_mts = timestamp
    else:
        # perform only one network request to check existence, type and timestamp
        try:
            cloud_mts = self.modified.timestamp()
            is_dir = False
            exists = True
        except FileNotFoundError:
            exists = False
        except IsADirectoryError:
            is_dir = True
            exists = True

    if not exists:
        warn_or_error = f"The original path {self} does not exist anymore."
        if objectpath.exists():
            warn_or_error += (
                f"\nHowever, the local path {objectpath} still exists, you might want"
                " to re-upload the object."
            )
            logger.warning(warn_or_error)
        elif error_no_origin:
            warn_or_error += "\nIt is not possible to synchronize."
            raise FileNotFoundError(warn_or_error)
        return None

    # synchronization logic for directories
    if is_dir:
        files = self.fs.find(str(self), detail=True)
        protocol_modified = {"s3": "LastModified", "gs": "mtime"}
        modified_key = protocol_modified.get(self.protocol, None)
        if modified_key is None:
            raise ValueError(f"Can't synchronize a directory for {self.protocol}.")
        if objectpath.exists():
            destination_exists = True
            cloud_mts_max = max(
                file[modified_key] for file in files.values()
            ).timestamp()
            local_mts = [
                file.stat().st_mtime for file in objectpath.rglob("*") if file.is_file()
            ]
            n_local_files = len(local_mts)
            local_mts_max = max(local_mts)
            if local_mts_max == cloud_mts_max:
                need_synchronize = n_local_files != len(files)
            elif local_mts_max > cloud_mts_max:
                need_synchronize = False
            else:
                need_synchronize = True
        else:
            destination_exists = False
            need_synchronize = True
        if need_synchronize:
            callback = ProgressCallback.requires_progress(
                callback, print_progress, objectpath.name, "synchronizing"
            )
            callback.set_size(len(files))
            origin_file_keys = []
            for file, stat in callback.wrap(files.items()):
                file_key = PurePosixPath(file).relative_to(self.path)
                origin_file_keys.append(file_key.as_posix())
                timestamp = stat[modified_key].timestamp()

                origin = f"{self.protocol}://{file}"
                destination = objectpath / file_key
                child = callback.branched(origin, destination.as_posix())
                UPath(origin, **self.storage_options).synchronize(
                    destination, callback=child, timestamp=timestamp
                )
                child.close()
            if destination_exists:
                local_files = [file for file in objectpath.rglob("*") if file.is_file()]
                if len(local_files) > len(files):
                    for file in local_files:
                        if (
                            file.relative_to(objectpath).as_posix()
                            not in origin_file_keys
                        ):
                            file.unlink()
                            parent = file.parent
                            if next(parent.iterdir(), None) is None:
                                parent.rmdir()
        return None

    # synchronization logic for files
    callback = ProgressCallback.requires_progress(
        callback, print_progress, objectpath.name, "synchronizing"
    )
    if objectpath.exists():
        local_mts_obj = objectpath.stat().st_mtime  # type: ignore
        need_synchronize = cloud_mts > local_mts_obj
    else:
        objectpath.parent.mkdir(parents=True, exist_ok=True)
        need_synchronize = True
    if need_synchronize:
        self.download_to(
            objectpath, recursive=False, print_progress=False, callback=callback
        )
        os.utime(objectpath, times=(cloud_mts, cloud_mts))
    else:
        # nothing happens if parent_update is not defined
        # because of Callback.no_op
        callback.parent_update()

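# A minimal usage sketch (hypothetical paths; the function is attached to
# UPath further below). The call downloads only when the cloud copy is newer
# than the local one; for directories it also deletes local files that no
# longer exist at the origin:
# UPath("s3://my-bucket/data.parquet").synchronize(Path("cache/data.parquet"))
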
def modified(self) -> datetime | None:
    """Return modified time stamp."""
    mtime = self.fs.modified(str(self))
    if mtime.tzinfo is None:
        mtime = mtime.replace(tzinfo=timezone.utc)
    return mtime.astimezone().replace(tzinfo=None)

def compute_file_tree(
    path: Path,
    *,
    level: int = -1,
    only_dirs: bool = False,
    n_max_files_per_dir_and_type: int = 100,
    n_max_files: int = 1000,
    include_paths: set[Any] | None = None,
    skip_suffixes: list[str] | None = None,
) -> tuple[str, int]:
    space = "    "
    branch = "│   "
    tee = "├── "
    last = "└── "
    if skip_suffixes is None:
        skip_suffixes_tuple = ()
    else:
        skip_suffixes_tuple = tuple(skip_suffixes)  # type: ignore
    n_objects = 0
    n_directories = 0

    # by default only including registered files
    # need a flag and a proper implementation
    suffixes = set()
    include_dirs = set()
    if include_paths is not None:
        include_dirs = {d for p in include_paths for d in p.parents}
    else:
        include_paths = set()

    def inner(dir_path: Path, prefix: str = "", level: int = -1):
        nonlocal n_objects, n_directories, suffixes
        if level == 0:
            return
        stripped_dir_path = dir_path.as_posix().rstrip("/")
        # do not iterate through zarr directories
        if stripped_dir_path.endswith(skip_suffixes_tuple):
            return
        # this is needed so that the passed folder is not listed
        contents = [
            i
            for i in dir_path.iterdir()
            if i.as_posix().rstrip("/") != stripped_dir_path
        ]
        if only_dirs:
            contents = [d for d in contents if d.is_dir()]
        pointers = [tee] * (len(contents) - 1) + [last]
        n_files_per_dir_and_type = defaultdict(lambda: 0)  # type: ignore
        # TODO: pass strict=False to zip with python > 3.9
        for pointer, child_path in zip(pointers, contents):  # type: ignore
            if child_path.is_dir():
                if include_dirs and child_path not in include_dirs:
                    continue
                yield prefix + pointer + child_path.name + "/"
                n_directories += 1
                n_files_per_dir_and_type = defaultdict(lambda: 0)
                extension = branch if pointer == tee else space
                yield from inner(child_path, prefix=prefix + extension, level=level - 1)
            elif not only_dirs:
                if include_paths and child_path not in include_paths:
                    continue
                suffix = extract_suffix_from_path(child_path)
                suffixes.add(suffix)
                n_files_per_dir_and_type[suffix] += 1
                n_objects += 1
                if n_files_per_dir_and_type[suffix] == n_max_files_per_dir_and_type:
                    yield prefix + "..."
                elif n_files_per_dir_and_type[suffix] > n_max_files_per_dir_and_type:
                    continue
                else:
                    yield prefix + pointer + child_path.name

    folder_tree = ""
    iterator = inner(path, level=level)
    for line in islice(iterator, n_max_files):
        folder_tree += f"\n{line}"
    if next(iterator, None):
        folder_tree += f"\n... only showing {n_max_files} out of {n_objects} files"
    directory_info = "directory" if n_directories == 1 else "directories"
    display_suffixes = ", ".join([f"{suffix!r}" for suffix in suffixes])
    suffix_message = f" with suffixes {display_suffixes}" if n_objects > 0 else ""
    message = (
        f"{n_directories} sub-{directory_info} &"
        f" {n_objects} files{suffix_message}\n{path.resolve()}{folder_tree}"
    )
    return message, n_objects

# adapted from: https://stackoverflow.com/questions/9727673
def view_tree(
    path: Path,
    *,
    level: int = 2,
    only_dirs: bool = False,
    n_max_files_per_dir_and_type: int = 100,
    n_max_files: int = 1000,
    include_paths: set[Any] | None = None,
    skip_suffixes: list[str] | None = None,
) -> None:
    """Print a visual tree structure of files & directories.

    Args:
        level: If `1`, only iterate through one level, if `2` iterate through 2
            levels, if `-1` iterate through entire hierarchy.
        only_dirs: Only iterate through directories.
        n_max_files: Display limit. Will only show this many files. Doesn't affect count.
        include_paths: Restrict to these paths.
        skip_suffixes: Skip directories with these suffixes.

    Examples:
        >>> dir_path = ln.core.datasets.generate_cell_ranger_files(
        >>>     "sample_001", ln.settings.storage
        >>> )
        >>> ln.UPath(dir_path).view_tree()
        3 subdirectories, 15 files
        sample_001
        ├── web_summary.html
        ├── metrics_summary.csv
        ├── molecule_info.h5
        ├── filtered_feature_bc_matrix
        │   ├── features.tsv.gz
        │   ├── barcodes.tsv.gz
        │   └── matrix.mtx.gz
        ├── analysis
        │   └── analysis.csv
        ├── raw_feature_bc_matrix
        │   ├── features.tsv.gz
        │   ├── barcodes.tsv.gz
        │   └── matrix.mtx.gz
        ├── possorted_genome_bam.bam.bai
        ├── cloupe.cloupe
        ├── possorted_genome_bam.bam
        ├── filtered_feature_bc_matrix.h5
        └── raw_feature_bc_matrix.h5
    """
    message, _ = compute_file_tree(
        path,
        level=level,
        only_dirs=only_dirs,
        n_max_files=n_max_files,
        n_max_files_per_dir_and_type=n_max_files_per_dir_and_type,
        include_paths=include_paths,
        skip_suffixes=skip_suffixes,
    )
    logger.print(message)

def to_url(upath):
    """Public storage URL.

    Generates a public URL for an object in an S3 bucket using fsspec's UPath,
    considering the bucket's region.

    Args:
        upath: A UPath object representing an S3 path.

    Returns:
        A string containing the public URL to the S3 object.
    """
    if upath.protocol != "s3":
        raise ValueError("The provided UPath must be an S3 path.")
    key = "/".join(upath.parts[1:])
    bucket = upath._url.netloc
    if bucket == "scverse-spatial-eu-central-1":
        region = "eu-central-1"
    elif f"s3://{bucket}" not in HOSTED_BUCKETS:
        response = upath.fs.call_s3("head_bucket", Bucket=upath._url.netloc)
        headers = response["ResponseMetadata"]["HTTPHeaders"]
        region = headers.get("x-amz-bucket-region")
    else:
        region = bucket.replace("lamin_", "")
    if region == "us-east-1":
        return f"https://{bucket}.s3.amazonaws.com/{key}"
    else:
        return f"https://{bucket}.s3-{region}.amazonaws.com/{key}"

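# A minimal sketch (hypothetical bucket) of the URL forms produced above; the
# head_bucket lookup needs network access, so the call is shown commented out:
# to_url(UPath("s3://my-bucket/folder/file.txt"))
# -> "https://my-bucket.s3-<region>.amazonaws.com/folder/file.txt"
# -> "https://my-bucket.s3.amazonaws.com/folder/file.txt" for us-east-1
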
# Why aren't we subclassing?
#
# The problem is that UPath defines a type system of paths.
# Its __new__ method returns instances of different subclasses rather than a
# UPath object.
# If we create a custom subclass naively, subclasses of the parent UPath won't
# be subclasses of our custom subclass.
# This makes life really hard in type checks involving local-to-cloud
# comparisons, etc.
# Hence, we extend the existing UPath and amend the docs.
# Some of this might end up in the original UPath implementation over time,
# we'll see.


# add custom functions
UPath.modified = property(modified)
UPath.synchronize = synchronize
UPath.upload_from = upload_from
UPath.to_url = to_url
UPath.download_to = download_to
UPath.view_tree = view_tree
# unfortunately, we also have to do this for the subclasses
Path.view_tree = view_tree  # type: ignore

UPath.glob.__doc__ = Path.glob.__doc__
UPath.rglob.__doc__ = Path.rglob.__doc__
UPath.stat.__doc__ = Path.stat.__doc__
UPath.iterdir.__doc__ = Path.iterdir.__doc__
UPath.resolve.__doc__ = Path.resolve.__doc__
UPath.relative_to.__doc__ = Path.relative_to.__doc__
UPath.exists.__doc__ = Path.exists.__doc__
UPath.is_dir.__doc__ = Path.is_dir.__doc__
UPath.is_file.__doc__ = Path.is_file.__doc__
UPath.unlink.__doc__ = Path.unlink.__doc__
UPath.rename.__doc__ = """Move file, see fsspec.AbstractFileSystem.mv.

>>> upath = UPath("s3://my-bucket/my-file")
>>> upath.rename(UPath("s3://my-bucket/my-file-renamed"))
>>> upath.rename("my-file-renamed")

>>> upath = UPath("local-folder/my-file")
>>> upath.rename("local-folder/my-file-renamed")
"""
UPath.__doc__ = """Paths: low-level key-value access to files/objects.

Paths are based on keys that offer the typical access patterns of file systems
and object stores.

>>> upath = UPath("s3://my-bucket/my-folder")
>>> upath.exists()

Args:
    pathlike: A string or Path to a local/cloud file/directory/folder.
"""

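# A minimal sketch (hypothetical paths) of the helpers patched in above; calls
# that would hit the network are commented out:
p = UPath("s3://my-bucket/my-folder")  # hypothetical bucket
# p.view_tree()                 # added via UPath.view_tree
# p.download_to("./my-folder")  # added via UPath.download_to
# print(p.modified)             # now a property, not a method
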
def create_path(path: UPath, access_token: str | None = None) -> UPath:
    path = UPath(path)
    # test whether we have an AWS S3 path
    if not isinstance(path, S3Path):
        return path
    return get_aws_credentials_manager().enrich_path(path, access_token)


def get_stat_file_cloud(stat: dict) -> tuple[int, str, str]:
    size = stat["size"]
    etag = stat["ETag"]
    # small files
    if "-" not in etag:
        # only store the hash for non-multipart uploads
        # we can't rapidly validate multipart-uploaded files client-side
        # we can add more logic later down the road
        hash = b16_to_b64(etag)
        hash_type = "md5"
    else:
        stripped_etag, suffix = etag.split("-")
        suffix = suffix.strip('"')
        hash = b16_to_b64(stripped_etag)
        hash_type = f"md5-{suffix}"  # this is the S3 chunk-hashing strategy
    return size, hash[:HASH_LENGTH], hash_type

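# A worked sketch (hypothetical values): a single-part S3 upload carries a
# plain MD5 hex digest as its ETag; a multipart upload appends the part count.
size, digest, hash_type = get_stat_file_cloud(
    {"size": 0, "ETag": '"d41d8cd98f00b204e9800998ecf8427e"'}  # single part
)
assert hash_type == "md5"
size, digest, hash_type = get_stat_file_cloud(
    {"size": 10_000_000, "ETag": '"0cc175b9c0f1b6a831c399e269772661-2"'}  # 2 parts
)
assert hash_type == "md5-2"
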
def get_stat_dir_cloud(path: UPath) -> tuple[int, str, str, int]:
    sizes = []
    md5s = []
    objects = path.fs.find(path.as_posix(), detail=True)
    if path.protocol == "s3":
        accessor = "ETag"
    elif path.protocol == "gs":
        accessor = "md5Hash"
    for object in objects.values():
        sizes.append(object["size"])
        md5s.append(object[accessor].strip('"='))
    size = sum(sizes)
    hash, hash_type = hash_md5s_from_dir(md5s)
    n_objects = len(md5s)
    return size, hash, hash_type, n_objects


class InstanceNotEmpty(Exception):
    pass


# is as fast as boto3: https://lamin.ai/laminlabs/lamindata/transform/krGp3hT1f78N5zKv
def check_storage_is_empty(
    root: UPathStr, *, raise_error: bool = True, account_for_sqlite_file: bool = False
) -> int:
    root_upath = UPath(root)
    root_string = root_upath.as_posix()  # type: ignore
    # during storage initialization, we touch a 0-byte file
    # ({storage_root}/.lamindb/_is_initialized) in the root of a hosted storage
    # location because path.fs.find raises a PermissionError on empty hosted
    # subdirectories (see lamindb_setup/core/_settings_storage/init_storage)
    n_offset_objects = 1  # because of the touched dummy file, see mark_storage_root()
    if root_string.startswith(HOSTED_BUCKETS):
        # in hosted buckets, count across the entire root
        directory_string = root_string
        # the SQLite file is not in the ".lamindb" directory
        if account_for_sqlite_file:
            n_offset_objects += 1  # because of the SQLite file
    else:
        # in any other storage location, only count in .lamindb
        if not root_string.endswith("/"):
            root_string += "/"
        directory_string = root_string + ".lamindb"
    objects = root_upath.fs.find(directory_string)
    n_objects = len(objects)
    n_diff = n_objects - n_offset_objects
    ask_for_deletion = (
        "delete them prior to deleting the instance"
        if raise_error
        else "consider deleting them"
    )
    hint = "'_is_initialized'"
    if n_offset_objects == 2:
        hint += " & SQLite file"
    hint += " ignored"
    message = (
        f"Storage {directory_string} contains {n_objects - n_offset_objects} objects "
        f"({hint}) - {ask_for_deletion}"
    )
    if n_diff > 0:
        if raise_error:
            raise InstanceNotEmpty(message)
        else:
            logger.warning(message)
    return n_diff
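
# A minimal usage sketch (hypothetical root): returns how many objects exist
# beyond the expected offset; with raise_error=False it warns instead of
# raising InstanceNotEmpty.
# n_extra = check_storage_is_empty("s3://my-bucket/my-instance", raise_error=False)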