lamindb_setup 0.77.3__py2.py3-none-any.whl → 0.77.5__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +1 -1
- lamindb_setup/_cache.py +34 -34
- lamindb_setup/_check.py +7 -7
- lamindb_setup/_check_setup.py +79 -79
- lamindb_setup/_close.py +35 -35
- lamindb_setup/_connect_instance.py +431 -444
- lamindb_setup/_delete.py +141 -139
- lamindb_setup/_django.py +41 -41
- lamindb_setup/_entry_points.py +22 -22
- lamindb_setup/_exportdb.py +68 -68
- lamindb_setup/_importdb.py +50 -50
- lamindb_setup/_init_instance.py +417 -374
- lamindb_setup/_migrate.py +239 -239
- lamindb_setup/_register_instance.py +36 -36
- lamindb_setup/_schema.py +27 -27
- lamindb_setup/_schema_metadata.py +411 -411
- lamindb_setup/_set_managed_storage.py +55 -55
- lamindb_setup/_setup_user.py +137 -137
- lamindb_setup/_silence_loggers.py +44 -44
- lamindb_setup/core/__init__.py +21 -21
- lamindb_setup/core/_aws_credentials.py +151 -151
- lamindb_setup/core/_aws_storage.py +48 -48
- lamindb_setup/core/_deprecated.py +55 -55
- lamindb_setup/core/_docs.py +14 -14
- lamindb_setup/core/_hub_core.py +611 -590
- lamindb_setup/core/_hub_crud.py +211 -211
- lamindb_setup/core/_hub_utils.py +109 -109
- lamindb_setup/core/_private_django_api.py +88 -88
- lamindb_setup/core/_settings.py +138 -138
- lamindb_setup/core/_settings_instance.py +480 -467
- lamindb_setup/core/_settings_load.py +105 -105
- lamindb_setup/core/_settings_save.py +81 -81
- lamindb_setup/core/_settings_storage.py +412 -405
- lamindb_setup/core/_settings_store.py +75 -75
- lamindb_setup/core/_settings_user.py +53 -53
- lamindb_setup/core/_setup_bionty_sources.py +101 -101
- lamindb_setup/core/cloud_sqlite_locker.py +237 -232
- lamindb_setup/core/django.py +114 -114
- lamindb_setup/core/exceptions.py +12 -12
- lamindb_setup/core/hashing.py +114 -114
- lamindb_setup/core/types.py +19 -19
- lamindb_setup/core/upath.py +779 -779
- {lamindb_setup-0.77.3.dist-info → lamindb_setup-0.77.5.dist-info}/METADATA +1 -1
- lamindb_setup-0.77.5.dist-info/RECORD +47 -0
- {lamindb_setup-0.77.3.dist-info → lamindb_setup-0.77.5.dist-info}/WHEEL +1 -1
- lamindb_setup-0.77.3.dist-info/RECORD +0 -47
- {lamindb_setup-0.77.3.dist-info → lamindb_setup-0.77.5.dist-info}/LICENSE +0 -0
lamindb_setup/core/upath.py
CHANGED
@@ -1,779 +1,779 @@
(The removed and re-added lines of this hunk are identical, so the file content is shown once below.)

# we are not documenting UPath here because it's documented at lamindb.UPath
"""Paths & file systems."""

from __future__ import annotations

import os
from collections import defaultdict
from datetime import datetime, timezone
from functools import partial
from itertools import islice
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING, Any, Literal

import fsspec
from lamin_utils import logger
from upath import UPath
from upath.implementations.cloud import CloudPath, S3Path  # keep CloudPath!
from upath.implementations.local import LocalPath, PosixUPath, WindowsUPath

from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
from .hashing import HASH_LENGTH, b16_to_b64, hash_md5s_from_dir

if TYPE_CHECKING:
    from .types import UPathStr

LocalPathClasses = (PosixUPath, WindowsUPath, LocalPath)

# also see https://gist.github.com/securifera/e7eed730cbe1ce43d0c29d7cd2d582f4
# ".gz" is not listed here as it typically occurs with another suffix
# the complete list is at lamindb.core.storage._suffixes
VALID_SIMPLE_SUFFIXES = {
    #
    # without readers
    #
    ".fasta",
    ".fastq",
    ".jpg",
    ".mtx",
    ".obo",
    ".pdf",
    ".png",
    ".tar",
    ".tiff",
    ".txt",
    ".tsv",
    ".zip",
    ".xml",
    #
    # with readers (see below)
    #
    ".h5ad",
    ".parquet",
    ".csv",
    ".fcs",
    ".xslx",
    ".zarr",
    ".json",
}
# below gets updated within lamindb because it's frequently changing
VALID_COMPOSITE_SUFFIXES = {".anndata.zarr"}

TRAILING_SEP = (os.sep, os.altsep) if os.altsep is not None else os.sep


def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
    def process_digits(suffix: str):
        if suffix[1:].isdigit():  # :1 to skip the dot
            return ""  # digits are no valid suffixes
        else:
            return suffix

    if len(path.suffixes) <= 1:
        return process_digits(path.suffix)

    total_suffix = "".join(path.suffixes)
    if total_suffix in VALID_SIMPLE_SUFFIXES:
        return total_suffix
    elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
        # below seems slow but OK for now
        for suffix in VALID_COMPOSITE_SUFFIXES:
            if total_suffix.endswith(suffix):
                break
        return suffix
    else:
        print_hint = True
        arg_name = "file" if arg_name is None else arg_name  # for the warning
        msg = f"{arg_name} has more than one suffix (path.suffixes), "
        # first check the 2nd-to-last suffix because it might be followed by .gz
        # or another compression-related suffix
        # Alex thought about adding logic along the lines of path.suffixes[-1]
        # in COMPRESSION_SUFFIXES to detect something like .random.gz and then
        # add ".random.gz" but concluded it's too dangerous it's safer to just
        # use ".gz" in such a case
        if path.suffixes[-2] in VALID_SIMPLE_SUFFIXES:
            suffix = "".join(path.suffixes[-2:])
            msg += f"inferring: '{suffix}'"
            # do not print a warning for things like .tar.gz, .fastq.gz
            if path.suffixes[-1] == ".gz":
                print_hint = False
        else:
            suffix = path.suffixes[-1]  # this is equivalent to path.suffix
            msg += (
                f"using only last suffix: '{suffix}' - if you want your composite"
                " suffix to be recognized add it to"
                " lamindb.core.storage.VALID_SIMPLE_SUFFIXES.add()"
            )
        if print_hint:
            logger.hint(msg)
        return process_digits(suffix)


def infer_filesystem(path: UPathStr):
    import fsspec  # improve cold start

    path_str = str(path)

    if isinstance(path, UPath):
        fs = path.fs
    else:
        protocol = fsspec.utils.get_protocol(path_str)
        if protocol == "s3":
            fs_kwargs = {"cache_regions": True}
        else:
            fs_kwargs = {}
        fs = fsspec.filesystem(protocol, **fs_kwargs)

    return fs, path_str


# this is needed to avoid CreateBucket permission
class S3FSMap(fsspec.FSMap):
    def __setitem__(self, key, value):
        """Store value in key."""
        key = self._key_to_str(key)
        self.fs.pipe_file(key, fsspec.mapping.maybe_convert(value))


def create_mapper(
    fs,
    url="",
    check=False,
    create=False,
    missing_exceptions=None,
):
    if fsspec.utils.get_protocol(url) == "s3":
        return S3FSMap(
            url, fs, check=check, create=False, missing_exceptions=missing_exceptions
        )
    else:
        return fsspec.FSMap(
            url, fs, check=check, create=create, missing_exceptions=missing_exceptions
        )


def print_hook(size: int, value: int, objectname: str, action: str):
    if size == 0:
        progress_in_percent = 100.0
    else:
        progress_in_percent = (value / size) * 100
    out = f"... {action} {objectname}:" f" {min(progress_in_percent, 100):4.1f}%"
    if "NBPRJ_TEST_NBPATH" not in os.environ:
        end = "\n" if progress_in_percent >= 100 else "\r"
        print(out, end=end)


class ProgressCallback(fsspec.callbacks.Callback):
    def __init__(
        self,
        objectname: str,
        action: Literal["uploading", "downloading", "synchronizing"],
        adjust_size: bool = False,
    ):
        assert action in {"uploading", "downloading", "synchronizing"}

        super().__init__()

        self.action = action
        print_progress = partial(print_hook, objectname=objectname, action=action)
        self.hooks = {"print_progress": print_progress}

        self.adjust_size = adjust_size

    def absolute_update(self, value):
        pass

    def relative_update(self, inc=1):
        pass

    def update_relative_value(self, inc=1):
        self.value += inc
        self.call()

    def branch(self, path_1, path_2, kwargs):
        if self.adjust_size:
            if Path(path_2 if self.action != "uploading" else path_1).is_dir():
                self.size -= 1
        kwargs["callback"] = ChildProgressCallback(self)

    def branched(self, path_1, path_2, **kwargs):
        self.branch(path_1, path_2, kwargs)
        return kwargs["callback"]

    def wrap(self, iterable):
        if self.adjust_size:
            paths = []
            for lpath, rpath in iterable:
                paths.append((lpath, rpath))
                if Path(lpath).is_dir():
                    self.size -= 1
            self.adjust_size = False
            return paths
        else:
            return iterable

    @classmethod
    def requires_progress(
        cls,
        maybe_callback: fsspec.callbacks.Callback | None,
        print_progress: bool,
        objectname: str,
        action: Literal["uploading", "downloading", "synchronizing"],
        **kwargs,
    ):
        if maybe_callback is None:
            if print_progress:
                return cls(objectname, action, **kwargs)
            else:
                return fsspec.callbacks.NoOpCallback()
        return maybe_callback


class ChildProgressCallback(fsspec.callbacks.Callback):
    def __init__(self, parent: ProgressCallback):
        super().__init__()

        self.parent = parent

    def parent_update(self, inc=1):
        self.parent.update_relative_value(inc)

    def relative_update(self, inc=1):
        if self.size != 0:
            self.parent_update(inc / self.size)
        else:
            self.parent_update(1)


def download_to(self, local_path: UPathStr, print_progress: bool = True, **kwargs):
    """Download from self (a destination in the cloud) to the local path."""
    if "recursive" not in kwargs:
        kwargs["recursive"] = True
    if print_progress and "callback" not in kwargs:
        callback = ProgressCallback(
            PurePosixPath(local_path).name, "downloading", adjust_size=True
        )
        kwargs["callback"] = callback

    self.fs.download(str(self), str(local_path), **kwargs)


def upload_from(
    self,
    local_path: UPathStr,
    create_folder: bool | None = None,
    print_progress: bool = True,
    **kwargs,
) -> UPath:
    """Upload from the local path to `self` (a destination in the cloud).

    If the local path is a directory, recursively upload its contents.

    Args:
        local_path: A local path of a file or directory.
        create_folder: Only applies if `local_path` is a directory and then
            defaults to `True`. If `True`, make a new folder in the destination
            using the directory name of `local_path`. If `False`, upload the
            contents of the directory to to the root-level of the destination.
        print_progress: Print progress.

    Returns:
        The destination path.
    """
    local_path = Path(local_path)
    local_path_is_dir = local_path.is_dir()
    if create_folder is None:
        create_folder = local_path_is_dir
    if create_folder and not local_path_is_dir:
        raise ValueError("create_folder can only be True if local_path is a directory")

    if print_progress and "callback" not in kwargs:
        callback = ProgressCallback(local_path.name, "uploading")
        kwargs["callback"] = callback

    if local_path_is_dir and not create_folder:
        source = [f for f in local_path.rglob("*") if f.is_file()]
        destination = [str(self / f.relative_to(local_path)) for f in source]
        source = [str(f) for f in source]  # type: ignore
    else:
        source = str(local_path)  # type: ignore
        destination = str(self)  # type: ignore

    # the below lines are to avoid s3fs triggering create_bucket in upload if
    # dirs are present it allows to avoid permission error
    # would be easier to just
    if self.protocol == "s3" and local_path_is_dir and create_folder:
        bucket = self._url.netloc
        if bucket not in self.fs.dircache:
            self.fs.dircache[bucket] = [{}]
            if not destination.endswith(TRAILING_SEP):  # type: ignore
                destination += "/"
            cleanup_cache = True
        else:
            cleanup_cache = False
    else:
        cleanup_cache = False

    self.fs.upload(source, destination, recursive=create_folder, **kwargs)

    if cleanup_cache:
        # normally this is invalidated after the upload but still better to check
        if bucket in self.fs.dircache:
            del self.fs.dircache[bucket]

    if local_path_is_dir and create_folder:
        return self / local_path.name
    else:
        return self


def synchronize(
    self,
    objectpath: Path,
    error_no_origin: bool = True,
    print_progress: bool = False,
    callback: fsspec.callbacks.Callback | None = None,
    timestamp: float | None = None,
):
    """Sync to a local destination path."""
    # optimize the number of network requests
    if timestamp is not None:
        is_dir = False
        exists = True
        cloud_mts = timestamp
    else:
        # perform only one network request to check existence, type and timestamp
        try:
            cloud_mts = self.modified.timestamp()
            is_dir = False
            exists = True
        except FileNotFoundError:
            exists = False
        except IsADirectoryError:
            is_dir = True
            exists = True

    if not exists:
        warn_or_error = f"The original path {self} does not exist anymore."
        if objectpath.exists():
            warn_or_error += (
                f"\nHowever, the local path {objectpath} still exists, you might want"
                " to reupload the object back."
            )
            logger.warning(warn_or_error)
        elif error_no_origin:
            warn_or_error += "\nIt is not possible to synchronize."
            raise FileNotFoundError(warn_or_error)
        return None

    # synchronization logic for directories
    if is_dir:
        files = self.fs.find(str(self), detail=True)
        protocol_modified = {"s3": "LastModified", "gs": "mtime"}
        modified_key = protocol_modified.get(self.protocol, None)
        if modified_key is None:
            raise ValueError(f"Can't synchronize a directory for {self.protocol}.")
        if objectpath.exists():
            destination_exists = True
            cloud_mts_max = max(
                file[modified_key] for file in files.values()
            ).timestamp()
            local_mts = [
                file.stat().st_mtime for file in objectpath.rglob("*") if file.is_file()
            ]
            n_local_files = len(local_mts)
            local_mts_max = max(local_mts)
            if local_mts_max == cloud_mts_max:
                need_synchronize = n_local_files != len(files)
            elif local_mts_max > cloud_mts_max:
                need_synchronize = False
            else:
                need_synchronize = True
        else:
            destination_exists = False
            need_synchronize = True
        if need_synchronize:
            callback = ProgressCallback.requires_progress(
                callback, print_progress, objectpath.name, "synchronizing"
            )
            callback.set_size(len(files))
            origin_file_keys = []
            for file, stat in callback.wrap(files.items()):
                file_key = PurePosixPath(file).relative_to(self.path)
                origin_file_keys.append(file_key.as_posix())
                timestamp = stat[modified_key].timestamp()

                origin = f"{self.protocol}://{file}"
                destination = objectpath / file_key
                child = callback.branched(origin, destination.as_posix())
                UPath(origin, **self.storage_options).synchronize(
                    destination, callback=child, timestamp=timestamp
                )
                child.close()
            if destination_exists:
                local_files = [file for file in objectpath.rglob("*") if file.is_file()]
                if len(local_files) > len(files):
                    for file in local_files:
                        if (
                            file.relative_to(objectpath).as_posix()
                            not in origin_file_keys
                        ):
                            file.unlink()
                            parent = file.parent
                            if next(parent.iterdir(), None) is None:
                                parent.rmdir()
        return None

    # synchronization logic for files
    callback = ProgressCallback.requires_progress(
        callback, print_progress, objectpath.name, "synchronizing"
    )
    if objectpath.exists():
        local_mts_obj = objectpath.stat().st_mtime  # type: ignore
        need_synchronize = cloud_mts > local_mts_obj
    else:
        objectpath.parent.mkdir(parents=True, exist_ok=True)
        need_synchronize = True
    if need_synchronize:
        self.download_to(
            objectpath, recursive=False, print_progress=False, callback=callback
        )
        os.utime(objectpath, times=(cloud_mts, cloud_mts))
    else:
        # nothing happens if parent_update is not defined
        # because of Callback.no_op
        callback.parent_update()


def modified(self) -> datetime | None:
    """Return modified time stamp."""
    mtime = self.fs.modified(str(self))
    if mtime.tzinfo is None:
        mtime = mtime.replace(tzinfo=timezone.utc)
    return mtime.astimezone().replace(tzinfo=None)


def compute_file_tree(
    path: Path,
    *,
    level: int = -1,
    only_dirs: bool = False,
    n_max_files_per_dir_and_type: int = 100,
    n_max_files: int = 1000,
    include_paths: set[Any] | None = None,
    skip_suffixes: list[str] | None = None,
) -> tuple[str, int]:
    space = " "
    branch = "│ "
    tee = "├── "
    last = "└── "
    if skip_suffixes is None:
        skip_suffixes_tuple = ()
    else:
        skip_suffixes_tuple = tuple(skip_suffixes)  # type: ignore
    n_objects = 0
    n_directories = 0

    # by default only including registered files
    # need a flag and a proper implementation
    suffixes = set()
    include_dirs = set()
    if include_paths is not None:
        include_dirs = {d for p in include_paths for d in p.parents}
    else:
        include_paths = set()

    def inner(dir_path: Path, prefix: str = "", level: int = -1):
        nonlocal n_objects, n_directories, suffixes
        if level == 0:
            return
        stripped_dir_path = dir_path.as_posix().rstrip("/")
        # do not iterate through zarr directories
        if stripped_dir_path.endswith(skip_suffixes_tuple):
            return
        # this is needed so that the passed folder is not listed
        contents = [
            i
            for i in dir_path.iterdir()
            if i.as_posix().rstrip("/") != stripped_dir_path
        ]
        if only_dirs:
            contents = [d for d in contents if d.is_dir()]
        pointers = [tee] * (len(contents) - 1) + [last]
        n_files_per_dir_and_type = defaultdict(lambda: 0)  # type: ignore
        # TODO: pass strict=False to zip with python > 3.9
        for pointer, child_path in zip(pointers, contents):  # type: ignore
            if child_path.is_dir():
                if include_dirs and child_path not in include_dirs:
                    continue
                yield prefix + pointer + child_path.name + "/"
                n_directories += 1
                n_files_per_dir_and_type = defaultdict(lambda: 0)
                extension = branch if pointer == tee else space
                yield from inner(child_path, prefix=prefix + extension, level=level - 1)
            elif not only_dirs:
                if include_paths and child_path not in include_paths:
                    continue
                suffix = extract_suffix_from_path(child_path)
                suffixes.add(suffix)
                n_files_per_dir_and_type[suffix] += 1
                n_objects += 1
                if n_files_per_dir_and_type[suffix] == n_max_files_per_dir_and_type:
                    yield prefix + "..."
                elif n_files_per_dir_and_type[suffix] > n_max_files_per_dir_and_type:
                    continue
                else:
                    yield prefix + pointer + child_path.name

    folder_tree = ""
    iterator = inner(path, level=level)
    for line in islice(iterator, n_max_files):
        folder_tree += f"\n{line}"
    if next(iterator, None):
        folder_tree += f"\n... only showing {n_max_files} out of {n_objects} files"
    directory_info = "directory" if n_directories == 1 else "directories"
    display_suffixes = ", ".join([f"{suffix!r}" for suffix in suffixes])
    suffix_message = f" with suffixes {display_suffixes}" if n_objects > 0 else ""
    message = (
        f"{n_directories} sub-{directory_info} &"
        f" {n_objects} files{suffix_message}\n{path.resolve()}{folder_tree}"
    )
    return message, n_objects


# adapted from: https://stackoverflow.com/questions/9727673
def view_tree(
    path: Path,
    *,
    level: int = 2,
    only_dirs: bool = False,
    n_max_files_per_dir_and_type: int = 100,
    n_max_files: int = 1000,
    include_paths: set[Any] | None = None,
    skip_suffixes: list[str] | None = None,
) -> None:
    """Print a visual tree structure of files & directories.

    Args:
        level: If `1`, only iterate through one level, if `2` iterate through 2
            levels, if `-1` iterate through entire hierarchy.
        only_dirs: Only iterate through directories.
        n_max_files: Display limit. Will only show this many files. Doesn't affect count.
        include_paths: Restrict to these paths.
        skip_suffixes: Skip directories with these suffixes.

    Examples:
        >>> dir_path = ln.core.datasets.generate_cell_ranger_files(
        >>>     "sample_001", ln.settings.storage
        >>> )
        >>> ln.UPath(dir_path).view_tree()
        3 subdirectories, 15 files
        sample_001
        ├── web_summary.html
        ├── metrics_summary.csv
        ├── molecule_info.h5
        ├── filtered_feature_bc_matrix
        │ ├── features.tsv.gz
        │ ├── barcodes.tsv.gz
        │ └── matrix.mtx.gz
        ├── analysis
        │ └── analysis.csv
        ├── raw_feature_bc_matrix
        │ ├── features.tsv.gz
        │ ├── barcodes.tsv.gz
        │ └── matrix.mtx.gz
        ├── possorted_genome_bam.bam.bai
        ├── cloupe.cloupe
        ├── possorted_genome_bam.bam
        ├── filtered_feature_bc_matrix.h5
        └── raw_feature_bc_matrix.h5
    """
    message, _ = compute_file_tree(
        path,
        level=level,
        only_dirs=only_dirs,
        n_max_files=n_max_files,
        n_max_files_per_dir_and_type=n_max_files_per_dir_and_type,
        include_paths=include_paths,
        skip_suffixes=skip_suffixes,
    )
    logger.print(message)


def to_url(upath):
    """Public storage URL.

    Generates a public URL for an object in an S3 bucket using fsspec's UPath,
    considering the bucket's region.

    Args:
    - upath: A UPath object representing an S3 path.

    Returns:
    - A string containing the public URL to the S3 object.
    """
    if upath.protocol != "s3":
        raise ValueError("The provided UPath must be an S3 path.")
    key = "/".join(upath.parts[1:])
    bucket = upath._url.netloc
    if bucket == "scverse-spatial-eu-central-1":
        region = "eu-central-1"
    elif f"s3://{bucket}" not in HOSTED_BUCKETS:
        response = upath.fs.call_s3("head_bucket", Bucket=upath._url.netloc)
        headers = response["ResponseMetadata"]["HTTPHeaders"]
        region = headers.get("x-amz-bucket-region")
    else:
        region = bucket.replace("lamin_", "")
    if region == "us-east-1":
        return f"https://{bucket}.s3.amazonaws.com/{key}"
    else:
        return f"https://{bucket}.s3-{region}.amazonaws.com/{key}"


# Why aren't we subclassing?
#
# The problem is that UPath defines a type system of paths
# Its __new__ method returns instances of different subclasses rather than a
# UPath object
# If we create a custom subclass naively, subclasses of the parent UPath won't
# be subclasses of our custom subclass
# This makes life really hard in type checks involving local to cloud
# comparisons, etc.
# Hence, we extend the existing UPath and amend the docs
# Some of this might end up in the original UPath implementation over time,
# we'll see.


# add custom functions
UPath.modified = property(modified)
UPath.synchronize = synchronize
UPath.upload_from = upload_from
UPath.to_url = to_url
UPath.download_to = download_to
UPath.view_tree = view_tree
# unfortunately, we also have to do this for the subclasses
Path.view_tree = view_tree  # type: ignore

UPath.glob.__doc__ = Path.glob.__doc__
UPath.rglob.__doc__ = Path.rglob.__doc__
UPath.stat.__doc__ = Path.stat.__doc__
UPath.iterdir.__doc__ = Path.iterdir.__doc__
UPath.resolve.__doc__ = Path.resolve.__doc__
UPath.relative_to.__doc__ = Path.relative_to.__doc__
UPath.exists.__doc__ = Path.exists.__doc__
UPath.is_dir.__doc__ = Path.is_dir.__doc__
UPath.is_file.__doc__ = Path.is_file.__doc__
UPath.unlink.__doc__ = Path.unlink.__doc__
UPath.rename.__doc__ = """Move file, see fsspec.AbstractFileSystem.mv.

>>> upath = Upath("s3://my-bucket/my-file")
>>> upath.rename(UPath("s3://my-bucket/my-file-renamed"))
>>> upath.rename("my-file-renamed")

>>> upath = Upath("local-folder/my-file")
>>> upath.rename("local-folder/my-file-renamed")
"""
UPath.__doc__ = """Paths: low-level key-value access to files/objects.

Paths are based on keys that offer the typical access patterns of file systems
and object stores.

>>> upath = UPath("s3://my-bucket/my-folder")
>>> upath.exists()

Args:
    pathlike: A string or Path to a local/cloud file/directory/folder.
"""


def create_path(path: UPath, access_token: str | None = None) -> UPath:
    path = UPath(path)
    # test whether we have an AWS S3 path
    if not isinstance(path, S3Path):
        return path
    return get_aws_credentials_manager().enrich_path(path, access_token)


def get_stat_file_cloud(stat: dict) -> tuple[int, str, str]:
    size = stat["size"]
    etag = stat["ETag"]
    # small files
    if "-" not in etag:
        # only store hash for non-multipart uploads
        # we can't rapidly validate multi-part uploaded files client-side
        # we can add more logic later down-the-road
        hash = b16_to_b64(etag)
        hash_type = "md5"
    else:
        stripped_etag, suffix = etag.split("-")
        suffix = suffix.strip('"')
        hash = b16_to_b64(stripped_etag)
        hash_type = f"md5-{suffix}"  # this is the S3 chunk-hashing strategy
    return size, hash[:HASH_LENGTH], hash_type


def get_stat_dir_cloud(path: UPath) -> tuple[int, str, str, int]:
    sizes = []
    md5s = []
    objects = path.fs.find(path.as_posix(), detail=True)
    if path.protocol == "s3":
        accessor = "ETag"
    elif path.protocol == "gs":
        accessor = "md5Hash"
    for object in objects.values():
        sizes.append(object["size"])
        md5s.append(object[accessor].strip('"='))
    size = sum(sizes)
    hash, hash_type = hash_md5s_from_dir(md5s)
    n_objects = len(md5s)
    return size, hash, hash_type, n_objects


class InstanceNotEmpty(Exception):
    pass


# is as fast as boto3: https://lamin.ai/laminlabs/lamindata/transform/krGp3hT1f78N5zKv
def check_storage_is_empty(
    root: UPathStr, *, raise_error: bool = True, account_for_sqlite_file: bool = False
) -> int:
    root_upath = UPath(root)
    root_string = root_upath.as_posix()  # type: ignore
    # we currently touch a 0-byte file in the root of a hosted storage location
    # ({storage_root}/.lamindb/_is_initialized) during storage initialization
    # since path.fs.find raises a PermissionError on empty hosted
    # subdirectories (see lamindb_setup/core/_settings_storage/init_storage).
    n_offset_objects = 1  # because of touched dummy file, see mark_storage_root()
    if root_string.startswith(HOSTED_BUCKETS):
        # in hosted buckets, count across entire root
        directory_string = root_string
        # the SQLite file is not in the ".lamindb" directory
        if account_for_sqlite_file:
            n_offset_objects += 1  # because of SQLite file
    else:
        # in any other storage location, only count in .lamindb
        if not root_string.endswith("/"):
            root_string += "/"
        directory_string = root_string + ".lamindb"
    objects = root_upath.fs.find(directory_string)
    n_objects = len(objects)
    n_diff = n_objects - n_offset_objects
    ask_for_deletion = (
        "delete them prior to deleting the instance"
        if raise_error
        else "consider deleting them"
    )
    hint = "'_is_initialized'"
    if n_offset_objects == 2:
        hint += " & SQLite file"
    hint += " ignored"
    message = (
        f"Storage {directory_string} contains {n_objects - n_offset_objects} objects "
        f"({hint}) - {ask_for_deletion}"
    )
    if n_diff > 0:
        if raise_error:
            raise InstanceNotEmpty(message)
        else:
            logger.warning(message)
    return n_diff
1
|
+
# we are not documenting UPath here because it's documented at lamindb.UPath
|
|
2
|
+
"""Paths & file systems."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from functools import partial
|
|
10
|
+
from itertools import islice
|
|
11
|
+
from pathlib import Path, PurePosixPath
|
|
12
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
13
|
+
|
|
14
|
+
import fsspec
|
|
15
|
+
from lamin_utils import logger
|
|
16
|
+
from upath import UPath
|
|
17
|
+
from upath.implementations.cloud import CloudPath, S3Path # keep CloudPath!
|
|
18
|
+
from upath.implementations.local import LocalPath, PosixUPath, WindowsUPath
|
|
19
|
+
|
|
20
|
+
from ._aws_credentials import HOSTED_BUCKETS, get_aws_credentials_manager
|
|
21
|
+
from .hashing import HASH_LENGTH, b16_to_b64, hash_md5s_from_dir
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from .types import UPathStr
|
|
25
|
+
|
|
26
|
+
LocalPathClasses = (PosixUPath, WindowsUPath, LocalPath)
|
|
27
|
+
|
|
28
|
+
# also see https://gist.github.com/securifera/e7eed730cbe1ce43d0c29d7cd2d582f4
|
|
29
|
+
# ".gz" is not listed here as it typically occurs with another suffix
|
|
30
|
+
# the complete list is at lamindb.core.storage._suffixes
|
|
31
|
+
VALID_SIMPLE_SUFFIXES = {
|
|
32
|
+
#
|
|
33
|
+
# without readers
|
|
34
|
+
#
|
|
35
|
+
".fasta",
|
|
36
|
+
".fastq",
|
|
37
|
+
".jpg",
|
|
38
|
+
".mtx",
|
|
39
|
+
".obo",
|
|
40
|
+
".pdf",
|
|
41
|
+
".png",
|
|
42
|
+
".tar",
|
|
43
|
+
".tiff",
|
|
44
|
+
".txt",
|
|
45
|
+
".tsv",
|
|
46
|
+
".zip",
|
|
47
|
+
".xml",
|
|
48
|
+
#
|
|
49
|
+
# with readers (see below)
|
|
50
|
+
#
|
|
51
|
+
".h5ad",
|
|
52
|
+
".parquet",
|
|
53
|
+
".csv",
|
|
54
|
+
".fcs",
|
|
55
|
+
".xslx",
|
|
56
|
+
".zarr",
|
|
57
|
+
".json",
|
|
58
|
+
}
|
|
59
|
+
# below gets updated within lamindb because it's frequently changing
|
|
60
|
+
VALID_COMPOSITE_SUFFIXES = {".anndata.zarr"}
|
|
61
|
+
|
|
62
|
+
TRAILING_SEP = (os.sep, os.altsep) if os.altsep is not None else os.sep
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def extract_suffix_from_path(path: Path, arg_name: str | None = None) -> str:
|
|
66
|
+
def process_digits(suffix: str):
|
|
67
|
+
if suffix[1:].isdigit(): # :1 to skip the dot
|
|
68
|
+
return "" # digits are no valid suffixes
|
|
69
|
+
else:
|
|
70
|
+
return suffix
|
|
71
|
+
|
|
72
|
+
if len(path.suffixes) <= 1:
|
|
73
|
+
return process_digits(path.suffix)
|
|
74
|
+
|
|
75
|
+
total_suffix = "".join(path.suffixes)
|
|
76
|
+
if total_suffix in VALID_SIMPLE_SUFFIXES:
|
|
77
|
+
return total_suffix
|
|
78
|
+
elif total_suffix.endswith(tuple(VALID_COMPOSITE_SUFFIXES)):
|
|
79
|
+
# below seems slow but OK for now
|
|
80
|
+
for suffix in VALID_COMPOSITE_SUFFIXES:
|
|
81
|
+
if total_suffix.endswith(suffix):
|
|
82
|
+
break
|
|
83
|
+
return suffix
|
|
84
|
+
else:
|
|
85
|
+
print_hint = True
|
|
86
|
+
arg_name = "file" if arg_name is None else arg_name # for the warning
|
|
87
|
+
msg = f"{arg_name} has more than one suffix (path.suffixes), "
|
|
88
|
+
# first check the 2nd-to-last suffix because it might be followed by .gz
|
|
89
|
+
# or another compression-related suffix
|
|
90
|
+
# Alex thought about adding logic along the lines of path.suffixes[-1]
|
|
91
|
+
# in COMPRESSION_SUFFIXES to detect something like .random.gz and then
|
|
92
|
+
# add ".random.gz" but concluded it's too dangerous it's safer to just
|
|
93
|
+
# use ".gz" in such a case
|
|
94
|
+
if path.suffixes[-2] in VALID_SIMPLE_SUFFIXES:
|
|
95
|
+
suffix = "".join(path.suffixes[-2:])
|
|
96
|
+
msg += f"inferring: '{suffix}'"
|
|
97
|
+
# do not print a warning for things like .tar.gz, .fastq.gz
|
|
98
|
+
if path.suffixes[-1] == ".gz":
|
|
99
|
+
print_hint = False
|
|
100
|
+
else:
|
|
101
|
+
suffix = path.suffixes[-1] # this is equivalent to path.suffix
|
|
102
|
+
msg += (
|
|
103
|
+
f"using only last suffix: '{suffix}' - if you want your composite"
|
|
104
|
+
" suffix to be recognized add it to"
|
|
105
|
+
" lamindb.core.storage.VALID_SIMPLE_SUFFIXES.add()"
|
|
106
|
+
)
|
|
107
|
+
if print_hint:
|
|
108
|
+
logger.hint(msg)
|
|
109
|
+
return process_digits(suffix)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def infer_filesystem(path: UPathStr):
|
|
113
|
+
import fsspec # improve cold start
|
|
114
|
+
|
|
115
|
+
path_str = str(path)
|
|
116
|
+
|
|
117
|
+
if isinstance(path, UPath):
|
|
118
|
+
fs = path.fs
|
|
119
|
+
else:
|
|
120
|
+
protocol = fsspec.utils.get_protocol(path_str)
|
|
121
|
+
if protocol == "s3":
|
|
122
|
+
fs_kwargs = {"cache_regions": True}
|
|
123
|
+
else:
|
|
124
|
+
fs_kwargs = {}
|
|
125
|
+
fs = fsspec.filesystem(protocol, **fs_kwargs)
|
|
126
|
+
|
|
127
|
+
return fs, path_str
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# this is needed to avoid CreateBucket permission
|
|
131
|
+
class S3FSMap(fsspec.FSMap):
|
|
132
|
+
def __setitem__(self, key, value):
|
|
133
|
+
"""Store value in key."""
|
|
134
|
+
key = self._key_to_str(key)
|
|
135
|
+
self.fs.pipe_file(key, fsspec.mapping.maybe_convert(value))
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def create_mapper(
|
|
139
|
+
fs,
|
|
140
|
+
url="",
|
|
141
|
+
check=False,
|
|
142
|
+
create=False,
|
|
143
|
+
missing_exceptions=None,
|
|
144
|
+
):
|
|
145
|
+
if fsspec.utils.get_protocol(url) == "s3":
|
|
146
|
+
return S3FSMap(
|
|
147
|
+
url, fs, check=check, create=False, missing_exceptions=missing_exceptions
|
|
148
|
+
)
|
|
149
|
+
else:
|
|
150
|
+
return fsspec.FSMap(
|
|
151
|
+
url, fs, check=check, create=create, missing_exceptions=missing_exceptions
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def print_hook(size: int, value: int, objectname: str, action: str):
|
|
156
|
+
if size == 0:
|
|
157
|
+
progress_in_percent = 100.0
|
|
158
|
+
else:
|
|
159
|
+
progress_in_percent = (value / size) * 100
|
|
160
|
+
out = f"... {action} {objectname}:" f" {min(progress_in_percent, 100):4.1f}%"
|
|
161
|
+
if "NBPRJ_TEST_NBPATH" not in os.environ:
|
|
162
|
+
end = "\n" if progress_in_percent >= 100 else "\r"
|
|
163
|
+
print(out, end=end)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class ProgressCallback(fsspec.callbacks.Callback):
|
|
167
|
+
def __init__(
|
|
168
|
+
self,
|
|
169
|
+
objectname: str,
|
|
170
|
+
action: Literal["uploading", "downloading", "synchronizing"],
|
|
171
|
+
adjust_size: bool = False,
|
|
172
|
+
):
|
|
173
|
+
assert action in {"uploading", "downloading", "synchronizing"}
|
|
174
|
+
|
|
175
|
+
super().__init__()
|
|
176
|
+
|
|
177
|
+
self.action = action
|
|
178
|
+
print_progress = partial(print_hook, objectname=objectname, action=action)
|
|
179
|
+
self.hooks = {"print_progress": print_progress}
|
|
180
|
+
|
|
181
|
+
self.adjust_size = adjust_size
|
|
182
|
+
|
|
183
|
+
def absolute_update(self, value):
|
|
184
|
+
pass
|
|
185
|
+
|
|
186
|
+
def relative_update(self, inc=1):
|
|
187
|
+
pass
|
|
188
|
+
|
|
189
|
+
def update_relative_value(self, inc=1):
|
|
190
|
+
self.value += inc
|
|
191
|
+
self.call()
|
|
192
|
+
|
|
193
|
+
def branch(self, path_1, path_2, kwargs):
|
|
194
|
+
if self.adjust_size:
|
|
195
|
+
if Path(path_2 if self.action != "uploading" else path_1).is_dir():
|
|
196
|
+
self.size -= 1
|
|
197
|
+
kwargs["callback"] = ChildProgressCallback(self)
|
|
198
|
+
|
|
199
|
+
def branched(self, path_1, path_2, **kwargs):
|
|
200
|
+
self.branch(path_1, path_2, kwargs)
|
|
201
|
+
return kwargs["callback"]
|
|
202
|
+
|
|
203
|
+
def wrap(self, iterable):
|
|
204
|
+
if self.adjust_size:
|
|
205
|
+
paths = []
|
|
206
|
+
for lpath, rpath in iterable:
|
|
207
|
+
paths.append((lpath, rpath))
|
|
208
|
+
if Path(lpath).is_dir():
|
|
209
|
+
self.size -= 1
|
|
210
|
+
self.adjust_size = False
|
|
211
|
+
return paths
|
|
212
|
+
else:
|
|
213
|
+
return iterable
|
|
214
|
+
|
|
215
|
+
@classmethod
|
|
216
|
+
def requires_progress(
|
|
217
|
+
cls,
|
|
218
|
+
maybe_callback: fsspec.callbacks.Callback | None,
|
|
219
|
+
print_progress: bool,
|
|
220
|
+
objectname: str,
|
|
221
|
+
action: Literal["uploading", "downloading", "synchronizing"],
|
|
222
|
+
**kwargs,
|
|
223
|
+
):
|
|
224
|
+
if maybe_callback is None:
|
|
225
|
+
if print_progress:
|
|
226
|
+
return cls(objectname, action, **kwargs)
|
|
227
|
+
else:
|
|
228
|
+
return fsspec.callbacks.NoOpCallback()
|
|
229
|
+
return maybe_callback
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class ChildProgressCallback(fsspec.callbacks.Callback):
|
|
233
|
+
def __init__(self, parent: ProgressCallback):
|
|
234
|
+
super().__init__()
|
|
235
|
+
|
|
236
|
+
self.parent = parent
|
|
237
|
+
|
|
238
|
+
def parent_update(self, inc=1):
|
|
239
|
+
self.parent.update_relative_value(inc)
|
|
240
|
+
|
|
241
|
+
def relative_update(self, inc=1):
|
|
242
|
+
if self.size != 0:
|
|
243
|
+
self.parent_update(inc / self.size)
|
|
244
|
+
else:
|
|
245
|
+
self.parent_update(1)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def download_to(self, local_path: UPathStr, print_progress: bool = True, **kwargs):
|
|
249
|
+
"""Download from self (a destination in the cloud) to the local path."""
|
|
250
|
+
if "recursive" not in kwargs:
|
|
251
|
+
kwargs["recursive"] = True
|
|
252
|
+
if print_progress and "callback" not in kwargs:
|
|
253
|
+
callback = ProgressCallback(
|
|
254
|
+
PurePosixPath(local_path).name, "downloading", adjust_size=True
|
|
255
|
+
)
|
|
256
|
+
kwargs["callback"] = callback
|
|
257
|
+
|
|
258
|
+
self.fs.download(str(self), str(local_path), **kwargs)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def upload_from(
|
|
262
|
+
self,
|
|
263
|
+
local_path: UPathStr,
|
|
264
|
+
create_folder: bool | None = None,
|
|
265
|
+
print_progress: bool = True,
|
|
266
|
+
**kwargs,
|
|
267
|
+
) -> UPath:
|
|
268
|
+
"""Upload from the local path to `self` (a destination in the cloud).
|
|
269
|
+
|
|
270
|
+
If the local path is a directory, recursively upload its contents.
|
|
271
|
+
|
|
272
|
+
Args:
|
|
273
|
+
local_path: A local path of a file or directory.
|
|
274
|
+
create_folder: Only applies if `local_path` is a directory and then
|
|
275
|
+
defaults to `True`. If `True`, make a new folder in the destination
|
|
276
|
+
using the directory name of `local_path`. If `False`, upload the
|
|
277
|
+
contents of the directory to to the root-level of the destination.
|
|
278
|
+
print_progress: Print progress.
|
|
279
|
+
|
|
280
|
+
Returns:
|
|
281
|
+
The destination path.
|
|
282
|
+
"""
|
|
283
|
+
local_path = Path(local_path)
|
|
284
|
+
local_path_is_dir = local_path.is_dir()
|
|
285
|
+
if create_folder is None:
|
|
286
|
+
create_folder = local_path_is_dir
|
|
287
|
+
if create_folder and not local_path_is_dir:
|
|
288
|
+
raise ValueError("create_folder can only be True if local_path is a directory")
|
|
289
|
+
|
|
290
|
+
if print_progress and "callback" not in kwargs:
|
|
291
|
+
callback = ProgressCallback(local_path.name, "uploading")
|
|
292
|
+
kwargs["callback"] = callback
|
|
293
|
+
|
|
294
|
+
if local_path_is_dir and not create_folder:
|
|
295
|
+
source = [f for f in local_path.rglob("*") if f.is_file()]
|
|
296
|
+
destination = [str(self / f.relative_to(local_path)) for f in source]
|
|
297
|
+
source = [str(f) for f in source] # type: ignore
|
|
298
|
+
else:
|
|
299
|
+
source = str(local_path) # type: ignore
|
|
300
|
+
destination = str(self) # type: ignore
|
|
301
|
+
|
|
302
|
+
# the below lines are to avoid s3fs triggering create_bucket in upload if
|
|
303
|
+
# dirs are present it allows to avoid permission error
|
|
304
|
+
# would be easier to just
|
|
305
|
+
if self.protocol == "s3" and local_path_is_dir and create_folder:
|
|
306
|
+
bucket = self._url.netloc
|
|
307
|
+
if bucket not in self.fs.dircache:
|
|
308
|
+
self.fs.dircache[bucket] = [{}]
|
|
309
|
+
if not destination.endswith(TRAILING_SEP): # type: ignore
|
|
310
|
+
destination += "/"
|
|
311
|
+
cleanup_cache = True
|
|
312
|
+
else:
|
|
313
|
+
cleanup_cache = False
|
|
314
|
+
else:
|
|
315
|
+
cleanup_cache = False
|
|
316
|
+
|
|
317
|
+
self.fs.upload(source, destination, recursive=create_folder, **kwargs)
|
|
318
|
+
|
|
319
|
+
if cleanup_cache:
|
|
320
|
+
# normally this is invalidated after the upload but still better to check
|
|
321
|
+
if bucket in self.fs.dircache:
|
|
322
|
+
del self.fs.dircache[bucket]
|
|
323
|
+
|
|
324
|
+
if local_path_is_dir and create_folder:
|
|
325
|
+
return self / local_path.name
|
|
326
|
+
else:
|
|
327
|
+
return self
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def synchronize(
|
|
331
|
+
self,
|
|
332
|
+
objectpath: Path,
|
|
333
|
+
error_no_origin: bool = True,
|
|
334
|
+
print_progress: bool = False,
|
|
335
|
+
callback: fsspec.callbacks.Callback | None = None,
|
|
336
|
+
timestamp: float | None = None,
|
|
337
|
+
):
|
|
338
|
+
"""Sync to a local destination path."""
|
|
339
|
+
# optimize the number of network requests
|
|
340
|
+
if timestamp is not None:
|
|
341
|
+
is_dir = False
|
|
342
|
+
exists = True
|
|
343
|
+
cloud_mts = timestamp
|
|
344
|
+
else:
|
|
345
|
+
# perform only one network request to check existence, type and timestamp
|
|
346
|
+
try:
|
|
347
|
+
cloud_mts = self.modified.timestamp()
|
|
348
|
+
is_dir = False
|
|
349
|
+
exists = True
|
|
350
|
+
except FileNotFoundError:
|
|
351
|
+
exists = False
|
|
352
|
+
except IsADirectoryError:
|
|
353
|
+
is_dir = True
|
|
354
|
+
exists = True
|
|
355
|
+
|
|
356
|
+
if not exists:
|
|
357
|
+
warn_or_error = f"The original path {self} does not exist anymore."
|
|
358
|
+
if objectpath.exists():
|
|
359
|
+
warn_or_error += (
|
|
360
|
+
f"\nHowever, the local path {objectpath} still exists, you might want"
|
|
361
|
+
" to reupload the object back."
|
|
362
|
+
)
|
|
363
|
+
logger.warning(warn_or_error)
|
|
364
|
+
elif error_no_origin:
|
|
365
|
+
warn_or_error += "\nIt is not possible to synchronize."
|
|
366
|
+
raise FileNotFoundError(warn_or_error)
|
|
367
|
+
return None
|
|
368
|
+
|
|
369
|
+
# synchronization logic for directories
|
|
370
|
+
if is_dir:
|
|
371
|
+
files = self.fs.find(str(self), detail=True)
|
|
372
|
+
protocol_modified = {"s3": "LastModified", "gs": "mtime"}
|
|
373
|
+
modified_key = protocol_modified.get(self.protocol, None)
|
|
374
|
+
if modified_key is None:
|
|
375
|
+
raise ValueError(f"Can't synchronize a directory for {self.protocol}.")
|
|
376
|
+
if objectpath.exists():
|
|
377
|
+
destination_exists = True
|
|
378
|
+
cloud_mts_max = max(
|
|
379
|
+
file[modified_key] for file in files.values()
|
|
380
|
+
).timestamp()
|
|
381
|
+
local_mts = [
|
|
382
|
+
file.stat().st_mtime for file in objectpath.rglob("*") if file.is_file()
|
|
383
|
+
]
|
|
384
|
+
n_local_files = len(local_mts)
|
|
385
|
+
local_mts_max = max(local_mts)
|
|
386
|
+
if local_mts_max == cloud_mts_max:
|
|
387
|
+
need_synchronize = n_local_files != len(files)
|
|
388
|
+
elif local_mts_max > cloud_mts_max:
|
|
389
|
+
need_synchronize = False
|
|
390
|
+
else:
|
|
391
|
+
need_synchronize = True
|
|
392
|
+
else:
|
|
393
|
+
destination_exists = False
|
|
394
|
+
need_synchronize = True
|
|
395
|
+
if need_synchronize:
|
|
396
|
+
callback = ProgressCallback.requires_progress(
|
|
397
|
+
callback, print_progress, objectpath.name, "synchronizing"
|
|
398
|
+
)
|
|
399
|
+
callback.set_size(len(files))
|
|
400
|
+
origin_file_keys = []
|
|
401
|
+
for file, stat in callback.wrap(files.items()):
|
|
402
|
+
file_key = PurePosixPath(file).relative_to(self.path)
|
|
403
|
+
origin_file_keys.append(file_key.as_posix())
|
|
404
|
+
timestamp = stat[modified_key].timestamp()
|
|
405
|
+
|
|
406
|
+
origin = f"{self.protocol}://{file}"
|
|
407
|
+
destination = objectpath / file_key
|
|
408
|
+
child = callback.branched(origin, destination.as_posix())
|
|
409
|
+
UPath(origin, **self.storage_options).synchronize(
|
|
410
|
+
destination, callback=child, timestamp=timestamp
|
|
411
|
+
)
|
|
412
|
+
child.close()
|
|
413
|
+
if destination_exists:
|
|
414
|
+
local_files = [file for file in objectpath.rglob("*") if file.is_file()]
|
|
415
|
+
if len(local_files) > len(files):
|
|
416
|
+
for file in local_files:
|
|
417
|
+
if (
|
|
418
|
+
file.relative_to(objectpath).as_posix()
|
|
419
|
+
not in origin_file_keys
|
|
420
|
+
):
|
|
421
|
+
file.unlink()
|
|
422
|
+
parent = file.parent
|
|
423
|
+
if next(parent.iterdir(), None) is None:
|
|
424
|
+
parent.rmdir()
|
|
425
|
+
return None
|
|
426
|
+
|
|
427
|
+
# synchronization logic for files
|
|
428
|
+
callback = ProgressCallback.requires_progress(
|
|
429
|
+
callback, print_progress, objectpath.name, "synchronizing"
|
|
430
|
+
)
|
|
431
|
+
if objectpath.exists():
|
|
432
|
+
local_mts_obj = objectpath.stat().st_mtime # type: ignore
|
|
433
|
+
need_synchronize = cloud_mts > local_mts_obj
|
|
434
|
+
else:
|
|
435
|
+
objectpath.parent.mkdir(parents=True, exist_ok=True)
|
|
436
|
+
need_synchronize = True
|
|
437
|
+
if need_synchronize:
|
|
438
|
+
self.download_to(
|
|
439
|
+
objectpath, recursive=False, print_progress=False, callback=callback
|
|
440
|
+
)
|
|
441
|
+
os.utime(objectpath, times=(cloud_mts, cloud_mts))
|
|
442
|
+
else:
|
|
443
|
+
# nothing happens if parent_update is not defined
|
|
444
|
+
# because of Callback.no_op
|
|
445
|
+
callback.parent_update()
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def modified(self) -> datetime | None:
|
|
449
|
+
"""Return modified time stamp."""
|
|
450
|
+
mtime = self.fs.modified(str(self))
|
|
451
|
+
if mtime.tzinfo is None:
|
|
452
|
+
mtime = mtime.replace(tzinfo=timezone.utc)
|
|
453
|
+
return mtime.astimezone().replace(tzinfo=None)
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def compute_file_tree(
|
|
457
|
+
path: Path,
|
|
458
|
+
*,
|
|
459
|
+
level: int = -1,
|
|
460
|
+
only_dirs: bool = False,
|
|
461
|
+
n_max_files_per_dir_and_type: int = 100,
|
|
462
|
+
n_max_files: int = 1000,
|
|
463
|
+
include_paths: set[Any] | None = None,
|
|
464
|
+
skip_suffixes: list[str] | None = None,
|
|
465
|
+
) -> tuple[str, int]:
|
|
466
|
+
space = " "
|
|
467
|
+
branch = "│ "
|
|
468
|
+
tee = "├── "
|
|
469
|
+
last = "└── "
|
|
470
|
+
if skip_suffixes is None:
|
|
471
|
+
skip_suffixes_tuple = ()
|
|
472
|
+
else:
|
|
473
|
+
skip_suffixes_tuple = tuple(skip_suffixes) # type: ignore
|
|
474
|
+
n_objects = 0
|
|
475
|
+
n_directories = 0
|
|
476
|
+
|
|
477
|
+
# by default only including registered files
|
|
478
|
+
# need a flag and a proper implementation
|
|
479
|
+
suffixes = set()
|
|
480
|
+
include_dirs = set()
|
|
481
|
+
if include_paths is not None:
|
|
482
|
+
include_dirs = {d for p in include_paths for d in p.parents}
|
|
483
|
+
else:
|
|
484
|
+
include_paths = set()
|
|
485
|
+
|
|
486
|
+
def inner(dir_path: Path, prefix: str = "", level: int = -1):
|
|
487
|
+
nonlocal n_objects, n_directories, suffixes
|
|
488
|
+
if level == 0:
|
|
489
|
+
return
|
|
490
|
+
stripped_dir_path = dir_path.as_posix().rstrip("/")
|
|
491
|
+
# do not iterate through zarr directories
|
|
492
|
+
if stripped_dir_path.endswith(skip_suffixes_tuple):
|
|
493
|
+
return
|
|
494
|
+
# this is needed so that the passed folder is not listed
|
|
495
|
+
contents = [
|
|
496
|
+
i
|
|
497
|
+
for i in dir_path.iterdir()
|
|
498
|
+
if i.as_posix().rstrip("/") != stripped_dir_path
|
|
499
|
+
]
|
|
500
|
+
if only_dirs:
|
|
501
|
+
contents = [d for d in contents if d.is_dir()]
|
|
502
|
+
pointers = [tee] * (len(contents) - 1) + [last]
|
|
503
|
+
n_files_per_dir_and_type = defaultdict(lambda: 0) # type: ignore
|
|
504
|
+
# TODO: pass strict=False to zip with python > 3.9
|
|
505
|
+
for pointer, child_path in zip(pointers, contents): # type: ignore
|
|
506
|
+
if child_path.is_dir():
|
|
507
|
+
if include_dirs and child_path not in include_dirs:
|
|
508
|
+
continue
|
|
509
|
+
yield prefix + pointer + child_path.name + "/"
|
|
510
|
+
n_directories += 1
|
|
511
|
+
n_files_per_dir_and_type = defaultdict(lambda: 0)
|
|
512
|
+
extension = branch if pointer == tee else space
|
|
513
|
+
yield from inner(child_path, prefix=prefix + extension, level=level - 1)
|
|
514
|
+
elif not only_dirs:
|
|
515
|
+
if include_paths and child_path not in include_paths:
|
|
516
|
+
continue
|
|
517
|
+
suffix = extract_suffix_from_path(child_path)
|
|
518
|
+
suffixes.add(suffix)
|
|
519
|
+
n_files_per_dir_and_type[suffix] += 1
|
|
520
|
+
n_objects += 1
|
|
521
|
+
if n_files_per_dir_and_type[suffix] == n_max_files_per_dir_and_type:
|
|
522
|
+
yield prefix + "..."
|
|
523
|
+
elif n_files_per_dir_and_type[suffix] > n_max_files_per_dir_and_type:
|
|
524
|
+
continue
|
|
525
|
+
else:
|
|
526
|
+
yield prefix + pointer + child_path.name
|
|
527
|
+
|
|
528
|
+
folder_tree = ""
|
|
529
|
+
iterator = inner(path, level=level)
|
|
530
|
+
for line in islice(iterator, n_max_files):
|
|
531
|
+
folder_tree += f"\n{line}"
|
|
532
|
+
if next(iterator, None):
|
|
533
|
+
folder_tree += f"\n... only showing {n_max_files} out of {n_objects} files"
|
|
534
|
+
directory_info = "directory" if n_directories == 1 else "directories"
|
|
535
|
+
display_suffixes = ", ".join([f"{suffix!r}" for suffix in suffixes])
|
|
536
|
+
suffix_message = f" with suffixes {display_suffixes}" if n_objects > 0 else ""
|
|
537
|
+
message = (
|
|
538
|
+
f"{n_directories} sub-{directory_info} &"
|
|
539
|
+
f" {n_objects} files{suffix_message}\n{path.resolve()}{folder_tree}"
|
|
540
|
+
)
|
|
541
|
+
return message, n_objects
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
# adapted from: https://stackoverflow.com/questions/9727673
def view_tree(
    path: Path,
    *,
    level: int = 2,
    only_dirs: bool = False,
    n_max_files_per_dir_and_type: int = 100,
    n_max_files: int = 1000,
    include_paths: set[Any] | None = None,
    skip_suffixes: list[str] | None = None,
) -> None:
    """Print a visual tree structure of files & directories.

    Args:
        level: If `1`, only iterate through one level, if `2` iterate through 2
            levels, if `-1` iterate through entire hierarchy.
        only_dirs: Only iterate through directories.
        n_max_files: Display limit. Will only show this many files. Doesn't affect count.
        include_paths: Restrict to these paths.
        skip_suffixes: Skip directories with these suffixes.

    Examples:
        >>> dir_path = ln.core.datasets.generate_cell_ranger_files(
        >>>     "sample_001", ln.settings.storage
        >>> )
        >>> ln.UPath(dir_path).view_tree()
        3 subdirectories, 15 files
        sample_001
        ├── web_summary.html
        ├── metrics_summary.csv
        ├── molecule_info.h5
        ├── filtered_feature_bc_matrix
        │   ├── features.tsv.gz
        │   ├── barcodes.tsv.gz
        │   └── matrix.mtx.gz
        ├── analysis
        │   └── analysis.csv
        ├── raw_feature_bc_matrix
        │   ├── features.tsv.gz
        │   ├── barcodes.tsv.gz
        │   └── matrix.mtx.gz
        ├── possorted_genome_bam.bam.bai
        ├── cloupe.cloupe
        ├── possorted_genome_bam.bam
        ├── filtered_feature_bc_matrix.h5
        └── raw_feature_bc_matrix.h5
    """
    message, _ = compute_file_tree(
        path,
        level=level,
        only_dirs=only_dirs,
        n_max_files=n_max_files,
        n_max_files_per_dir_and_type=n_max_files_per_dir_and_type,
        include_paths=include_paths,
        skip_suffixes=skip_suffixes,
    )
    logger.print(message)

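A hedged usage sketch beyond the docstring example, assuming a local directory `./my_project` exists; `view_tree` is attached to `Path` and `UPath` further below in this module:

    from pathlib import Path
    # hypothetical local directory; restrict output to two levels of directories only
    Path("./my_project").view_tree(level=2, only_dirs=True)
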
def to_url(upath):
    """Public storage URL.

    Generates a public URL for an object in an S3 bucket using fsspec's UPath,
    considering the bucket's region.

    Args:
        - upath: A UPath object representing an S3 path.

    Returns:
        - A string containing the public URL to the S3 object.
    """
    if upath.protocol != "s3":
        raise ValueError("The provided UPath must be an S3 path.")
    key = "/".join(upath.parts[1:])
    bucket = upath._url.netloc
    if bucket == "scverse-spatial-eu-central-1":
        region = "eu-central-1"
    elif f"s3://{bucket}" not in HOSTED_BUCKETS:
        response = upath.fs.call_s3("head_bucket", Bucket=upath._url.netloc)
        headers = response["ResponseMetadata"]["HTTPHeaders"]
        region = headers.get("x-amz-bucket-region")
    else:
        region = bucket.replace("lamin-", "")
    if region == "us-east-1":
        return f"https://{bucket}.s3.amazonaws.com/{key}"
    else:
        return f"https://{bucket}.s3-{region}.amazonaws.com/{key}"

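For orientation, the two URL shapes the function above produces, shown as a tiny self-contained sketch (the bucket, key, and region values are hypothetical):

    # Hypothetical bucket/key, mirroring the string formatting in to_url above.
    bucket, key, region = "my-bucket", "my-folder/my-file.h5", "eu-central-1"
    print(f"https://{bucket}.s3.amazonaws.com/{key}")           # us-east-1 form
    print(f"https://{bucket}.s3-{region}.amazonaws.com/{key}")  # regional form
    # https://my-bucket.s3.amazonaws.com/my-folder/my-file.h5
    # https://my-bucket.s3-eu-central-1.amazonaws.com/my-folder/my-file.h5
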
# Why aren't we subclassing?
#
# The problem is that UPath defines a type system of paths
# Its __new__ method returns instances of different subclasses rather than a
# UPath object
# If we create a custom subclass naively, subclasses of the parent UPath won't
# be subclasses of our custom subclass
# This makes life really hard in type checks involving local to cloud
# comparisons, etc.
# Hence, we extend the existing UPath and amend the docs
# Some of this might end up in the original UPath implementation over time,
# we'll see.

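A minimal sketch of the pitfall described in the comment above, assuming universal_pathlib (and s3fs for the s3 protocol) is installed; `MyPath` is a hypothetical class, not part of this package:

    from upath import UPath

    class MyPath(UPath):  # naive subclass, for illustration only
        pass

    p = UPath("s3://my-bucket/my-file")  # dispatched to an S3Path instance by UPath.__new__
    print(isinstance(p, UPath))   # True
    print(isinstance(p, MyPath))  # False: cloud paths never inherit from MyPath
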
# add custom functions
UPath.modified = property(modified)
UPath.synchronize = synchronize
UPath.upload_from = upload_from
UPath.to_url = to_url
UPath.download_to = download_to
UPath.view_tree = view_tree
# unfortunately, we also have to do this for the subclasses
Path.view_tree = view_tree  # type: ignore

UPath.glob.__doc__ = Path.glob.__doc__
UPath.rglob.__doc__ = Path.rglob.__doc__
UPath.stat.__doc__ = Path.stat.__doc__
UPath.iterdir.__doc__ = Path.iterdir.__doc__
UPath.resolve.__doc__ = Path.resolve.__doc__
UPath.relative_to.__doc__ = Path.relative_to.__doc__
UPath.exists.__doc__ = Path.exists.__doc__
UPath.is_dir.__doc__ = Path.is_dir.__doc__
UPath.is_file.__doc__ = Path.is_file.__doc__
UPath.unlink.__doc__ = Path.unlink.__doc__
UPath.rename.__doc__ = """Move file, see fsspec.AbstractFileSystem.mv.

>>> upath = UPath("s3://my-bucket/my-file")
>>> upath.rename(UPath("s3://my-bucket/my-file-renamed"))
>>> upath.rename("my-file-renamed")

>>> upath = UPath("local-folder/my-file")
>>> upath.rename("local-folder/my-file-renamed")
"""
UPath.__doc__ = """Paths: low-level key-value access to files/objects.

Paths are based on keys that offer the typical access patterns of file systems
and object stores.

>>> upath = UPath("s3://my-bucket/my-folder")
>>> upath.exists()

Args:
    pathlike: A string or Path to a local/cloud file/directory/folder.
"""

def create_path(path: UPath, access_token: str | None = None) -> UPath:
    path = UPath(path)
    # test whether we have an AWS S3 path
    if not isinstance(path, S3Path):
        return path
    return get_aws_credentials_manager().enrich_path(path, access_token)

def get_stat_file_cloud(stat: dict) -> tuple[int, str, str]:
    size = stat["size"]
    etag = stat["ETag"]
    # small files
    if "-" not in etag:
        # only store hash for non-multipart uploads
        # we can't rapidly validate multi-part uploaded files client-side
        # we can add more logic later down-the-road
        hash = b16_to_b64(etag)
        hash_type = "md5"
    else:
        stripped_etag, suffix = etag.split("-")
        suffix = suffix.strip('"')
        hash = b16_to_b64(stripped_etag)
        hash_type = f"md5-{suffix}"  # this is the S3 chunk-hashing strategy
    return size, hash[:HASH_LENGTH], hash_type

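How a multipart-upload ETag maps to a hash type in the function above, as a self-contained sketch of the same string handling (the ETag value is made up; the base16-to-base64 conversion is omitted here):

    # Hypothetical multipart ETag: "<md5-of-part-hashes>-<number-of-parts>"
    etag = '"d41d8cd98f00b204e9800998ecf8427e-5"'.strip('"')
    stripped_etag, suffix = etag.split("-")
    print(stripped_etag)    # d41d8cd98f00b204e9800998ecf8427e
    print(f"md5-{suffix}")  # md5-5, i.e. an S3 chunk-hashed upload with 5 parts
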
def get_stat_dir_cloud(path: UPath) -> tuple[int, str, str, int]:
    sizes = []
    md5s = []
    objects = path.fs.find(path.as_posix(), detail=True)
    if path.protocol == "s3":
        accessor = "ETag"
    elif path.protocol == "gs":
        accessor = "md5Hash"
    for object in objects.values():
        sizes.append(object["size"])
        md5s.append(object[accessor].strip('"='))
    size = sum(sizes)
    hash, hash_type = hash_md5s_from_dir(md5s)
    n_objects = len(md5s)
    return size, hash, hash_type, n_objects


class InstanceNotEmpty(Exception):
    pass


# is as fast as boto3: https://lamin.ai/laminlabs/lamindata/transform/krGp3hT1f78N5zKv
def check_storage_is_empty(
    root: UPathStr, *, raise_error: bool = True, account_for_sqlite_file: bool = False
) -> int:
    root_upath = UPath(root)
    root_string = root_upath.as_posix()  # type: ignore
    # we currently touch a 0-byte file in the root of a hosted storage location
    # ({storage_root}/.lamindb/_is_initialized) during storage initialization
    # since path.fs.find raises a PermissionError on empty hosted
    # subdirectories (see lamindb_setup/core/_settings_storage/init_storage).
    n_offset_objects = 1  # because of touched dummy file, see mark_storage_root()
    if root_string.startswith(HOSTED_BUCKETS):
        # in hosted buckets, count across entire root
        directory_string = root_string
        # the SQLite file is not in the ".lamindb" directory
        if account_for_sqlite_file:
            n_offset_objects += 1  # because of SQLite file
    else:
        # in any other storage location, only count in .lamindb
        if not root_string.endswith("/"):
            root_string += "/"
        directory_string = root_string + ".lamindb"
    objects = root_upath.fs.find(directory_string)
    n_objects = len(objects)
    n_diff = n_objects - n_offset_objects
    ask_for_deletion = (
        "delete them prior to deleting the instance"
        if raise_error
        else "consider deleting them"
    )
    hint = "'_is_initialized'"
    if n_offset_objects == 2:
        hint += " & SQLite file"
    hint += " ignored"
    message = (
        f"Storage {directory_string} contains {n_objects - n_offset_objects} objects "
        f"({hint}) - {ask_for_deletion}"
    )
    if n_diff > 0:
        if raise_error:
            raise InstanceNotEmpty(message)
        else:
            logger.warning(message)
    return n_diff
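A hedged usage sketch of the emptiness check above, assuming credentials for the storage location are configured; the root path is hypothetical:

    # Warn instead of raising if objects besides the marker file are found.
    n_extra = check_storage_is_empty(
        "s3://my-bucket/my-prefix",  # hypothetical storage root
        raise_error=False,
    )
    print(n_extra)  # number of objects beyond the ignored '_is_initialized' marker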