lamindb_setup 1.9.0__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +107 -107
- lamindb_setup/_cache.py +87 -87
- lamindb_setup/_check_setup.py +166 -166
- lamindb_setup/_connect_instance.py +328 -342
- lamindb_setup/_delete.py +141 -141
- lamindb_setup/_disconnect.py +32 -32
- lamindb_setup/_init_instance.py +440 -440
- lamindb_setup/_migrate.py +266 -266
- lamindb_setup/_register_instance.py +35 -35
- lamindb_setup/_schema_metadata.py +441 -441
- lamindb_setup/_set_managed_storage.py +70 -70
- lamindb_setup/_setup_user.py +133 -133
- lamindb_setup/core/__init__.py +21 -21
- lamindb_setup/core/_aws_options.py +223 -223
- lamindb_setup/core/_hub_client.py +248 -248
- lamindb_setup/core/_hub_core.py +665 -665
- lamindb_setup/core/_hub_crud.py +227 -227
- lamindb_setup/core/_private_django_api.py +83 -83
- lamindb_setup/core/_settings.py +377 -377
- lamindb_setup/core/_settings_instance.py +569 -569
- lamindb_setup/core/_settings_load.py +141 -141
- lamindb_setup/core/_settings_save.py +95 -95
- lamindb_setup/core/_settings_storage.py +429 -429
- lamindb_setup/core/_settings_store.py +91 -91
- lamindb_setup/core/_settings_user.py +55 -55
- lamindb_setup/core/_setup_bionty_sources.py +44 -44
- lamindb_setup/core/cloud_sqlite_locker.py +240 -240
- lamindb_setup/core/django.py +305 -296
- lamindb_setup/core/exceptions.py +1 -1
- lamindb_setup/core/hashing.py +134 -134
- lamindb_setup/core/types.py +1 -1
- lamindb_setup/core/upath.py +1013 -1013
- lamindb_setup/errors.py +70 -70
- lamindb_setup/types.py +20 -20
- {lamindb_setup-1.9.0.dist-info → lamindb_setup-1.9.1.dist-info}/METADATA +1 -1
- lamindb_setup-1.9.1.dist-info/RECORD +50 -0
- lamindb_setup-1.9.0.dist-info/RECORD +0 -50
- {lamindb_setup-1.9.0.dist-info → lamindb_setup-1.9.1.dist-info}/LICENSE +0 -0
- {lamindb_setup-1.9.0.dist-info → lamindb_setup-1.9.1.dist-info}/WHEEL +0 -0
|
@@ -1,429 +1,429 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
import secrets
|
|
5
|
-
import string
|
|
6
|
-
from typing import TYPE_CHECKING, Any, Literal
|
|
7
|
-
from uuid import UUID
|
|
8
|
-
|
|
9
|
-
import fsspec
|
|
10
|
-
from lamin_utils import logger
|
|
11
|
-
|
|
12
|
-
from lamindb_setup.errors import StorageAlreadyManaged
|
|
13
|
-
|
|
14
|
-
from ._aws_options import (
|
|
15
|
-
HOSTED_REGIONS,
|
|
16
|
-
LAMIN_ENDPOINTS,
|
|
17
|
-
get_aws_options_manager,
|
|
18
|
-
)
|
|
19
|
-
from ._aws_storage import find_closest_aws_region
|
|
20
|
-
from ._deprecated import deprecated
|
|
21
|
-
from .hashing import hash_and_encode_as_b62
|
|
22
|
-
from .upath import (
|
|
23
|
-
LocalPathClasses,
|
|
24
|
-
UPath,
|
|
25
|
-
_split_path_query,
|
|
26
|
-
create_path,
|
|
27
|
-
get_storage_region,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
if TYPE_CHECKING:
|
|
31
|
-
from lamindb_setup.types import StorageType, UPathStr
|
|
32
|
-
|
|
33
|
-
# marker file, relative to the storage root; its first line holds the storage uid
STORAGE_UID_FILE_KEY = ".lamindb/storage_uid.txt"
# former name of the marker file; it gets renamed to STORAGE_UID_FILE_KEY when found
LEGACY_STORAGE_UID_FILE_KEY = ".lamindb/_is_initialized"

# fsspec protocols that are accepted for a storage root
# "file" is shown to users as "local"
VALID_PROTOCOLS = ("file", "gs", "s3", "hf", "http", "https")
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def base62(n_char: int) -> str:
    """Return a random string of `n_char` characters from digits and ASCII letters.

    Comparable to a nanoid whose alphabet has neither hyphen nor underscore.
    Every character is drawn with `secrets.choice`.
    """
    # digits first, then upper-case letters, then lower-case letters
    symbols = string.digits + string.ascii_letters.swapcase()
    picked = [secrets.choice(symbols) for _ in range(n_char)]
    return "".join(picked)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def instance_uid_from_uuid(instance_id: UUID) -> str:
    """Derive the instance uid from the instance UUID.

    The hex form of the UUID is passed to `hash_and_encode_as_b62` and the
    result is cut to its first 12 characters.
    """
    encoded = hash_and_encode_as_b62(instance_id.hex)
    return encoded[:12]
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def get_storage_type(root_as_str: str) -> StorageType:
    """Return the storage type for a root string.

    The type is the fsspec protocol of the root, except that the `file`
    protocol is reported as `"local"`. The protocol is not validated here:
    init_storage checks that it belongs to VALID_PROTOCOLS.
    """
    import fsspec

    detected = fsspec.utils.get_protocol(root_as_str)
    if detected == "file":
        return "local"  # type: ignore
    return detected  # type: ignore
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def mark_storage_root(
    root: UPathStr, uid: str, instance_id: UUID, instance_slug: str
) -> Literal["__is_marked__"] | str:
    """Write or check the uid marker file of a storage location.

    We need a file in folder-like storage locations on S3 to avoid permission
    errors from leveraging s3fs on an empty hosted storage location
    (path.fs.find raises a PermissionError). We also need it in case a storage
    location is ambiguous because a server / local environment doesn't have a
    globally unique identifier: we then screen for this file to map the path
    on a storage location in the registry.

    Args:
        root: Root of the storage location.
        uid: The uid this storage location is expected to carry.
        instance_id: UUID of the instance, recorded in a newly written marker.
        instance_slug: Slug of the instance, recorded in a newly written marker.

    Returns:
        `"__is_marked__"` if the marker was written by this call or already
        holds `uid`, otherwise the uid found in the existing marker file.
    """
    root_upath = UPath(root)
    legacy_mark_upath = root_upath / LEGACY_STORAGE_UID_FILE_KEY
    mark_upath = root_upath / STORAGE_UID_FILE_KEY
    # move a marker that still has the legacy file name to the current name
    if legacy_mark_upath.exists():
        legacy_mark_upath.rename(mark_upath)
    existing_uid = ""
    if mark_upath.exists():
        # an empty marker file has no first line: treat it like a missing marker
        # instead of failing with an IndexError
        marker_lines = mark_upath.read_text().splitlines()
        existing_uid = marker_lines[0] if marker_lines else ""
    if existing_uid == "":
        instance_uid = instance_uid_from_uuid(instance_id)
        text = f"{uid}\ncreation info:\ninstance_slug={instance_slug}\ninstance_id={instance_id.hex}\ninstance_uid={instance_uid}"
        mark_upath.write_text(text)
    elif existing_uid != uid:
        # report the uid that is stored in the marker, not the one passed in,
        # so that the caller can name and adopt the uid of the existing owner
        return existing_uid
    # covers the case in which existing uid is the same as uid
    # and the case in which there was no existing uid
    return "__is_marked__"
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def init_storage(
    root: UPathStr,
    instance_id: UUID,
    instance_slug: str,
    register_hub: bool | None = None,
    prevent_register_hub: bool = False,
    init_instance: bool = False,
    created_by: UUID | None = None,
    access_token: str | None = None,
    region: str | None = None,
) -> tuple[
    StorageSettings,
    Literal["hub-record-not-created", "hub-record-retrieved", "hub-record-created"],
]:
    """Create the settings of a storage location, sync with the hub, mark the root.

    Args:
        root: Storage root. `"create-s3"` or `"create-s3--<region>"` requests
            a hosted S3 location whose root is built from the region and the uid.
        instance_id: UUID of the instance that initializes the location.
        instance_slug: Slug of the instance, written into the marker file.
        register_hub: Whether to register on the hub; cloud locations are
            registered even if this is falsy.
        prevent_register_hub: If `True`, no hub record is created.
        init_instance: If `True`, a location that is already marked with
            another uid raises instead of only logging a warning.
        created_by: Forwarded to `init_storage_hub`.
        access_token: Forwarded to the settings and to the hub calls.
        region: Region; for `"create-s3"` it has to be in `HOSTED_REGIONS`.

    Returns:
        The storage settings and the status of the hub record.

    Raises:
        ValueError: If `root` contains `".lamindb"`, the region is not hosted,
            the protocol is not supported, or a local location is to be
            registered on the hub without a host.
        StorageAlreadyManaged: If `init_instance` is `True` and the location
            is already marked with another uid.
    """
    from ._hub_core import delete_storage_record, init_storage_hub

    assert root is not None, "`root` argument can't be `None`"

    root_str = str(root)  # ensure we have a string
    if ".lamindb" in root_str:
        raise ValueError(
            'Please pass a folder name that does not end or contain ".lamindb"'
        )
    uid = os.getenv("LAMINDB_STORAGE_LNID_INIT")
    if uid is not None:
        # this means we constructed a hosted location of shape s3://bucket-name/uid
        # within LaminHub
        assert root_str.endswith(uid)
    else:
        uid = base62(12)
    lamin_env = os.getenv("LAMIN_ENV")
    if root_str.startswith("create-s3"):
        if root_str != "create-s3":
            assert "--" in root_str, "example: `create-s3--eu-central-1`"
            region = root_str.replace("create-s3--", "")
        if region is None:
            region = find_closest_aws_region()
        elif region not in HOSTED_REGIONS:
            raise ValueError(f"region has to be one of {HOSTED_REGIONS}")
        # the per-region bucket is used in production, a shared test bucket otherwise
        is_prod = lamin_env is None or lamin_env == "prod"
        root = (
            f"s3://lamin-{region}/{uid}"
            if is_prod
            else f"s3://lamin-hosted-test/{uid}"
        )
    else:
        input_protocol = fsspec.utils.get_protocol(root_str)
        if input_protocol not in VALID_PROTOCOLS:
            # show local instead of file
            valid_protocols = ("local",) + VALID_PROTOCOLS[1:]
            raise ValueError(
                f"Protocol {input_protocol} is not supported, valid protocols are {', '.join(valid_protocols)}"
            )
    ssettings = StorageSettings(
        uid=uid,
        root=root,
        region=region,
        instance_id=instance_id,
        access_token=access_token,
    )
    # default to registering cloud storage
    register_hub = register_hub or ssettings.type_is_cloud
    if register_hub and not ssettings.type_is_cloud and ssettings.host is None:
        raise ValueError(
            "`host` must be set for local storage locations that are registered on the hub"
        )
    # this retrieves the storage record if it exists already in the hub
    # and updates uid and instance_id in ssettings
    hub_record_status = init_storage_hub(
        ssettings,
        auto_populate_instance=not init_instance,
        created_by=created_by,
        access_token=access_token,
        prevent_creation=prevent_register_hub or not register_hub,
    )
    # write access is only checked if the storage record has not been
    # retrieved from the hub
    if hub_record_status == "hub-record-retrieved":
        return ssettings, hub_record_status
    try:
        # (federated) credentials for AWS access are provisioned under-the-hood
        # discussion: https://laminlabs.slack.com/archives/C04FPE8V01W/p1719260587167489
        # if access_token was passed in ssettings, it is used here
        marking_result = mark_storage_root(
            root=ssettings.root,
            uid=ssettings.uid,
            instance_id=instance_id,
            instance_slug=instance_slug,
        )
    except Exception:
        marking_result = "no-write-access"
    if marking_result == "__is_marked__":
        return ssettings, hub_record_status
    if marking_result == "no-write-access":
        logger.important(
            f"due to lack of write access, LaminDB won't manage this storage location: {ssettings.root_as_str}"
        )
        # indicate that this storage location is not managed by the instance
        ssettings._instance_id = None
    else:
        s = "S" if init_instance else "s"  # upper case for error message
        message = (
            f"{s}torage location {ssettings.root_as_str} is already marked with uid {marking_result}, meaning that it is managed by another LaminDB instance -- "
            "if you manage your instance with LaminHub you get an overview of all your storage locations"
        )
        if init_instance:
            raise StorageAlreadyManaged(message)
        logger.warning(message)
        # the all-zero UUID indicates that the managing instance is not known
        ssettings._instance_id = UUID("00000000000000000000000000000000")
        ssettings._uid = marking_result
    # this condition means that the hub record was created
    if ssettings._uuid is not None:
        delete_storage_record(ssettings, access_token=access_token)  # type: ignore
        ssettings._uuid_ = None
        hub_record_status = "hub-record-not-created"
    return ssettings, hub_record_status
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
class StorageSettings:
    """Settings for a storage location (local or cloud).

    Do not instantiate this class yourself, use `ln.Storage` instead.
    """

    def __init__(
        self,
        root: UPathStr,
        region: str | None = None,
        uid: str | None = None,
        uuid: UUID | None = None,
        instance_id: UUID | None = None,
        # note that passing access_token prevents credentials caching
        access_token: str | None = None,
    ):
        self._uid = uid
        self._uuid_ = uuid
        root_init = UPath(root).expanduser()
        if isinstance(root_init, LocalPathClasses):  # local paths
            try:
                (root_init / ".lamindb").mkdir(parents=True, exist_ok=True)
                root_init = root_init.resolve()
            except Exception:
                logger.warning(
                    f"unable to create .lamindb/ folder in {root_init}"
                )
        self._root_init = root_init
        # the root path is only created on first access of `root`
        self._root = None
        self._instance_id = instance_id
        # we don't yet infer region here to make init fast
        self._region = region
        # would prefer to type below as Registry, but need to think through import order
        self._record: Any | None = None
        # save access_token here for use in self.root
        self.access_token = access_token

        # local storage
        self._has_local = False
        self._local = None

    # deprecated public alias of `_id`
    @property
    @deprecated("_id")
    def id(self) -> int:
        return self._id

    @property
    def _id(self) -> int:
        """Storage id, read from the storage record.

        This id is only valid in the current instance and not globally unique. Only for internal use.
        """
        return self.record.id

    @property
    def _uuid(self) -> UUID | None:
        """Lamin's internal storage uuid."""
        return self._uuid_

    @property
    def uid(self) -> str:
        """Storage uid, taken from the storage record if it was not set before."""
        if self._uid is None:
            self._uid = self.record.uid
        return self._uid

    @property
    def instance_uid(self) -> str | None:
        """The `uid` of the managing LaminDB instance.

        If `None`, the storage location is not managed by any LaminDB instance.
        If `"__unknown__"`, the instance id is the all-zero UUID.
        """
        if self._instance_id is None:
            return None
        if self._instance_id.hex == "00000000000000000000000000000000":
            return "__unknown__"
        return instance_uid_from_uuid(self._instance_id)

    @property
    def _mark_storage_root(self) -> UPath:
        """Path of the uid marker file; a legacy marker file is renamed to it."""
        marker_path = self.root / STORAGE_UID_FILE_KEY
        legacy_filepath = self.root / LEGACY_STORAGE_UID_FILE_KEY
        if not legacy_filepath.exists():
            return marker_path
        logger.warning(
            f"found legacy marker file, renaming it from {legacy_filepath} to {marker_path}"
        )
        legacy_filepath.rename(marker_path)
        return marker_path

    @property
    def record(self) -> Any:
        """Storage record in the current instance, queried once and then cached."""
        if self._record is not None:
            return self._record
        # dynamic import because of import order
        from lamindb.models import Storage

        from ._settings import settings

        storages = Storage.objects.using(settings._using_key)
        self._record = storages.get(root=self.root_as_str)
        return self._record

    def __repr__(self):
        """String rep."""
        s = f"root='{self.root_as_str}', uid='{self.uid}'"
        if self._uuid is not None:
            s = s + f", uuid='{self._uuid.hex}'"
        return f"StorageSettings({s})"

    @property
    def root(self) -> UPath:
        """Root storage location."""
        if self._root is None:
            # below makes network requests to get credentials
            self._root = create_path(self._root_init, access_token=self.access_token)
            return self._root
        if getattr(self._root, "protocol", "") == "s3":
            # this is needed to be sure that the root always has nonexpired credentials
            # this just checks for time of the cached credentials in most cases
            options_manager = get_aws_options_manager()
            return options_manager.enrich_path(
                self._root, access_token=self.access_token
            )
        return self._root

    def _set_fs_kwargs(self, **kwargs):
        """Set additional fsspec arguments for cloud root.

        Nothing happens for a local root or if no arguments are passed.

        Example:

        >>> ln.setup.settings.storage._set_fs_kwargs(  # any fsspec args
        >>>     profile="some_profile", cache_regions=True
        >>> )
        """
        if isinstance(self._root, LocalPathClasses) or kwargs == {}:
            return
        self._root = UPath(self.root, **kwargs)

    @property
    def root_as_str(self) -> str:
        """Formatted root string, without trailing slash."""
        root_init = self._root_init
        # embed endpoint_url into path string for storing and displaying
        if root_init.protocol == "s3":
            endpoint_url = root_init.storage_options.get("endpoint_url", None)
            # LAMIN_ENDPOINTS include None
            if endpoint_url not in LAMIN_ENDPOINTS:
                return f"s3://{self._root_init.path.rstrip('/')}?endpoint_url={endpoint_url}"
        return root_init.as_posix().rstrip("/")

    @property
    def cache_dir(
        self,
    ) -> UPath:
        """Cache root, a local directory to cache cloud files."""
        from lamindb_setup import settings

        return settings.cache_dir

    @property
    def type_is_cloud(self) -> bool:
        """`True` if `storage_root` is in cloud, `False` otherwise."""
        return self.type != "local"

    @property
    def host(self) -> str | None:
        """Host identifier for local storage locations.

        Is `None` for locations with `type != "local"`.

        A globally unique user-defined host identifier (cluster, server, laptop, etc.).
        For local locations it is the value of `region`.
        """
        return self.region if self.type == "local" else None

    @property
    def region(self) -> str | None:
        """Storage region, inferred from the root on first access if not set."""
        if self._region is None:
            self._region = get_storage_region(self.root_as_str)
        return self._region

    @property
    def type(self) -> StorageType:
        """AWS S3 vs. Google Cloud vs. local.

        Returns the protocol as a string, e.g., "local", "s3", "gs", "http", "https".
        """
        return get_storage_type(self.root_as_str)

    @property
    def is_on_hub(self) -> bool:
        """Whether an internal storage uuid is set for this location.

        Only works if user has access to the instance.
        """
        return self._uuid is not None

    def cloud_to_local(
        self, filepath: UPathStr, cache_key: str | None = None, **kwargs
    ) -> UPath:
        """Local (or local cache) filepath from filepath."""
        from lamindb_setup import settings

        paths = settings.paths
        return paths.cloud_to_local(filepath=filepath, cache_key=cache_key, **kwargs)

    def cloud_to_local_no_update(
        self, filepath: UPathStr, cache_key: str | None = None
    ) -> UPath:
        """Same as `cloud_to_local`, delegating to `cloud_to_local_no_update` of the settings paths."""
        from lamindb_setup import settings

        paths = settings.paths
        return paths.cloud_to_local_no_update(filepath=filepath, cache_key=cache_key)

    def key_to_filepath(self, filekey: UPathStr) -> UPath:
        """Cloud or local filepath from filekey."""
        return self.root / filekey

    def local_filepath(self, filekey: UPathStr) -> UPath:
        """Local (cache) filepath from filekey."""
        filepath = self.key_to_filepath(filekey)
        return self.cloud_to_local(filepath)
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import secrets
|
|
5
|
+
import string
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
7
|
+
from uuid import UUID
|
|
8
|
+
|
|
9
|
+
import fsspec
|
|
10
|
+
from lamin_utils import logger
|
|
11
|
+
|
|
12
|
+
from lamindb_setup.errors import StorageAlreadyManaged
|
|
13
|
+
|
|
14
|
+
from ._aws_options import (
|
|
15
|
+
HOSTED_REGIONS,
|
|
16
|
+
LAMIN_ENDPOINTS,
|
|
17
|
+
get_aws_options_manager,
|
|
18
|
+
)
|
|
19
|
+
from ._aws_storage import find_closest_aws_region
|
|
20
|
+
from ._deprecated import deprecated
|
|
21
|
+
from .hashing import hash_and_encode_as_b62
|
|
22
|
+
from .upath import (
|
|
23
|
+
LocalPathClasses,
|
|
24
|
+
UPath,
|
|
25
|
+
_split_path_query,
|
|
26
|
+
create_path,
|
|
27
|
+
get_storage_region,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from lamindb_setup.types import StorageType, UPathStr
|
|
32
|
+
|
|
33
|
+
# marker file, relative to the storage root; its first line holds the storage uid
STORAGE_UID_FILE_KEY = ".lamindb/storage_uid.txt"
# former name of the marker file; it gets renamed to STORAGE_UID_FILE_KEY when found
LEGACY_STORAGE_UID_FILE_KEY = ".lamindb/_is_initialized"

# fsspec protocols that are accepted for a storage root
# "file" is shown to users as "local"
VALID_PROTOCOLS = ("file", "gs", "s3", "hf", "http", "https")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def base62(n_char: int) -> str:
    """Return a random string of `n_char` characters from digits and ASCII letters.

    Comparable to a nanoid whose alphabet has neither hyphen nor underscore.
    Every character is drawn with `secrets.choice`.
    """
    # digits first, then upper-case letters, then lower-case letters
    symbols = string.digits + string.ascii_letters.swapcase()
    picked = [secrets.choice(symbols) for _ in range(n_char)]
    return "".join(picked)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def instance_uid_from_uuid(instance_id: UUID) -> str:
    """Derive the instance uid from the instance UUID.

    The hex form of the UUID is passed to `hash_and_encode_as_b62` and the
    result is cut to its first 12 characters.
    """
    encoded = hash_and_encode_as_b62(instance_id.hex)
    return encoded[:12]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_storage_type(root_as_str: str) -> StorageType:
    """Return the storage type for a root string.

    The type is the fsspec protocol of the root, except that the `file`
    protocol is reported as `"local"`. The protocol is not validated here:
    init_storage checks that it belongs to VALID_PROTOCOLS.
    """
    import fsspec

    detected = fsspec.utils.get_protocol(root_as_str)
    if detected == "file":
        return "local"  # type: ignore
    return detected  # type: ignore
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def mark_storage_root(
    root: UPathStr, uid: str, instance_id: UUID, instance_slug: str
) -> Literal["__is_marked__"] | str:
    """Write or check the uid marker file of a storage location.

    We need a file in folder-like storage locations on S3 to avoid permission
    errors from leveraging s3fs on an empty hosted storage location
    (path.fs.find raises a PermissionError). We also need it in case a storage
    location is ambiguous because a server / local environment doesn't have a
    globally unique identifier: we then screen for this file to map the path
    on a storage location in the registry.

    Args:
        root: Root of the storage location.
        uid: The uid this storage location is expected to carry.
        instance_id: UUID of the instance, recorded in a newly written marker.
        instance_slug: Slug of the instance, recorded in a newly written marker.

    Returns:
        `"__is_marked__"` if the marker was written by this call or already
        holds `uid`, otherwise the uid found in the existing marker file.
    """
    root_upath = UPath(root)
    legacy_mark_upath = root_upath / LEGACY_STORAGE_UID_FILE_KEY
    mark_upath = root_upath / STORAGE_UID_FILE_KEY
    # move a marker that still has the legacy file name to the current name
    if legacy_mark_upath.exists():
        legacy_mark_upath.rename(mark_upath)
    existing_uid = ""
    if mark_upath.exists():
        # an empty marker file has no first line: treat it like a missing marker
        # instead of failing with an IndexError
        marker_lines = mark_upath.read_text().splitlines()
        existing_uid = marker_lines[0] if marker_lines else ""
    if existing_uid == "":
        instance_uid = instance_uid_from_uuid(instance_id)
        text = f"{uid}\ncreation info:\ninstance_slug={instance_slug}\ninstance_id={instance_id.hex}\ninstance_uid={instance_uid}"
        mark_upath.write_text(text)
    elif existing_uid != uid:
        # report the uid that is stored in the marker, not the one passed in,
        # so that the caller can name and adopt the uid of the existing owner
        return existing_uid
    # covers the case in which existing uid is the same as uid
    # and the case in which there was no existing uid
    return "__is_marked__"
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def init_storage(
    root: UPathStr,
    instance_id: UUID,
    instance_slug: str,
    register_hub: bool | None = None,
    prevent_register_hub: bool = False,
    init_instance: bool = False,
    created_by: UUID | None = None,
    access_token: str | None = None,
    region: str | None = None,
) -> tuple[
    StorageSettings,
    Literal["hub-record-not-created", "hub-record-retrieved", "hub-record-created"],
]:
    """Create the settings of a storage location, sync with the hub, mark the root.

    Args:
        root: Storage root. `"create-s3"` or `"create-s3--<region>"` requests
            a hosted S3 location whose root is built from the region and the uid.
        instance_id: UUID of the instance that initializes the location.
        instance_slug: Slug of the instance, written into the marker file.
        register_hub: Whether to register on the hub; cloud locations are
            registered even if this is falsy.
        prevent_register_hub: If `True`, no hub record is created.
        init_instance: If `True`, a location that is already marked with
            another uid raises instead of only logging a warning.
        created_by: Forwarded to `init_storage_hub`.
        access_token: Forwarded to the settings and to the hub calls.
        region: Region; for `"create-s3"` it has to be in `HOSTED_REGIONS`.

    Returns:
        The storage settings and the status of the hub record.

    Raises:
        ValueError: If `root` contains `".lamindb"`, the region is not hosted,
            the protocol is not supported, or a local location is to be
            registered on the hub without a host.
        StorageAlreadyManaged: If `init_instance` is `True` and the location
            is already marked with another uid.
    """
    from ._hub_core import delete_storage_record, init_storage_hub

    assert root is not None, "`root` argument can't be `None`"

    root_str = str(root)  # ensure we have a string
    if ".lamindb" in root_str:
        raise ValueError(
            'Please pass a folder name that does not end or contain ".lamindb"'
        )
    uid = os.getenv("LAMINDB_STORAGE_LNID_INIT")
    if uid is not None:
        # this means we constructed a hosted location of shape s3://bucket-name/uid
        # within LaminHub
        assert root_str.endswith(uid)
    else:
        uid = base62(12)
    lamin_env = os.getenv("LAMIN_ENV")
    if root_str.startswith("create-s3"):
        if root_str != "create-s3":
            assert "--" in root_str, "example: `create-s3--eu-central-1`"
            region = root_str.replace("create-s3--", "")
        if region is None:
            region = find_closest_aws_region()
        elif region not in HOSTED_REGIONS:
            raise ValueError(f"region has to be one of {HOSTED_REGIONS}")
        # the per-region bucket is used in production, a shared test bucket otherwise
        is_prod = lamin_env is None or lamin_env == "prod"
        root = (
            f"s3://lamin-{region}/{uid}"
            if is_prod
            else f"s3://lamin-hosted-test/{uid}"
        )
    else:
        input_protocol = fsspec.utils.get_protocol(root_str)
        if input_protocol not in VALID_PROTOCOLS:
            # show local instead of file
            valid_protocols = ("local",) + VALID_PROTOCOLS[1:]
            raise ValueError(
                f"Protocol {input_protocol} is not supported, valid protocols are {', '.join(valid_protocols)}"
            )
    ssettings = StorageSettings(
        uid=uid,
        root=root,
        region=region,
        instance_id=instance_id,
        access_token=access_token,
    )
    # default to registering cloud storage
    register_hub = register_hub or ssettings.type_is_cloud
    if register_hub and not ssettings.type_is_cloud and ssettings.host is None:
        raise ValueError(
            "`host` must be set for local storage locations that are registered on the hub"
        )
    # this retrieves the storage record if it exists already in the hub
    # and updates uid and instance_id in ssettings
    hub_record_status = init_storage_hub(
        ssettings,
        auto_populate_instance=not init_instance,
        created_by=created_by,
        access_token=access_token,
        prevent_creation=prevent_register_hub or not register_hub,
    )
    # write access is only checked if the storage record has not been
    # retrieved from the hub
    if hub_record_status == "hub-record-retrieved":
        return ssettings, hub_record_status
    try:
        # (federated) credentials for AWS access are provisioned under-the-hood
        # discussion: https://laminlabs.slack.com/archives/C04FPE8V01W/p1719260587167489
        # if access_token was passed in ssettings, it is used here
        marking_result = mark_storage_root(
            root=ssettings.root,
            uid=ssettings.uid,
            instance_id=instance_id,
            instance_slug=instance_slug,
        )
    except Exception:
        marking_result = "no-write-access"
    if marking_result == "__is_marked__":
        return ssettings, hub_record_status
    if marking_result == "no-write-access":
        logger.important(
            f"due to lack of write access, LaminDB won't manage this storage location: {ssettings.root_as_str}"
        )
        # indicate that this storage location is not managed by the instance
        ssettings._instance_id = None
    else:
        s = "S" if init_instance else "s"  # upper case for error message
        message = (
            f"{s}torage location {ssettings.root_as_str} is already marked with uid {marking_result}, meaning that it is managed by another LaminDB instance -- "
            "if you manage your instance with LaminHub you get an overview of all your storage locations"
        )
        if init_instance:
            raise StorageAlreadyManaged(message)
        logger.warning(message)
        # the all-zero UUID indicates that the managing instance is not known
        ssettings._instance_id = UUID("00000000000000000000000000000000")
        ssettings._uid = marking_result
    # this condition means that the hub record was created
    if ssettings._uuid is not None:
        delete_storage_record(ssettings, access_token=access_token)  # type: ignore
        ssettings._uuid_ = None
        hub_record_status = "hub-record-not-created"
    return ssettings, hub_record_status
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class StorageSettings:
|
|
204
|
+
"""Settings for a storage location (local or cloud).
|
|
205
|
+
|
|
206
|
+
Do not instantiate this class yourself, use `ln.Storage` instead.
|
|
207
|
+
"""
|
|
208
|
+
|
|
209
|
+
def __init__(
|
|
210
|
+
self,
|
|
211
|
+
root: UPathStr,
|
|
212
|
+
region: str | None = None,
|
|
213
|
+
uid: str | None = None,
|
|
214
|
+
uuid: UUID | None = None,
|
|
215
|
+
instance_id: UUID | None = None,
|
|
216
|
+
# note that passing access_token prevents credentials caching
|
|
217
|
+
access_token: str | None = None,
|
|
218
|
+
):
|
|
219
|
+
self._uid = uid
|
|
220
|
+
self._uuid_ = uuid
|
|
221
|
+
self._root_init = UPath(root).expanduser()
|
|
222
|
+
if isinstance(self._root_init, LocalPathClasses): # local paths
|
|
223
|
+
try:
|
|
224
|
+
(self._root_init / ".lamindb").mkdir(parents=True, exist_ok=True)
|
|
225
|
+
self._root_init = self._root_init.resolve()
|
|
226
|
+
except Exception:
|
|
227
|
+
logger.warning(
|
|
228
|
+
f"unable to create .lamindb/ folder in {self._root_init}"
|
|
229
|
+
)
|
|
230
|
+
self._root = None
|
|
231
|
+
self._instance_id = instance_id
|
|
232
|
+
# we don't yet infer region here to make init fast
|
|
233
|
+
self._region = region
|
|
234
|
+
# would prefer to type below as Registry, but need to think through import order
|
|
235
|
+
self._record: Any | None = None
|
|
236
|
+
# save access_token here for use in self.root
|
|
237
|
+
self.access_token = access_token
|
|
238
|
+
|
|
239
|
+
# local storage
|
|
240
|
+
self._has_local = False
|
|
241
|
+
self._local = None
|
|
242
|
+
|
|
243
|
+
@property
|
|
244
|
+
@deprecated("_id")
|
|
245
|
+
def id(self) -> int:
|
|
246
|
+
return self._id
|
|
247
|
+
|
|
248
|
+
@property
|
|
249
|
+
def _id(self) -> int:
|
|
250
|
+
"""Storage id.
|
|
251
|
+
|
|
252
|
+
This id is only valid in the current instance and not globally unique. Only for internal use.
|
|
253
|
+
"""
|
|
254
|
+
return self.record.id
|
|
255
|
+
|
|
256
|
+
@property
|
|
257
|
+
def _uuid(self) -> UUID | None:
|
|
258
|
+
"""Lamin's internal storage uuid."""
|
|
259
|
+
return self._uuid_
|
|
260
|
+
|
|
261
|
+
@property
|
|
262
|
+
def uid(self) -> str:
|
|
263
|
+
"""Storage uid."""
|
|
264
|
+
if self._uid is None:
|
|
265
|
+
self._uid = self.record.uid
|
|
266
|
+
return self._uid
|
|
267
|
+
|
|
268
|
+
@property
|
|
269
|
+
def instance_uid(self) -> str | None:
|
|
270
|
+
"""The `uid` of the managing LaminDB instance.
|
|
271
|
+
|
|
272
|
+
If `None`, the storage location is not managed by any LaminDB instance.
|
|
273
|
+
"""
|
|
274
|
+
if self._instance_id is not None:
|
|
275
|
+
if self._instance_id.hex == "00000000000000000000000000000000":
|
|
276
|
+
instance_uid = "__unknown__"
|
|
277
|
+
else:
|
|
278
|
+
instance_uid = instance_uid_from_uuid(self._instance_id)
|
|
279
|
+
else:
|
|
280
|
+
instance_uid = None
|
|
281
|
+
return instance_uid
|
|
282
|
+
|
|
283
|
+
@property
def _mark_storage_root(self) -> UPath:
    """Path of the marker file under the storage root.

    If a marker file is found at the legacy key, it is renamed to the current
    key (with a warning) before the current marker path is returned.
    """
    current_marker = self.root / STORAGE_UID_FILE_KEY
    legacy_marker = self.root / LEGACY_STORAGE_UID_FILE_KEY
    if not legacy_marker.exists():
        return current_marker
    logger.warning(
        f"found legacy marker file, renaming it from {legacy_marker} to {current_marker}"
    )
    legacy_marker.rename(current_marker)
    return current_marker
|
|
293
|
+
|
|
294
|
+
@property
def record(self) -> Any:
    """Storage record in the current instance.

    The record is fetched once, by matching `root` against `self.root_as_str`
    in the database selected by `settings._using_key`, and then cached in
    `self._record`.
    """
    if self._record is not None:
        return self._record

    # imported lazily because of import order
    from lamindb.models import Storage

    from ._settings import settings

    storage_query = Storage.objects.using(settings._using_key)
    self._record = storage_query.get(root=self.root_as_str)
    return self._record
|
|
307
|
+
|
|
308
|
+
def __repr__(self):
    """Return `StorageSettings(...)` with root, uid and, when set, the uuid hex."""
    fields = [f"root='{self.root_as_str}'", f"uid='{self.uid}'"]
    if self._uuid is not None:
        fields.append(f"uuid='{self._uuid.hex}'")
    return f"StorageSettings({', '.join(fields)})"
|
|
314
|
+
|
|
315
|
+
@property
def root(self) -> UPath:
    """Root storage location.

    On first access the path is built from `self._root_init` and cached in
    `self._root`. For a cached root whose protocol is `"s3"`, the path is
    passed through the AWS options manager on every access so that it carries
    non-expired credentials.
    """
    cached_root = self._root
    if cached_root is None:
        # below makes network requests to get credentials
        self._root = create_path(self._root_init, access_token=self.access_token)
        return self._root
    if getattr(cached_root, "protocol", "") == "s3":
        # in most cases this only checks the age of the cached credentials
        return get_aws_options_manager().enrich_path(
            cached_root, access_token=self.access_token
        )
    return cached_root
|
|
328
|
+
|
|
329
|
+
def _set_fs_kwargs(self, **kwargs):
    """Set additional fsspec arguments for a non-local root.

    Does nothing when no keyword arguments are given or when the cached root
    is a local path; otherwise the root is rebuilt with the given arguments.

    Example:

    >>> ln.setup.settings.storage._set_fs_kwargs(  # any fsspec args
    ...     profile="some_profile", cache_regions=True
    ... )
    """
    if not kwargs or isinstance(self._root, LocalPathClasses):
        return
    self._root = UPath(self.root, **kwargs)
|
|
340
|
+
|
|
341
|
+
@property
def root_as_str(self) -> str:
    """Root formatted as a string, without a trailing slash.

    For `s3` roots with an endpoint url that is not in `LAMIN_ENDPOINTS`, the
    endpoint url is embedded as a query parameter so that it is stored and
    displayed together with the path.
    """
    init_root = self._root_init
    if init_root.protocol == "s3":
        endpoint_url = init_root.storage_options.get("endpoint_url", None)
        # LAMIN_ENDPOINTS include None
        if endpoint_url not in LAMIN_ENDPOINTS:
            bucket_key = init_root.path.rstrip("/")
            return f"s3://{bucket_key}?endpoint_url={endpoint_url}"
    return init_root.as_posix().rstrip("/")
|
|
351
|
+
|
|
352
|
+
@property
def cache_dir(self) -> UPath:
    """Cache root, a local directory to cache cloud files.

    The value is taken from the package-level `lamindb_setup.settings`.
    """
    from lamindb_setup import settings as setup_settings

    return setup_settings.cache_dir
|
|
360
|
+
|
|
361
|
+
@property
def type_is_cloud(self) -> bool:
    """`True` if the storage type is anything other than `"local"`."""
    storage_type = self.type
    return storage_type != "local"
|
|
365
|
+
|
|
366
|
+
@property
def host(self) -> str | None:
    """Host identifier for local storage locations.

    A globally unique user-defined host identifier (cluster, server, laptop,
    etc.). It is read from `region` for local storage and is `None` for
    locations with `type != "local"`.
    """
    return self.region if self.type == "local" else None
|
|
377
|
+
|
|
378
|
+
@property
def region(self) -> str | None:
    """Storage region.

    When no region is cached, it is resolved from `root_as_str` via
    `get_storage_region` and stored in `self._region`.
    """
    cached_region = self._region
    if cached_region is None:
        cached_region = get_storage_region(self.root_as_str)
        self._region = cached_region
    return cached_region
|
|
384
|
+
|
|
385
|
+
@property
def type(self) -> StorageType:
    """AWS S3 vs. Google Cloud vs. local.

    Returns the protocol as a string, e.g., "local", "s3", "gs", "http", "https".
    The value is derived from `root_as_str` by `get_storage_type`.
    """
    root_string = self.root_as_str
    return get_storage_type(root_string)
|
|
392
|
+
|
|
393
|
+
@property
def is_on_hub(self) -> bool:
    """Whether this storage location is known to the hub.

    This is `True` exactly when the internal uuid is set.
    NOTE(review): per the original docstring this only works if the user has
    access to the instance — confirm where `_uuid_` gets populated.
    """
    return self._uuid is not None
|
|
403
|
+
|
|
404
|
+
def cloud_to_local(
    self, filepath: UPathStr, cache_key: str | None = None, **kwargs
) -> UPath:
    """Local (or local cache) filepath from filepath.

    Delegates to `settings.paths.cloud_to_local` of `lamindb_setup`, forwarding
    `cache_key` and any additional keyword arguments unchanged.
    """
    from lamindb_setup import settings as setup_settings

    path_helpers = setup_settings.paths
    return path_helpers.cloud_to_local(
        filepath=filepath, cache_key=cache_key, **kwargs
    )
|
|
413
|
+
|
|
414
|
+
def cloud_to_local_no_update(
    self, filepath: UPathStr, cache_key: str | None = None
) -> UPath:
    """Local (or local cache) filepath from filepath, via the no-update helper.

    Delegates to `settings.paths.cloud_to_local_no_update` of `lamindb_setup`.
    """
    from lamindb_setup import settings as setup_settings

    path_helpers = setup_settings.paths
    return path_helpers.cloud_to_local_no_update(
        filepath=filepath, cache_key=cache_key
    )
|
|
422
|
+
|
|
423
|
+
def key_to_filepath(self, filekey: UPathStr) -> UPath:
    """Cloud or local filepath obtained by joining `filekey` onto the root."""
    storage_root = self.root
    return storage_root / filekey
|
|
426
|
+
|
|
427
|
+
def local_filepath(self, filekey: UPathStr) -> UPath:
    """Local (cache) filepath from filekey.

    The key is first resolved against the root with `key_to_filepath`, then
    mapped to a local path with `cloud_to_local`.
    """
    resolved_path = self.key_to_filepath(filekey)
    return self.cloud_to_local(resolved_path)
|