lamindb_setup 0.76.7__py2.py3-none-any.whl → 0.77.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb_setup/__init__.py +6 -7
- lamindb_setup/_cache.py +34 -34
- lamindb_setup/_check.py +7 -7
- lamindb_setup/_check_setup.py +79 -79
- lamindb_setup/_close.py +35 -35
- lamindb_setup/_connect_instance.py +440 -433
- lamindb_setup/_delete.py +137 -137
- lamindb_setup/_django.py +41 -41
- lamindb_setup/_exportdb.py +68 -68
- lamindb_setup/_importdb.py +50 -50
- lamindb_setup/_init_instance.py +374 -374
- lamindb_setup/_migrate.py +239 -239
- lamindb_setup/_register_instance.py +36 -36
- lamindb_setup/_schema.py +27 -27
- lamindb_setup/_schema_metadata.py +411 -391
- lamindb_setup/_set_managed_storage.py +55 -55
- lamindb_setup/_setup_user.py +134 -118
- lamindb_setup/_silence_loggers.py +44 -44
- lamindb_setup/core/__init__.py +21 -21
- lamindb_setup/core/_aws_credentials.py +151 -151
- lamindb_setup/core/_aws_storage.py +48 -48
- lamindb_setup/core/_deprecated.py +55 -55
- lamindb_setup/core/_docs.py +14 -14
- lamindb_setup/core/_hub_client.py +173 -164
- lamindb_setup/core/_hub_core.py +524 -473
- lamindb_setup/core/_hub_crud.py +211 -211
- lamindb_setup/core/_hub_utils.py +109 -109
- lamindb_setup/core/_private_django_api.py +88 -88
- lamindb_setup/core/_settings.py +138 -138
- lamindb_setup/core/_settings_instance.py +461 -461
- lamindb_setup/core/_settings_load.py +105 -100
- lamindb_setup/core/_settings_save.py +81 -81
- lamindb_setup/core/_settings_storage.py +393 -393
- lamindb_setup/core/_settings_store.py +73 -72
- lamindb_setup/core/_settings_user.py +53 -51
- lamindb_setup/core/_setup_bionty_sources.py +101 -99
- lamindb_setup/core/cloud_sqlite_locker.py +232 -232
- lamindb_setup/core/django.py +113 -113
- lamindb_setup/core/exceptions.py +12 -12
- lamindb_setup/core/hashing.py +114 -114
- lamindb_setup/core/types.py +19 -19
- lamindb_setup/core/upath.py +779 -779
- {lamindb_setup-0.76.7.dist-info → lamindb_setup-0.77.0.dist-info}/METADATA +1 -1
- lamindb_setup-0.77.0.dist-info/RECORD +46 -0
- {lamindb_setup-0.76.7.dist-info → lamindb_setup-0.77.0.dist-info}/WHEEL +1 -1
- lamindb_setup-0.76.7.dist-info/RECORD +0 -46
- {lamindb_setup-0.76.7.dist-info → lamindb_setup-0.77.0.dist-info}/LICENSE +0 -0
|
@@ -1,393 +1,393 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
import secrets
|
|
5
|
-
import shutil
|
|
6
|
-
import string
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
|
9
|
-
|
|
10
|
-
from appdirs import AppDirs
|
|
11
|
-
from lamin_utils import logger
|
|
12
|
-
|
|
13
|
-
from ._aws_credentials import HOSTED_REGIONS, get_aws_credentials_manager
|
|
14
|
-
from ._aws_storage import find_closest_aws_region
|
|
15
|
-
from ._settings_save import save_system_storage_settings
|
|
16
|
-
from ._settings_store import system_storage_settings_file
|
|
17
|
-
from .upath import (
|
|
18
|
-
LocalPathClasses,
|
|
19
|
-
UPath,
|
|
20
|
-
create_path,
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
if TYPE_CHECKING:
|
|
24
|
-
from uuid import UUID
|
|
25
|
-
|
|
26
|
-
from .types import UPathStr
|
|
27
|
-
|
|
28
|
-
DIRS = AppDirs("lamindb", "laminlabs")
|
|
29
|
-
IS_INITIALIZED_KEY = ".lamindb/_is_initialized"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def base62(n_char: int) -> str:
|
|
33
|
-
"""Like nanoid without hyphen and underscore."""
|
|
34
|
-
alphabet = string.digits + string.ascii_letters.swapcase()
|
|
35
|
-
id = "".join(secrets.choice(alphabet) for i in range(n_char))
|
|
36
|
-
return id
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
def get_storage_region(path: UPathStr) -> str | None:
|
|
40
|
-
path_str = str(path)
|
|
41
|
-
if path_str.startswith("s3://"):
|
|
42
|
-
import botocore.session
|
|
43
|
-
from botocore.config import Config
|
|
44
|
-
from botocore.exceptions import ClientError
|
|
45
|
-
|
|
46
|
-
# strip the prefix and any suffixes of the bucket name
|
|
47
|
-
bucket = path_str.replace("s3://", "").split("/")[0]
|
|
48
|
-
session = botocore.session.get_session()
|
|
49
|
-
credentials = session.get_credentials()
|
|
50
|
-
if credentials is None or credentials.access_key is None:
|
|
51
|
-
config = Config(signature_version=botocore.session.UNSIGNED)
|
|
52
|
-
else:
|
|
53
|
-
config = None
|
|
54
|
-
s3_client = session.create_client("s3", config=config)
|
|
55
|
-
try:
|
|
56
|
-
response = s3_client.head_bucket(Bucket=bucket)
|
|
57
|
-
except ClientError as exc:
|
|
58
|
-
response = getattr(exc, "response", {})
|
|
59
|
-
if response.get("Error", {}).get("Code") == "404":
|
|
60
|
-
raise exc
|
|
61
|
-
region = (
|
|
62
|
-
response.get("ResponseMetadata", {})
|
|
63
|
-
.get("HTTPHeaders", {})
|
|
64
|
-
.get("x-amz-bucket-region")
|
|
65
|
-
)
|
|
66
|
-
else:
|
|
67
|
-
region = None
|
|
68
|
-
return region
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def mark_storage_root(root: UPathStr, uid: str):
|
|
72
|
-
# we need to touch a 0-byte object in folder-like storage location on S3 to avoid
|
|
73
|
-
# permission errors from leveraging s3fs on an empty hosted storage location
|
|
74
|
-
# for consistency, we write this file everywhere
|
|
75
|
-
root_upath = UPath(root)
|
|
76
|
-
mark_upath = root_upath / IS_INITIALIZED_KEY
|
|
77
|
-
mark_upath.write_text(uid)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def init_storage(
|
|
81
|
-
root: UPathStr,
|
|
82
|
-
instance_id: UUID | None = None,
|
|
83
|
-
register_hub: bool | None = None,
|
|
84
|
-
prevent_register_hub: bool = False,
|
|
85
|
-
init_instance: bool = False,
|
|
86
|
-
) -> tuple[
|
|
87
|
-
StorageSettings,
|
|
88
|
-
Literal["hub-record-not-created", "hub-record-retireved", "hub-record-created"],
|
|
89
|
-
]:
|
|
90
|
-
if root is None:
|
|
91
|
-
raise ValueError("`storage` argument can't be `None`")
|
|
92
|
-
root_str = str(root) # ensure we have a string
|
|
93
|
-
if ".lamindb" in root_str:
|
|
94
|
-
raise ValueError(
|
|
95
|
-
'Please pass a folder name that does not end or contain ".lamindb"'
|
|
96
|
-
)
|
|
97
|
-
uid = base62(12)
|
|
98
|
-
region = None
|
|
99
|
-
lamin_env = os.getenv("LAMIN_ENV")
|
|
100
|
-
if root_str.startswith("create-s3"):
|
|
101
|
-
if root_str != "create-s3":
|
|
102
|
-
assert "--" in root_str, "example: `create-s3--eu-central-1`"
|
|
103
|
-
region = root_str.replace("create-s3--", "")
|
|
104
|
-
if region is None:
|
|
105
|
-
region = find_closest_aws_region()
|
|
106
|
-
else:
|
|
107
|
-
if region not in HOSTED_REGIONS:
|
|
108
|
-
raise ValueError(f"region has to be one of {HOSTED_REGIONS}")
|
|
109
|
-
if lamin_env is None or lamin_env == "prod":
|
|
110
|
-
root_str = f"s3://lamin-{region}/{uid}"
|
|
111
|
-
else:
|
|
112
|
-
root_str = f"s3://lamin-hosted-test/{uid}"
|
|
113
|
-
elif root_str.startswith(("gs://", "s3://")):
|
|
114
|
-
pass
|
|
115
|
-
else: # local path
|
|
116
|
-
try:
|
|
117
|
-
_ = Path(root_str)
|
|
118
|
-
except Exception as e:
|
|
119
|
-
logger.error("`storage` is not a valid local, GCP storage or AWS S3 path")
|
|
120
|
-
raise e
|
|
121
|
-
ssettings = StorageSettings(
|
|
122
|
-
uid=uid,
|
|
123
|
-
root=root_str,
|
|
124
|
-
region=region,
|
|
125
|
-
instance_id=instance_id,
|
|
126
|
-
)
|
|
127
|
-
# this stores the result of init_storage_hub
|
|
128
|
-
hub_record_status: Literal[
|
|
129
|
-
"hub-record-not-created", "hub-record-retireved", "hub-record-created"
|
|
130
|
-
] = "hub-record-not-created"
|
|
131
|
-
# the below might update the uid with one that's already taken on the hub
|
|
132
|
-
if not prevent_register_hub:
|
|
133
|
-
if ssettings.type_is_cloud or register_hub:
|
|
134
|
-
from ._hub_core import delete_storage_record
|
|
135
|
-
from ._hub_core import init_storage as init_storage_hub
|
|
136
|
-
|
|
137
|
-
hub_record_status = init_storage_hub(
|
|
138
|
-
ssettings, auto_populate_instance=not init_instance
|
|
139
|
-
)
|
|
140
|
-
# below comes last only if everything else was successful
|
|
141
|
-
try:
|
|
142
|
-
# (federated) credentials for AWS access are provisioned under-the-hood
|
|
143
|
-
# discussion: https://laminlabs.slack.com/archives/C04FPE8V01W/p1719260587167489
|
|
144
|
-
mark_storage_root(ssettings.root, ssettings.uid) # type: ignore
|
|
145
|
-
except Exception:
|
|
146
|
-
logger.important(
|
|
147
|
-
f"due to lack of write access, LaminDB won't manage storage location: {ssettings.root}"
|
|
148
|
-
)
|
|
149
|
-
# we have to check hub_record_status here because
|
|
150
|
-
# _select_storage inside init_storage_hub also populates ssettings._uuid
|
|
151
|
-
# and we don't want to delete an existing storage record here
|
|
152
|
-
# only newly created
|
|
153
|
-
if hub_record_status == "hub-record-created" and ssettings._uuid is not None:
|
|
154
|
-
delete_storage_record(ssettings._uuid) # type: ignore
|
|
155
|
-
ssettings._instance_id = None
|
|
156
|
-
return ssettings, hub_record_status
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
def _process_cache_path(cache_path: str | Path | UPath | None):
|
|
160
|
-
if cache_path is None or cache_path == "null":
|
|
161
|
-
return None
|
|
162
|
-
cache_dir = UPath(cache_path)
|
|
163
|
-
if not isinstance(cache_dir, LocalPathClasses):
|
|
164
|
-
raise ValueError("cache dir should be a local path.")
|
|
165
|
-
if cache_dir.exists() and not cache_dir.is_dir():
|
|
166
|
-
raise ValueError("cache dir should be a directory.")
|
|
167
|
-
return cache_dir
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
class StorageSettings:
|
|
171
|
-
"""Settings for a given storage location (local or cloud)."""
|
|
172
|
-
|
|
173
|
-
def __init__(
|
|
174
|
-
self,
|
|
175
|
-
root: UPathStr,
|
|
176
|
-
region: str | None = None,
|
|
177
|
-
uid: str | None = None,
|
|
178
|
-
uuid: UUID | None = None,
|
|
179
|
-
instance_id: UUID | None = None,
|
|
180
|
-
# note that passing access_token prevents credentials caching
|
|
181
|
-
access_token: str | None = None,
|
|
182
|
-
):
|
|
183
|
-
self._uid = uid
|
|
184
|
-
self._uuid_ = uuid
|
|
185
|
-
self._root_init = UPath(root)
|
|
186
|
-
if isinstance(self._root_init, LocalPathClasses): # local paths
|
|
187
|
-
try:
|
|
188
|
-
(self._root_init / ".lamindb").mkdir(parents=True, exist_ok=True)
|
|
189
|
-
self._root_init = self._root_init.resolve()
|
|
190
|
-
except Exception:
|
|
191
|
-
logger.warning(f"unable to create .lamindb folder in {self._root_init}")
|
|
192
|
-
pass
|
|
193
|
-
self._root = None
|
|
194
|
-
self._instance_id = instance_id
|
|
195
|
-
# we don't yet infer region here to make init fast
|
|
196
|
-
self._region = region
|
|
197
|
-
# would prefer to type below as Registry, but need to think through import order
|
|
198
|
-
self._record: Any | None = None
|
|
199
|
-
# cache settings
|
|
200
|
-
self._storage_settings_file = system_storage_settings_file()
|
|
201
|
-
if self._storage_settings_file.exists():
|
|
202
|
-
from dotenv import dotenv_values
|
|
203
|
-
|
|
204
|
-
cache_path = dotenv_values(self._storage_settings_file)[
|
|
205
|
-
"lamindb_cache_path"
|
|
206
|
-
]
|
|
207
|
-
self._cache_dir = _process_cache_path(cache_path)
|
|
208
|
-
else:
|
|
209
|
-
self._cache_dir = None
|
|
210
|
-
# save access_token here for use in self.root
|
|
211
|
-
self.access_token = access_token
|
|
212
|
-
|
|
213
|
-
# local storage
|
|
214
|
-
self._has_local = False
|
|
215
|
-
self._local = None
|
|
216
|
-
|
|
217
|
-
@property
|
|
218
|
-
def id(self) -> int:
|
|
219
|
-
"""Storage id in current instance."""
|
|
220
|
-
return self.record.id
|
|
221
|
-
|
|
222
|
-
@property
|
|
223
|
-
def _uuid(self) -> UUID | None:
|
|
224
|
-
"""Lamin's internal storage uuid."""
|
|
225
|
-
return self._uuid_
|
|
226
|
-
|
|
227
|
-
@property
|
|
228
|
-
def uid(self) -> str | None:
|
|
229
|
-
"""Storage id."""
|
|
230
|
-
if self._uid is None:
|
|
231
|
-
self._uid = self.record.uid
|
|
232
|
-
return self._uid
|
|
233
|
-
|
|
234
|
-
@property
|
|
235
|
-
def _mark_storage_root(self) -> UPath:
|
|
236
|
-
return self.root / IS_INITIALIZED_KEY
|
|
237
|
-
|
|
238
|
-
@property
|
|
239
|
-
def record(self) -> Any:
|
|
240
|
-
"""Storage record in current instance."""
|
|
241
|
-
if self._record is None:
|
|
242
|
-
# dynamic import because of import order
|
|
243
|
-
from lnschema_core.models import Storage
|
|
244
|
-
|
|
245
|
-
from ._settings import settings
|
|
246
|
-
|
|
247
|
-
self._record = Storage.objects.using(settings._using_key).get(
|
|
248
|
-
root=self.root_as_str
|
|
249
|
-
)
|
|
250
|
-
return self._record
|
|
251
|
-
|
|
252
|
-
def __repr__(self):
|
|
253
|
-
"""String rep."""
|
|
254
|
-
s = f"root='{self.root_as_str}', uid='{self.uid}'"
|
|
255
|
-
if self._uuid is not None:
|
|
256
|
-
s += f", uuid='{self._uuid.hex}'"
|
|
257
|
-
return f"StorageSettings({s})"
|
|
258
|
-
|
|
259
|
-
@property
|
|
260
|
-
def root(self) -> UPath:
|
|
261
|
-
"""Root storage location."""
|
|
262
|
-
if self._root is None:
|
|
263
|
-
# below makes network requests to get credentials
|
|
264
|
-
self._root = create_path(self._root_init, access_token=self.access_token)
|
|
265
|
-
elif getattr(self._root, "protocol", "") == "s3":
|
|
266
|
-
# this is needed to be sure that the root always has nonexpired credentials
|
|
267
|
-
# this just checks for time of the cached credentials in most cases
|
|
268
|
-
return get_aws_credentials_manager().enrich_path(
|
|
269
|
-
self._root, access_token=self.access_token
|
|
270
|
-
)
|
|
271
|
-
return self._root
|
|
272
|
-
|
|
273
|
-
def _set_fs_kwargs(self, **kwargs):
|
|
274
|
-
"""Set additional fsspec arguments for cloud root.
|
|
275
|
-
|
|
276
|
-
Example:
|
|
277
|
-
|
|
278
|
-
>>> ln.setup.settings.storage._set_fs_kwargs( # any fsspec args
|
|
279
|
-
>>> profile="some_profile", cache_regions=True
|
|
280
|
-
>>> )
|
|
281
|
-
"""
|
|
282
|
-
if not isinstance(self._root, LocalPathClasses) and kwargs != {}:
|
|
283
|
-
self._root = UPath(self.root, **kwargs)
|
|
284
|
-
|
|
285
|
-
@property
|
|
286
|
-
def root_as_str(self) -> str:
|
|
287
|
-
"""Formatted root string."""
|
|
288
|
-
return self._root_init.as_posix().rstrip("/")
|
|
289
|
-
|
|
290
|
-
@property
|
|
291
|
-
def cache_dir(
|
|
292
|
-
self,
|
|
293
|
-
) -> UPath:
|
|
294
|
-
"""Cache root, a local directory to cache cloud files."""
|
|
295
|
-
if "LAMIN_CACHE_DIR" in os.environ:
|
|
296
|
-
cache_dir = UPath(os.environ["LAMIN_CACHE_DIR"])
|
|
297
|
-
elif self._cache_dir is None:
|
|
298
|
-
cache_dir = UPath(DIRS.user_cache_dir)
|
|
299
|
-
else:
|
|
300
|
-
cache_dir = self._cache_dir
|
|
301
|
-
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
302
|
-
return cache_dir
|
|
303
|
-
|
|
304
|
-
@cache_dir.setter
|
|
305
|
-
def cache_dir(self, cache_dir: UPathStr):
|
|
306
|
-
"""Set cache root."""
|
|
307
|
-
from lamindb_setup import settings
|
|
308
|
-
|
|
309
|
-
if settings.instance._is_cloud_sqlite:
|
|
310
|
-
src_sqlite_file = settings.instance._sqlite_file_local
|
|
311
|
-
else:
|
|
312
|
-
src_sqlite_file = None
|
|
313
|
-
|
|
314
|
-
save_cache_dir = self._cache_dir
|
|
315
|
-
|
|
316
|
-
new_cache_dir = _process_cache_path(cache_dir)
|
|
317
|
-
if new_cache_dir is not None:
|
|
318
|
-
new_cache_dir.mkdir(parents=True, exist_ok=True)
|
|
319
|
-
new_cache_dir = new_cache_dir.resolve()
|
|
320
|
-
self._cache_dir = new_cache_dir
|
|
321
|
-
|
|
322
|
-
try:
|
|
323
|
-
if src_sqlite_file is not None:
|
|
324
|
-
dst_sqlite_file = settings.instance._sqlite_file_local
|
|
325
|
-
dst_sqlite_file.parent.mkdir(parents=True, exist_ok=True)
|
|
326
|
-
if dst_sqlite_file.exists():
|
|
327
|
-
dst_sqlite_file.unlink()
|
|
328
|
-
shutil.move(src_sqlite_file, dst_sqlite_file) # type: ignore
|
|
329
|
-
save_system_storage_settings(self._cache_dir, self._storage_settings_file)
|
|
330
|
-
except Exception as e:
|
|
331
|
-
self._cache_dir = save_cache_dir
|
|
332
|
-
raise e
|
|
333
|
-
|
|
334
|
-
@property
|
|
335
|
-
def type_is_cloud(self) -> bool:
|
|
336
|
-
"""`True` if `storage_root` is in cloud, `False` otherwise."""
|
|
337
|
-
return self.type != "local"
|
|
338
|
-
|
|
339
|
-
@property
|
|
340
|
-
def region(self) -> str | None:
|
|
341
|
-
"""Storage region."""
|
|
342
|
-
if self._region is None:
|
|
343
|
-
self._region = get_storage_region(self.root_as_str)
|
|
344
|
-
return self._region
|
|
345
|
-
|
|
346
|
-
@property
|
|
347
|
-
def type(self) -> Literal["local", "s3", "gs"]:
|
|
348
|
-
"""AWS S3 vs. Google Cloud vs. local.
|
|
349
|
-
|
|
350
|
-
Returns the protocol as a string: "local", "s3", "gs".
|
|
351
|
-
"""
|
|
352
|
-
import fsspec
|
|
353
|
-
|
|
354
|
-
convert = {"file": "local"}
|
|
355
|
-
protocol = fsspec.utils.get_protocol(self.root_as_str)
|
|
356
|
-
return convert.get(protocol, protocol) # type: ignore
|
|
357
|
-
|
|
358
|
-
@property
|
|
359
|
-
def is_on_hub(self) -> bool:
|
|
360
|
-
"""Is this instance on the hub.
|
|
361
|
-
|
|
362
|
-
Only works if user has access to the instance.
|
|
363
|
-
"""
|
|
364
|
-
if self._uuid is None:
|
|
365
|
-
return False
|
|
366
|
-
else:
|
|
367
|
-
return True
|
|
368
|
-
|
|
369
|
-
def key_to_filepath(self, filekey: Path | UPath | str) -> UPath:
|
|
370
|
-
"""Cloud or local filepath from filekey."""
|
|
371
|
-
return self.root / filekey
|
|
372
|
-
|
|
373
|
-
def cloud_to_local(self, filepath: Path | UPath, **kwargs) -> UPath:
|
|
374
|
-
"""Local (cache) filepath from filepath."""
|
|
375
|
-
local_filepath = self.cloud_to_local_no_update(filepath) # type: ignore
|
|
376
|
-
if isinstance(filepath, UPath) and not isinstance(filepath, LocalPathClasses):
|
|
377
|
-
local_filepath.parent.mkdir(parents=True, exist_ok=True)
|
|
378
|
-
filepath.synchronize(local_filepath, **kwargs)
|
|
379
|
-
return local_filepath
|
|
380
|
-
|
|
381
|
-
# conversion to Path via cloud_to_local() would trigger download
|
|
382
|
-
# of remote file to cache if there already is one
|
|
383
|
-
# in pure write operations that update the cloud, we don't want this
|
|
384
|
-
# hence, we manually construct the local file path
|
|
385
|
-
# using the `.parts` attribute in the following line
|
|
386
|
-
def cloud_to_local_no_update(self, filepath: UPath) -> UPath:
|
|
387
|
-
if isinstance(filepath, UPath) and not isinstance(filepath, LocalPathClasses):
|
|
388
|
-
return self.cache_dir.joinpath(filepath._url.netloc, *filepath.parts[1:]) # type: ignore
|
|
389
|
-
return filepath
|
|
390
|
-
|
|
391
|
-
def local_filepath(self, filekey: Path | UPath | str) -> UPath:
|
|
392
|
-
"""Local (cache) filepath from filekey: `local(filepath(...))`."""
|
|
393
|
-
return self.cloud_to_local(self.key_to_filepath(filekey))
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import secrets
|
|
5
|
+
import shutil
|
|
6
|
+
import string
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Literal, Optional, Union
|
|
9
|
+
|
|
10
|
+
from appdirs import AppDirs
|
|
11
|
+
from lamin_utils import logger
|
|
12
|
+
|
|
13
|
+
from ._aws_credentials import HOSTED_REGIONS, get_aws_credentials_manager
|
|
14
|
+
from ._aws_storage import find_closest_aws_region
|
|
15
|
+
from ._settings_save import save_system_storage_settings
|
|
16
|
+
from ._settings_store import system_storage_settings_file
|
|
17
|
+
from .upath import (
|
|
18
|
+
LocalPathClasses,
|
|
19
|
+
UPath,
|
|
20
|
+
create_path,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from uuid import UUID
|
|
25
|
+
|
|
26
|
+
from .types import UPathStr
|
|
27
|
+
|
|
28
|
+
DIRS = AppDirs("lamindb", "laminlabs")
|
|
29
|
+
IS_INITIALIZED_KEY = ".lamindb/_is_initialized"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def base62(n_char: int) -> str:
    """Return a random base-62 string of length ``n_char``.

    Like nanoid without hyphen and underscore: every character is drawn with
    the ``secrets`` module from the ASCII digits and letters.

    Args:
        n_char: Number of characters to generate; ``0`` or a negative value
            yields the empty string.

    Returns:
        The generated identifier.
    """
    alphabet = string.digits + string.ascii_letters.swapcase()
    return "".join(secrets.choice(alphabet) for _ in range(n_char))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_storage_region(path: UPathStr) -> str | None:
    """Look up the AWS region of the bucket behind an ``s3://`` path.

    Any path that does not start with ``s3://`` yields ``None``. For S3 paths a
    ``head_bucket`` request is sent and the region is read from the
    ``x-amz-bucket-region`` response header. If the request fails with a
    ``ClientError``, the header is read from the error response instead,
    except for error code ``"404"``, which is re-raised.
    """
    location = str(path)
    if not location.startswith("s3://"):
        return None

    import botocore.session
    from botocore.config import Config
    from botocore.exceptions import ClientError

    # the bucket name is what remains before the first slash once the scheme is dropped
    bucket_name = location.replace("s3://", "").split("/")[0]
    boto_session = botocore.session.get_session()
    creds = boto_session.get_credentials()
    # without a usable access key the client is configured for unsigned requests
    anonymous = creds is None or creds.access_key is None
    client_config = (
        Config(signature_version=botocore.session.UNSIGNED) if anonymous else None
    )
    client = boto_session.create_client("s3", config=client_config)
    try:
        head = client.head_bucket(Bucket=bucket_name)
    except ClientError as exc:
        head = getattr(exc, "response", {})
        if head.get("Error", {}).get("Code") == "404":
            raise exc
    http_headers = head.get("ResponseMetadata", {}).get("HTTPHeaders", {})
    return http_headers.get("x-amz-bucket-region")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def mark_storage_root(root: UPathStr, uid: str):
    """Write ``uid`` into the marker file ``IS_INITIALIZED_KEY`` below ``root``."""
    # an object has to exist in a folder-like storage location on S3 to avoid
    # permission errors from leveraging s3fs on an empty hosted storage location;
    # for consistency, the marker file is written for every kind of storage
    marker = UPath(root) / IS_INITIALIZED_KEY
    marker.write_text(uid)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def init_storage(
    root: UPathStr,
    instance_id: UUID | None = None,
    register_hub: bool | None = None,
    prevent_register_hub: bool = False,
    init_instance: bool = False,
) -> tuple[
    StorageSettings,
    Literal["hub-record-not-created", "hub-record-retireved", "hub-record-created"],
]:
    """Initialize a storage location and return its settings.

    Args:
        root: A local path, a ``gs://`` or ``s3://`` URL, or ``create-s3`` /
            ``create-s3--<region>`` to derive an ``s3://lamin-...`` root from a
            freshly generated uid.
        instance_id: Passed through to ``StorageSettings``.
        register_hub: Register the location on the hub even if it is not a
            cloud location (cloud locations are registered unless prevented).
        prevent_register_hub: If ``True``, no hub call is made at all.
        init_instance: If ``True``, the hub call is made with
            ``auto_populate_instance=False``.

    Returns:
        The ``StorageSettings`` and the hub record status.

    Raises:
        ValueError: If ``root`` is ``None``, contains ``".lamindb"``, or names
            a region that is not in ``HOSTED_REGIONS``.
        AssertionError: If ``root`` starts with ``create-s3`` but is neither
            exactly ``create-s3`` nor contains ``--``.
    """
    if root is None:
        raise ValueError("`storage` argument can't be `None`")
    root_str = str(root)  # ensure we have a string
    if ".lamindb" in root_str:
        raise ValueError(
            'Please pass a folder name that does not end or contain ".lamindb"'
        )
    uid = base62(12)
    region = None
    lamin_env = os.getenv("LAMIN_ENV")
    if root_str.startswith("create-s3"):
        if root_str != "create-s3":
            assert "--" in root_str, "example: `create-s3--eu-central-1`"
            region = root_str.replace("create-s3--", "")
        if region is None:
            region = find_closest_aws_region()
        else:
            if region not in HOSTED_REGIONS:
                raise ValueError(f"region has to be one of {HOSTED_REGIONS}")
        if lamin_env is None or lamin_env == "prod":
            root_str = f"s3://lamin-{region}/{uid}"
        else:
            root_str = f"s3://lamin-hosted-test/{uid}"
    elif root_str.startswith(("gs://", "s3://")):
        pass
    else:  # local path
        try:
            _ = Path(root_str)
        except Exception:
            logger.error("`storage` is not a valid local, GCP storage or AWS S3 path")
            raise
    ssettings = StorageSettings(
        uid=uid,
        root=root_str,
        region=region,
        instance_id=instance_id,
    )
    # this stores the result of init_storage_hub
    hub_record_status: Literal[
        "hub-record-not-created", "hub-record-retireved", "hub-record-created"
    ] = "hub-record-not-created"
    # the below might update the uid with one that's already taken on the hub
    if not prevent_register_hub:
        if ssettings.type_is_cloud or register_hub:
            from ._hub_core import delete_storage_record
            from ._hub_core import init_storage as init_storage_hub

            hub_record_status = init_storage_hub(
                ssettings, auto_populate_instance=not init_instance
            )
    # below comes last only if everything else was successful
    try:
        # (federated) credentials for AWS access are provisioned under-the-hood
        # discussion: https://laminlabs.slack.com/archives/C04FPE8V01W/p1719260587167489
        mark_storage_root(ssettings.root, ssettings.uid)  # type: ignore
    except Exception:
        # use `root_as_str` in the message: it only formats the initial root,
        # whereas evaluating `ssettings.root` again could re-raise the very
        # error being handled and skip the hub record cleanup below
        logger.important(
            f"due to lack of write access, LaminDB won't manage storage location: {ssettings.root_as_str}"
        )
        # we have to check hub_record_status here because
        # _select_storage inside init_storage_hub also populates ssettings._uuid
        # and we don't want to delete an existing storage record here
        # only newly created
        if hub_record_status == "hub-record-created" and ssettings._uuid is not None:
            delete_storage_record(ssettings._uuid)  # type: ignore
        ssettings._instance_id = None
    return ssettings, hub_record_status
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _process_cache_path(cache_path: str | Path | UPath | None):
|
|
160
|
+
if cache_path is None or cache_path == "null":
|
|
161
|
+
return None
|
|
162
|
+
cache_dir = UPath(cache_path)
|
|
163
|
+
if not isinstance(cache_dir, LocalPathClasses):
|
|
164
|
+
raise ValueError("cache dir should be a local path.")
|
|
165
|
+
if cache_dir.exists() and not cache_dir.is_dir():
|
|
166
|
+
raise ValueError("cache dir should be a directory.")
|
|
167
|
+
return cache_dir
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class StorageSettings:
    """Settings for a given storage location (local or cloud)."""

    def __init__(
        self,
        root: UPathStr,
        region: str | None = None,
        uid: str | None = None,
        uuid: UUID | None = None,
        instance_id: UUID | None = None,
        # note that passing access_token prevents credentials caching
        access_token: str | None = None,
    ):
        """Store the passed values and read the system-wide cache setting.

        For local roots, a ``.lamindb`` folder is created below the root and
        the root is resolved; failure to do so only logs a warning.
        """
        self._uid = uid
        self._uuid_ = uuid
        self._root_init = UPath(root)
        if isinstance(self._root_init, LocalPathClasses):  # local paths
            try:
                (self._root_init / ".lamindb").mkdir(parents=True, exist_ok=True)
                self._root_init = self._root_init.resolve()
            except Exception:
                logger.warning(f"unable to create .lamindb folder in {self._root_init}")
        # created lazily in the `root` property
        self._root = None
        self._instance_id = instance_id
        # we don't yet infer region here to make init fast
        self._region = region
        # would prefer to type below as Registry, but need to think through import order
        self._record: Any | None = None
        # cache settings
        self._storage_settings_file = system_storage_settings_file()
        if self._storage_settings_file.exists():
            from dotenv import dotenv_values

            # `.get` so that a settings file lacking the key falls back to the
            # default cache dir instead of raising a KeyError
            cache_path = dotenv_values(self._storage_settings_file).get(
                "lamindb_cache_path"
            )
            self._cache_dir = _process_cache_path(cache_path)
        else:
            self._cache_dir = None
        # save access_token here for use in self.root
        self.access_token = access_token

        # local storage
        self._has_local = False
        self._local = None

    @property
    def id(self) -> int:
        """Storage id in current instance."""
        return self.record.id

    @property
    def _uuid(self) -> UUID | None:
        """Lamin's internal storage uuid."""
        return self._uuid_

    @property
    def uid(self) -> str | None:
        """Storage uid, read from the storage record if it was not passed."""
        if self._uid is None:
            self._uid = self.record.uid
        return self._uid

    @property
    def _mark_storage_root(self) -> UPath:
        """Path of the marker file ``IS_INITIALIZED_KEY`` below the root."""
        return self.root / IS_INITIALIZED_KEY

    @property
    def record(self) -> Any:
        """Storage record in current instance, queried once by root and then kept."""
        if self._record is None:
            # dynamic import because of import order
            from lnschema_core.models import Storage

            from ._settings import settings

            self._record = Storage.objects.using(settings._using_key).get(
                root=self.root_as_str
            )
        return self._record

    def __repr__(self):
        """String rep."""
        s = f"root='{self.root_as_str}', uid='{self.uid}'"
        if self._uuid is not None:
            s += f", uuid='{self._uuid.hex}'"
        return f"StorageSettings({s})"

    @property
    def root(self) -> UPath:
        """Root storage location."""
        if self._root is None:
            # below makes network requests to get credentials
            self._root = create_path(self._root_init, access_token=self.access_token)
        elif getattr(self._root, "protocol", "") == "s3":
            # this is needed to be sure that the root always has nonexpired credentials
            # this just checks for time of the cached credentials in most cases
            return get_aws_credentials_manager().enrich_path(
                self._root, access_token=self.access_token
            )
        return self._root

    def _set_fs_kwargs(self, **kwargs):
        """Set additional fsspec arguments for cloud root.

        Example:

        >>> ln.setup.settings.storage._set_fs_kwargs(  # any fsspec args
        >>>    profile="some_profile", cache_regions=True
        >>> )
        """
        if not isinstance(self._root, LocalPathClasses) and kwargs != {}:
            self._root = UPath(self.root, **kwargs)

    @property
    def root_as_str(self) -> str:
        """Formatted root string: posix form of the initial root without trailing slash."""
        return self._root_init.as_posix().rstrip("/")

    @property
    def cache_dir(
        self,
    ) -> UPath:
        """Cache root, a local directory to cache cloud files.

        The ``LAMIN_CACHE_DIR`` environment variable takes precedence over the
        stored setting, which takes precedence over the user cache dir of
        ``DIRS``. The directory is created if it does not exist.
        """
        if "LAMIN_CACHE_DIR" in os.environ:
            cache_dir = UPath(os.environ["LAMIN_CACHE_DIR"])
        elif self._cache_dir is None:
            cache_dir = UPath(DIRS.user_cache_dir)
        else:
            cache_dir = self._cache_dir
        cache_dir.mkdir(parents=True, exist_ok=True)
        return cache_dir

    @cache_dir.setter
    def cache_dir(self, cache_dir: UPathStr):
        """Set cache root.

        For a cloud SQLite instance, the local SQLite file is moved to its
        location under the new cache dir. The new value is saved to the system
        storage settings file; on any error the previous value is restored and
        the error is re-raised.
        """
        from lamindb_setup import settings

        if settings.instance._is_cloud_sqlite:
            # has to be read before `_cache_dir` changes below
            src_sqlite_file = settings.instance._sqlite_file_local
        else:
            src_sqlite_file = None

        save_cache_dir = self._cache_dir

        new_cache_dir = _process_cache_path(cache_dir)
        if new_cache_dir is not None:
            new_cache_dir.mkdir(parents=True, exist_ok=True)
            new_cache_dir = new_cache_dir.resolve()
        self._cache_dir = new_cache_dir

        try:
            if src_sqlite_file is not None:
                dst_sqlite_file = settings.instance._sqlite_file_local
                dst_sqlite_file.parent.mkdir(parents=True, exist_ok=True)
                if dst_sqlite_file.exists():
                    dst_sqlite_file.unlink()
                shutil.move(src_sqlite_file, dst_sqlite_file)  # type: ignore
            save_system_storage_settings(self._cache_dir, self._storage_settings_file)
        except Exception:
            self._cache_dir = save_cache_dir
            raise

    @property
    def type_is_cloud(self) -> bool:
        """`True` if `storage_root` is in cloud, `False` otherwise."""
        return self.type != "local"

    @property
    def region(self) -> str | None:
        """Storage region, looked up via `get_storage_region` if not yet known."""
        if self._region is None:
            self._region = get_storage_region(self.root_as_str)
        return self._region

    @property
    def type(self) -> Literal["local", "s3", "gs"]:
        """AWS S3 vs. Google Cloud vs. local.

        Returns the protocol as a string: "local", "s3", "gs".
        """
        import fsspec

        convert = {"file": "local"}
        protocol = fsspec.utils.get_protocol(self.root_as_str)
        return convert.get(protocol, protocol)  # type: ignore

    @property
    def is_on_hub(self) -> bool:
        """`True` if a storage uuid is set for this storage location."""
        return self._uuid is not None

    def key_to_filepath(self, filekey: Path | UPath | str) -> UPath:
        """Cloud or local filepath from filekey."""
        return self.root / filekey

    def cloud_to_local(self, filepath: Path | UPath, **kwargs) -> UPath:
        """Local (cache) filepath from filepath.

        Non-local paths are synchronized to their cache location; ``kwargs``
        are passed to ``synchronize``.
        """
        local_filepath = self.cloud_to_local_no_update(filepath)  # type: ignore
        if isinstance(filepath, UPath) and not isinstance(filepath, LocalPathClasses):
            local_filepath.parent.mkdir(parents=True, exist_ok=True)
            filepath.synchronize(local_filepath, **kwargs)
        return local_filepath

    # conversion to Path via cloud_to_local() would trigger download
    # of remote file to cache if there already is one
    # in pure write operations that update the cloud, we don't want this
    # hence, we manually construct the local file path
    # using the `.parts` attribute in the following line
    def cloud_to_local_no_update(self, filepath: UPath) -> UPath:
        """Local (cache) filepath from filepath, without synchronizing anything."""
        if isinstance(filepath, UPath) and not isinstance(filepath, LocalPathClasses):
            return self.cache_dir.joinpath(filepath._url.netloc, *filepath.parts[1:])  # type: ignore
        return filepath

    def local_filepath(self, filekey: Path | UPath | str) -> UPath:
        """Local (cache) filepath from filekey: `local(filepath(...))`."""
        return self.cloud_to_local(self.key_to_filepath(filekey))
|