lamindb 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- lamindb/__init__.py +30 -25
- lamindb/_tracked.py +1 -1
- lamindb/_view.py +2 -3
- lamindb/base/__init__.py +1 -1
- lamindb/base/ids.py +1 -10
- lamindb/core/__init__.py +7 -65
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +43 -20
- lamindb/core/_settings.py +6 -6
- lamindb/core/_sync_git.py +1 -1
- lamindb/core/loaders.py +30 -19
- lamindb/core/storage/_backed_access.py +4 -2
- lamindb/core/storage/_tiledbsoma.py +8 -6
- lamindb/core/storage/_zarr.py +104 -25
- lamindb/core/storage/objects.py +63 -28
- lamindb/core/storage/paths.py +4 -1
- lamindb/core/types.py +10 -0
- lamindb/curators/__init__.py +100 -85
- lamindb/errors.py +1 -1
- lamindb/integrations/_vitessce.py +4 -4
- lamindb/migrations/0089_subsequent_runs.py +159 -0
- lamindb/migrations/0090_runproject_project_runs.py +73 -0
- lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
- lamindb/models/__init__.py +79 -0
- lamindb/{core → models}/_describe.py +3 -3
- lamindb/{core → models}/_django.py +8 -5
- lamindb/{core → models}/_feature_manager.py +103 -87
- lamindb/{_from_values.py → models/_from_values.py} +5 -2
- lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
- lamindb/{core → models}/_label_manager.py +10 -17
- lamindb/{core/relations.py → models/_relations.py} +8 -1
- lamindb/models/artifact.py +2602 -0
- lamindb/{_can_curate.py → models/can_curate.py} +349 -180
- lamindb/models/collection.py +683 -0
- lamindb/models/core.py +135 -0
- lamindb/models/feature.py +643 -0
- lamindb/models/flextable.py +163 -0
- lamindb/{_parents.py → models/has_parents.py} +55 -49
- lamindb/models/project.py +384 -0
- lamindb/{_query_manager.py → models/query_manager.py} +10 -8
- lamindb/{_query_set.py → models/query_set.py} +40 -26
- lamindb/models/record.py +1762 -0
- lamindb/models/run.py +563 -0
- lamindb/{_save.py → models/save.py} +9 -7
- lamindb/models/schema.py +732 -0
- lamindb/models/transform.py +360 -0
- lamindb/models/ulabel.py +249 -0
- {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
- {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/RECORD +51 -51
- lamindb/_artifact.py +0 -1379
- lamindb/_collection.py +0 -440
- lamindb/_feature.py +0 -316
- lamindb/_is_versioned.py +0 -40
- lamindb/_record.py +0 -1064
- lamindb/_run.py +0 -60
- lamindb/_schema.py +0 -347
- lamindb/_storage.py +0 -15
- lamindb/_transform.py +0 -170
- lamindb/_ulabel.py +0 -56
- lamindb/_utils.py +0 -9
- lamindb/base/validation.py +0 -63
- lamindb/core/_data.py +0 -491
- lamindb/core/fields.py +0 -12
- lamindb/models.py +0 -4475
- {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
- {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
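
The most visible change in this diff is that the monolithic `lamindb/models.py` is split into a `lamindb/models/` package, with the former top-level private modules (`lamindb/_artifact.py`, `lamindb/_query_set.py`, ...) moving underneath it. A minimal sketch of what the new import paths look like, assuming the public `lamindb` namespace itself is unchanged:

    # hypothetical usage sketch based on the file moves listed above;
    # the top-level API (`import lamindb as ln`) is assumed to stay the same
    import lamindb as ln

    ln.Artifact  # public registry classes remain re-exported at the top level

    # internals now live in the lamindb.models package instead of lamindb/models.py
    from lamindb.models.artifact import Artifact
    from lamindb.models.query_set import QuerySet  # formerly lamindb/_query_set.py
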
@@ -0,0 +1,2602 @@
# ruff: noqa: TC004
from __future__ import annotations

import os
import shutil
from collections import defaultdict
from pathlib import Path, PurePath, PurePosixPath
from typing import TYPE_CHECKING, Any, Union, overload

import fsspec
import lamindb_setup as ln_setup
import pandas as pd
from anndata import AnnData
from django.db import connections, models
from django.db.models import CASCADE, PROTECT, Q
from lamin_utils import colors, logger
from lamindb_setup import settings as setup_settings
from lamindb_setup._init_instance import register_storage_in_instance
from lamindb_setup.core._settings_storage import init_storage
from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file
from lamindb_setup.core.types import UPathStr
from lamindb_setup.core.upath import (
    create_path,
    extract_suffix_from_path,
    get_stat_dir_cloud,
    get_stat_file_cloud,
)

from lamindb.base import deprecated
from lamindb.base.fields import (
    BigIntegerField,
    BooleanField,
    CharField,
    ForeignKey,
)
from lamindb.errors import FieldValidationError
from lamindb.models.query_set import QuerySet

from ..base.users import current_user_id
from ..core._compat import is_package_installed
from ..core.loaders import load_to_memory
from ..core.storage import (
    LocalPathClasses,
    UPath,
    delete_storage,
    infer_suffix,
    write_to_disk,
)
from ..core.storage._anndata_accessor import _anndata_n_observations
from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES
from ..core.storage._tiledbsoma import _soma_n_observations
from ..core.storage.paths import (
    AUTO_KEY_PREFIX,
    auto_storage_key_from_artifact,
    auto_storage_key_from_artifact_uid,
    check_path_is_child_of_root,
    filepath_cache_key_from_artifact,
    filepath_from_artifact,
)
from ..errors import IntegrityError, InvalidArgument, ValidationError
from ..models._is_versioned import (
    create_uid,
    message_update_key_in_version_family,
)
from ._django import get_artifact_with_related
from ._feature_manager import (
    FeatureManager,
    ParamManager,
    ParamManagerArtifact,
    add_label_feature_links,
    get_label_links,
)
from ._is_versioned import IsVersioned
from ._relations import (
    dict_module_name_to_model_name,
    dict_related_model_to_related_name,
)
from .core import Storage
from .feature import Feature, FeatureValue
from .has_parents import view_lineage
from .record import (
    BasicRecord,
    LinkORM,
    Record,
    _get_record_kwargs,
    record_repr,
)
from .run import ParamValue, Run, TracksRun, TracksUpdates, User
from .schema import Schema
from .ulabel import ULabel

WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run"

WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"

try:
    from ..core.storage._zarr import identify_zarr_type
except ImportError:

    def identify_zarr_type(storepath):  # type: ignore
        raise ImportError("Please install zarr: pip install zarr<=2.18.4")


if TYPE_CHECKING:
    from collections.abc import Iterable

    from mudata import MuData  # noqa: TC004
    from pyarrow.dataset import Dataset as PyArrowDataset
    from spatialdata import SpatialData  # noqa: TC004
    from tiledbsoma import Collection as SOMACollection
    from tiledbsoma import Experiment as SOMAExperiment
    from tiledbsoma import Measurement as SOMAMeasurement

    from lamindb.base.types import StrField
    from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
    from lamindb.core.types import ScverseDataStructures

    from ..base.types import (
        ArtifactKind,
    )
    from ._label_manager import LabelManager
    from .collection import Collection
    from .project import Project, Reference
    from .transform import Transform


INCONSISTENT_STATE_MSG = (
    "Trying to read a folder artifact from an outdated version, "
    "this can result in an incosistent state.\n"
    "Read from the latest version: artifact.versions.filter(is_latest=True).one()"
)


def process_pathlike(
    filepath: UPath,
    default_storage: Storage,
    using_key: str | None,
    skip_existence_check: bool = False,
) -> tuple[Storage, bool]:
    """Determines the appropriate storage for a given path and whether to use an existing storage key."""
    if not skip_existence_check:
        try:  # check if file exists
            if not filepath.exists():
                raise FileNotFoundError(filepath)
        except PermissionError:
            pass
    if check_path_is_child_of_root(filepath, default_storage.root):
        use_existing_storage_key = True
        return default_storage, use_existing_storage_key
    else:
        # check whether the path is part of one of the existing
        # already-registered storage locations
        result = False
        # within the hub, we don't want to perform check_path_in_existing_storage
        if using_key is None:
            result = check_path_in_existing_storage(filepath, using_key)
        if isinstance(result, Storage):
            use_existing_storage_key = True
            return result, use_existing_storage_key
        else:
            # if the path is in the cloud, we have a good candidate
            # for the storage root: the bucket
            if not isinstance(filepath, LocalPathClasses):
                # for a cloud path, new_root is always the bucket name
                if filepath.protocol == "hf":
                    hf_path = filepath.fs.resolve_path(filepath.as_posix())
                    hf_path.path_in_repo = ""
                    new_root = "hf://" + hf_path.unresolve()
                else:
                    if filepath.protocol == "s3":
                        # check that endpoint_url didn't propagate here
                        # as a part of the path string
                        assert "?" not in filepath.path  # noqa: S101
                    new_root = list(filepath.parents)[-1]
                # do not register remote storage locations on hub if the current instance
                # is not managed on the hub
                storage_settings, _ = init_storage(
                    new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
                )
                storage_record = register_storage_in_instance(storage_settings)
                use_existing_storage_key = True
                return storage_record, use_existing_storage_key
            # if the filepath is local
            else:
                use_existing_storage_key = False
                # if the default storage is local we'll throw an error if the user
                # doesn't provide a key
                if default_storage.type == "local":
                    return default_storage, use_existing_storage_key
                # if the default storage is in the cloud (the file is going to
                # be uploaded upon saving it), we treat the filepath as a cache
                else:
                    return default_storage, use_existing_storage_key

def process_data(
    provisional_uid: str,
    data: UPathStr | pd.DataFrame | AnnData,
    format: str | None,
    key: str | None,
    default_storage: Storage,
    using_key: str | None,
    skip_existence_check: bool = False,
    is_replace: bool = False,
) -> tuple[Any, Path | UPath, str, Storage, bool]:
    """Serialize a data object that's provided as file or in memory.

    if not overwritten, data gets stored in default storage
    """
    supported_data_types = [pd.DataFrame, AnnData]
    if is_package_installed("mudata"):
        from mudata import MuData

        supported_data_types.append(MuData)
    if is_package_installed("spatialdata"):
        from spatialdata import SpatialData

        supported_data_types.append(SpatialData)
    supported_data_types = tuple(supported_data_types)  # type: ignore

    if key is not None:
        key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
        # use suffix as the (adata) format if the format is not provided
        if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
            format = key_suffix[1:]
    else:
        key_suffix = None
    if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
        access_token = (
            default_storage._access_token
            if hasattr(default_storage, "_access_token")
            else None
        )
        path = create_path(data, access_token=access_token)
        # we don't resolve http links because they can resolve into a different domain
        # for example into a temporary url
        if path.protocol not in {"http", "https"}:
            path = path.resolve()
        storage, use_existing_storage_key = process_pathlike(
            path,
            default_storage=default_storage,
            using_key=using_key,
            skip_existence_check=skip_existence_check,
        )
        suffix = extract_suffix_from_path(path)
        memory_rep = None
    elif isinstance(data, supported_data_types):
        storage = default_storage
        memory_rep = data
        suffix = infer_suffix(data, format)
    else:
        raise NotImplementedError(
            f"Do not know how to create a artifact object from {data}, pass a path instead!"
        )
    if key_suffix is not None and key_suffix != suffix and not is_replace:
        # consciously omitting a trailing period
        if isinstance(data, (str, Path, UPath)):
            message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
        else:
            message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
        raise InvalidArgument(message)
    # in case we have an in-memory representation, we need to write it to disk
    from lamindb import settings

    if isinstance(data, supported_data_types):
        path = settings.cache_dir / f"{provisional_uid}{suffix}"
        write_to_disk(data, path)
        use_existing_storage_key = False
    return memory_rep, path, suffix, storage, use_existing_storage_key


def get_stat_or_artifact(
    path: UPath,
    key: str | None = None,
    check_hash: bool = True,
    is_replace: bool = False,
    instance: str | None = None,
) -> Union[tuple[int, str | None, str | None, int | None, Artifact | None], Artifact]:
    """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
    n_files = None
    from lamindb import settings

    if settings.creation.artifact_skip_size_hash:
        return None, None, None, n_files, None
    stat = path.stat()  # one network request
    if not isinstance(path, LocalPathClasses):
        size, hash, hash_type = None, None, None
        if stat is not None:
            # convert UPathStatResult to fsspec info dict
            stat = stat.as_info()
            if (store_type := stat["type"]) == "file":
                size, hash, hash_type = get_stat_file_cloud(stat)
            elif store_type == "directory":
                size, hash, hash_type, n_files = get_stat_dir_cloud(path)
        if hash is None:
            logger.warning(f"did not add hash for {path}")
            return size, hash, hash_type, n_files, None
    else:
        if path.is_dir():
            size, hash, hash_type, n_files = hash_dir(path)
        else:
            hash, hash_type = hash_file(path)
            size = stat.st_size
    if not check_hash:
        return size, hash, hash_type, n_files, None
    previous_artifact_version = None
    if key is None or is_replace:
        result = Artifact.objects.using(instance).filter(hash=hash).all()
        artifact_with_same_hash_exists = len(result) > 0
    else:
        storage_id = settings.storage.id
        result = (
            Artifact.objects.using(instance)
            .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
            .order_by("-created_at")
            .all()
        )
        artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
        if not artifact_with_same_hash_exists and len(result) > 0:
            logger.important(
                f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
            )
            previous_artifact_version = result[0]
    if artifact_with_same_hash_exists:
        message = "returning existing artifact with same hash"
        if result[0]._branch_code == -1:
            result[0].restore()
            message = "restored artifact with same hash from trash"
        logger.important(
            f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
        )
        return result[0]
    else:
        return size, hash, hash_type, n_files, previous_artifact_version


def check_path_in_existing_storage(
    path: Path | UPath, using_key: str | None = None
) -> Storage | bool:
    for storage in Storage.objects.using(using_key).filter().all():
        # if path is part of storage, return it
        if check_path_is_child_of_root(path, root=storage.root):
            return storage
    return False


def get_relative_path_to_directory(
    path: PurePath | Path | UPath, directory: PurePath | Path | UPath
) -> PurePath | Path:
    if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
        # UPath.relative_to() is not behaving as it should (2023-04-07)
        # need to lstrip otherwise inconsistent behavior across trailing slashes
        # see test_artifact.py: test_get_relative_path_to_directory
        relpath = PurePath(
            path.as_posix().replace(directory.as_posix(), "").lstrip("/")
        )
    elif isinstance(directory, Path):
        relpath = path.resolve().relative_to(directory.resolve())  # type: ignore
    elif isinstance(directory, PurePath):
        relpath = path.relative_to(directory)
    else:
        raise TypeError("Directory not of type Path or UPath")
    return relpath


def get_artifact_kwargs_from_data(
    *,
    data: Path | UPath | str | pd.DataFrame | ScverseDataStructures,
    key: str | None,
    run: Run | None,
    format: str | None,
    provisional_uid: str,
    version: str | None,
    default_storage: Storage,
    using_key: str | None = None,
    is_replace: bool = False,
    skip_check_exists: bool = False,
):
    from lamindb import settings

    run = get_run(run)
    memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
        provisional_uid,
        data,
        format,
        key,
        default_storage,
        using_key,
        skip_check_exists,
        is_replace=is_replace,
    )
    stat_or_artifact = get_stat_or_artifact(
        path=path,
        key=key,
        instance=using_key,
        is_replace=is_replace,
    )
    if isinstance(stat_or_artifact, Artifact):
        existing_artifact = stat_or_artifact
        if run is not None:
            existing_artifact._populate_subsequent_runs(run)
        return existing_artifact, None
    else:
        size, hash, hash_type, n_files, revises = stat_or_artifact

    if revises is not None:  # update provisional_uid
        provisional_uid, revises = create_uid(revises=revises, version=version)
        if settings.cache_dir in path.parents:
            path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))

    check_path_in_storage = False
    if use_existing_storage_key:
        inferred_key = get_relative_path_to_directory(
            path=path, directory=UPath(storage.root)
        ).as_posix()
        if key is None:
            key = inferred_key
        else:
            if not key == inferred_key:
                raise InvalidArgument(
                    f"The path '{data}' is already in registered storage"
                    f" '{storage.root}' with key '{inferred_key}'\nYou passed"
                    f" conflicting key '{key}': please move the file before"
                    " registering it."
                )
        check_path_in_storage = True
    else:
        storage = default_storage

    log_storage_hint(
        check_path_in_storage=check_path_in_storage,
        storage=storage,
        key=key,
        uid=provisional_uid,
        suffix=suffix,
        is_dir=n_files is not None,
    )

    # do we use a virtual or an actual storage key?
    key_is_virtual = settings.creation._artifact_use_virtual_keys

    # if the file is already in storage, independent of the default
    # we use an actual storage key
    if check_path_in_storage:
        key_is_virtual = False

    kwargs = {
        "uid": provisional_uid,
        "suffix": suffix,
        "hash": hash,
        "_hash_type": hash_type,
        "key": key,
        "size": size,
        "storage_id": storage.id,
        # passing both the id and the object
        # to make them both available immediately
        # after object creation
        "n_files": n_files,
        "_overwrite_versions": n_files is not None,  # True for folder, False for file
        "n_observations": None,  # to implement
        "run_id": run.id if run is not None else None,
        "run": run,
        "_key_is_virtual": key_is_virtual,
        "revises": revises,
    }
    if not isinstance(path, LocalPathClasses):
        local_filepath = None
        cloud_filepath = path
    else:
        local_filepath = path
        cloud_filepath = None
    privates = {
        "local_filepath": local_filepath,
        "cloud_filepath": cloud_filepath,
        "memory_rep": memory_rep,
        "check_path_in_storage": check_path_in_storage,
    }
    return kwargs, privates

def log_storage_hint(
    *,
    check_path_in_storage: bool,
    storage: Storage | None,
    key: str | None,
    uid: str,
    suffix: str,
    is_dir: bool,
) -> None:
    hint = ""
    if check_path_in_storage:
        display_root = storage.root  # type: ignore
        # check whether path is local
        if fsspec.utils.get_protocol(storage.root) == "file":  # type: ignore
            # if it's a local path, check whether it's in the current working directory
            root_path = Path(storage.root)  # type: ignore
            if check_path_is_child_of_root(root_path, Path.cwd()):
                # only display the relative path, not the fully resolved path
                display_root = root_path.relative_to(Path.cwd())  # type: ignore
        hint += f"path in storage '{display_root}'"  # type: ignore
    else:
        hint += "path content will be copied to default storage upon `save()`"
    if key is None:
        storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
        hint += f" with key `None` ('{storage_key}')"
    else:
        hint += f" with key '{key}'"
    logger.hint(hint)


def data_is_anndata(data: AnnData | UPathStr) -> bool:
    if isinstance(data, AnnData):
        return True
    if isinstance(data, (str, Path, UPath)):
        data_path = UPath(data)
        if ".h5ad" in data_path.suffixes:  # ".h5ad.gz" is a valid suffix
            return True
        elif data_path.suffix == ".zarr":
            # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
            # TODO: the suffix based check should likely be moved to identify_zarr_type
            if ".anndata" in data_path.suffixes:
                return True
            # check only for local, expensive for cloud
            if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
                return identify_zarr_type(data_path) == "anndata"
            else:
                logger.warning("We do not check if cloud zarr is AnnData or not")
                return False
    return False


def data_is_mudata(data: MuData | UPathStr) -> bool:
    if is_package_installed("mudata"):
        from mudata import MuData

        if isinstance(data, MuData):
            return True
    if isinstance(data, (str, Path)):
        return UPath(data).suffix == ".h5mu"
    return False


def data_is_spatialdata(data: SpatialData | UPathStr) -> bool:
    if is_package_installed("spatialdata"):
        from spatialdata import SpatialData

        if isinstance(data, SpatialData):
            return True
    if isinstance(data, (str, Path)):
        if UPath(data).suffix == ".zarr":
            # TODO: inconsistent with anndata, where we run the storage
            # check only for local, expensive for cloud
            return identify_zarr_type(data, check=False) == "spatialdata"
    return False


def _check_otype_artifact(
    data: UPathStr | pd.DataFrame | ScverseDataStructures,
    otype: str | None = None,
) -> str:
    if otype is None:
        if isinstance(data, pd.DataFrame):
            logger.warning("data is a DataFrame, please use .from_df()")
            otype = "DataFrame"
            return otype

        data_is_path = isinstance(data, (str, Path))
        if data_is_anndata(data):
            if not data_is_path:
                logger.warning("data is an AnnData, please use .from_anndata()")
            otype = "AnnData"
        elif data_is_mudata(data):
            if not data_is_path:
                logger.warning("data is a MuData, please use .from_mudata()")
            otype = "MuData"
        elif data_is_spatialdata(data):
            if not data_is_path:
                logger.warning("data is a SpatialData, please use .from_spatialdata()")
            otype = "SpatialData"
        elif not data_is_path:  # UPath is a subclass of Path
            raise TypeError("data has to be a string, Path, UPath")
    return otype


def _populate_subsequent_runs_(record: Union[Artifact, Collection], run: Run):
    if record.run is None:
        record.run = run
    elif record.run != run:
        record._subsequent_runs.add(run)


# also see current_run() in core._data
def get_run(run: Run | None) -> Run | None:
    from lamindb import settings

    from .._tracked import get_current_tracked_run
    from ..core._context import context

    if run is None:
        run = get_current_tracked_run()
        if run is None:
            run = context.run
        if run is None and not settings.creation.artifact_silence_missing_run_warning:
            # here we check that this is not a read-only connection
            # normally for our connection strings the read-only role name has _read in it
            # not absolutely safe but the worst case is that the warning is not shown
            instance = setup_settings.instance
            if instance.dialect != "postgresql" or "_read" not in instance.db:
                logger.warning(WARNING_RUN_TRANSFORM)
    # suppress run by passing False
    elif not run:
        run = None
    return run


def save_staged_feature_sets(self: Artifact) -> None:
    if hasattr(self, "_staged_feature_sets"):
        from lamindb.models._feature_manager import get_schema_by_slot_

        existing_staged_feature_sets = get_schema_by_slot_(self)
        saved_staged_feature_sets = {}
        for key, schema in self._staged_feature_sets.items():
            if isinstance(schema, Schema) and schema._state.adding:
                schema.save()
                saved_staged_feature_sets[key] = schema
            if key in existing_staged_feature_sets:
                # remove existing feature set on the same slot
                self.feature_sets.remove(existing_staged_feature_sets[key])
        if len(saved_staged_feature_sets) > 0:
            s = "s" if len(saved_staged_feature_sets) > 1 else ""
            display_schema_keys = ",".join(
                f"'{key}'" for key in saved_staged_feature_sets.keys()
            )
            logger.save(
                f"saved {len(saved_staged_feature_sets)} feature set{s} for slot{s}:"
                f" {display_schema_keys}"
            )


def save_schema_links(self: Artifact) -> None:
    from lamindb.models.save import bulk_create

    if hasattr(self, "_staged_feature_sets"):
        links = []
        for slot, schema in self._staged_feature_sets.items():
            kwargs = {
                "artifact_id": self.id,
                "schema_id": schema.id,
                "slot": slot,
            }
            links.append(Artifact.feature_sets.through(**kwargs))
        bulk_create(links, ignore_conflicts=True)


# can restore later if needed
# def format_provenance(self, fk_data, print_types):
#     type_str = lambda attr: (
#         f": {get_related_model(self.__class__, attr).__name__}" if print_types else ""
#     )

#     return "".join(
#         [
#             f"    .{field_name}{type_str(field_name)} = {format_field_value(value.get('name'))}\n"
#             for field_name, value in fk_data.items()
#             if value.get("name")
#         ]
#     )

# can restore later if needed
# def format_input_of_runs(self, print_types):
#     if self.id is not None and self.input_of_runs.exists():
#         values = [format_field_value(i.started_at) for i in self.input_of_runs.all()]
#         type_str = ": Run" if print_types else ""  # type: ignore
#         return f"    .input_of_runs{type_str} = {', '.join(values)}\n"
#     return ""


def _describe_postgres(self):  # for Artifact & Collection
    from ._describe import describe_general
    from ._feature_manager import describe_features

    model_name = self.__class__.__name__
    msg = f"{colors.green(model_name)}{record_repr(self, include_foreign_keys=False).lstrip(model_name)}\n"
    if self._state.db is not None and self._state.db != "default":
        msg += f"  {colors.italic('Database instance')}\n"
        msg += f"  slug: {self._state.db}\n"

    if model_name == "Artifact":
        result = get_artifact_with_related(
            self,
            include_feature_link=True,
            include_fk=True,
            include_m2m=True,
            include_schema=True,
        )
    else:
        result = get_artifact_with_related(self, include_fk=True, include_m2m=True)
    related_data = result.get("related_data", {})
    # TODO: fk_data = related_data.get("fk", {})

    tree = describe_general(self)
    if model_name == "Artifact":
        return describe_features(
            self,
            tree=tree,
            related_data=related_data,
            with_labels=True,
            print_params=hasattr(self, "kind") and self.kind == "model",
        )
    else:
        return tree


def _describe_sqlite(self, print_types: bool = False):  # for artifact & collection
    from ._describe import describe_general
    from ._feature_manager import describe_features
    from .collection import Collection

    model_name = self.__class__.__name__
    msg = f"{colors.green(model_name)}{record_repr(self, include_foreign_keys=False).lstrip(model_name)}\n"
    if self._state.db is not None and self._state.db != "default":
        msg += f"  {colors.italic('Database instance')}\n"
        msg += f"  slug: {self._state.db}\n"

    fields = self._meta.fields
    direct_fields = []
    foreign_key_fields = []
    for f in fields:
        if f.is_relation:
            foreign_key_fields.append(f.name)
        else:
            direct_fields.append(f.name)
    if not self._state.adding:
        # prefetch foreign key relationships
        self = (
            self.__class__.objects.using(self._state.db)
            .select_related(*foreign_key_fields)
            .get(id=self.id)
        )
        # prefetch m-2-m relationships
        many_to_many_fields = []
        if isinstance(self, (Collection, Artifact)):
            many_to_many_fields.append("input_of_runs")
        if isinstance(self, Artifact):
            many_to_many_fields.append("feature_sets")
        self = (
            self.__class__.objects.using(self._state.db)
            .prefetch_related(*many_to_many_fields)
            .get(id=self.id)
        )
    tree = describe_general(self)
    if model_name == "Artifact":
        return describe_features(
            self,
            tree=tree,
            with_labels=True,
            print_params=hasattr(self, "kind") and self.kind == "kind",
        )
    else:
        return tree


def describe_artifact_collection(self):  # for artifact & collection
    from ._describe import print_rich_tree

    if not self._state.adding and connections[self._state.db].vendor == "postgresql":
        tree = _describe_postgres(self)
    else:
        tree = _describe_sqlite(self)

    print_rich_tree(tree)


def validate_feature(feature: Feature, records: list[Record]) -> None:
    """Validate feature record, adjust feature.dtype based on labels records."""
    if not isinstance(feature, Feature):
        raise TypeError("feature has to be of type Feature")
    if feature._state.adding:
        registries = {record.__class__.__get_name_with_module__() for record in records}
        registries_str = "|".join(registries)
        msg = f"ln.Feature(name='{feature.name}', type='cat[{registries_str}]').save()"
        raise ValidationError(f"Feature not validated. If it looks correct: {msg}")


def get_labels(
    self,
    feature: Feature,
    mute: bool = False,
    flat_names: bool = False,
) -> QuerySet | dict[str, QuerySet] | list:
    """{}"""  # noqa: D415
    if not isinstance(feature, Feature):
        raise TypeError("feature has to be of type Feature")
    if feature.dtype is None or not feature.dtype.startswith("cat["):
        raise ValueError("feature does not have linked labels")
    registries_to_check = feature.dtype.replace("cat[", "").rstrip("]").split("|")
    if len(registries_to_check) > 1 and not mute:
        logger.warning("labels come from multiple registries!")
    # return an empty query set if self.id is still None
    if self.id is None:
        return QuerySet(self.__class__)
    qs_by_registry = {}
    for registry in registries_to_check:
        # currently need to distinguish between ULabel and non-ULabel, because
        # we only have the feature information for Label
        if registry == "ULabel":
            links_to_labels = get_label_links(self, registry, feature)
            label_ids = [link.ulabel_id for link in links_to_labels]
            qs_by_registry[registry] = ULabel.objects.using(self._state.db).filter(
                id__in=label_ids
            )
        elif registry in self.features._accessor_by_registry:
            qs_by_registry[registry] = getattr(
                self, self.features._accessor_by_registry[registry]
            ).all()
    if flat_names:
        # returns a flat list of names
        from .record import get_name_field

        values = []
        for v in qs_by_registry.values():
            values += v.list(get_name_field(v))
        return values
    if len(registries_to_check) == 1 and registry in qs_by_registry:
        return qs_by_registry[registry]
    else:
        return qs_by_registry

def add_labels(
    self,
    records: Record | list[Record] | QuerySet | Iterable,
    feature: Feature | None = None,
    *,
    field: StrField | None = None,
    feature_ref_is_name: bool | None = None,
    label_ref_is_name: bool | None = None,
    from_curator: bool = False,
) -> None:
    """{}"""  # noqa: D415
    if self._state.adding:
        raise ValueError("Please save the artifact/collection before adding a label!")

    if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
        records = records.list()
    if isinstance(records, (str, Record)):
        records = [records]
    if not isinstance(records, list):  # avoids warning for pd Series
        records = list(records)
    # create records from values
    if len(records) == 0:
        return None
    if isinstance(records[0], str):  # type: ignore
        records_validated = []
        # feature is needed if we want to create records from values
        if feature is None:
            raise ValueError(
                "Please pass a feature, e.g., via: label = ln.ULabel(name='my_label',"
                " feature=ln.Feature(name='my_feature'))"
            )
        if feature.dtype.startswith("cat["):
            orm_dict = dict_module_name_to_model_name(Artifact)
            for reg in feature.dtype.replace("cat[", "").rstrip("]").split("|"):
                registry = orm_dict.get(reg)
                records_validated += registry.from_values(records, field=field)

        # feature doesn't have registries and therefore can't create records from values
        # ask users to pass records
        if len(records_validated) == 0:
            raise ValueError(
                "Please pass a record (a `Record` object), not a string, e.g., via:"
                " label"
                f" = ln.ULabel(name='{records[0]}')"  # type: ignore
            )
        records = records_validated

    for record in records:
        if record._state.adding:
            raise ValidationError(
                f"{record} not validated. If it looks correct: record.save()"
            )

    if feature is None:
        d = dict_related_model_to_related_name(self.__class__)
        # strategy: group records by registry to reduce number of transactions
        records_by_related_name: dict = {}
        for record in records:
            related_name = d.get(record.__class__.__get_name_with_module__())
            if related_name is None:
                raise ValueError(f"Can't add labels to {record.__class__} record!")
            if related_name not in records_by_related_name:
                records_by_related_name[related_name] = []
            records_by_related_name[related_name].append(record)
        for related_name, records in records_by_related_name.items():
            getattr(self, related_name).add(*records)
    else:
        validate_feature(feature, records)  # type:ignore
        records_by_registry = defaultdict(list)
        feature_sets = self.feature_sets.filter(itype="Feature").all()
        internal_features = set()  # type: ignore
        if len(feature_sets) > 0:
            for schema in feature_sets:
                internal_features = internal_features.union(
                    set(schema.members.values_list("name", flat=True))
                )  # type: ignore
        for record in records:
            records_by_registry[record.__class__.__get_name_with_module__()].append(
                record
            )
        for registry_name, records in records_by_registry.items():
            if not from_curator and feature.name in internal_features:
                raise ValidationError(
                    "Cannot manually annotate internal feature with label. Please use ln.Curator"
                )
            if registry_name not in feature.dtype:
                if not feature.dtype.startswith("cat"):
                    raise ValidationError(
                        f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{feature.dtype}'"
                    )
                if feature.dtype == "cat":
                    feature.dtype = f"cat[{registry_name}]"  # type: ignore
                    feature.save()
                elif registry_name not in feature.dtype:
                    new_dtype = feature.dtype.rstrip("]") + f"|{registry_name}]"
                    raise ValidationError(
                        f"Label type {registry_name} is not valid for Feature(name='{feature.name}', dtype='{feature.dtype}'), consider updating to dtype='{new_dtype}'"
                    )

            if registry_name not in self.features._accessor_by_registry:
                logger.warning(f"skipping {registry_name}")
                continue
            if len(records) == 0:
                continue
            features_labels = {
                registry_name: [(feature, label_record) for label_record in records]
            }
            add_label_feature_links(
                self.features,
                features_labels,
                feature_ref_is_name=feature_ref_is_name,
                label_ref_is_name=label_ref_is_name,
            )

class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
    # Note that this docstring has to be consistent with Curator.save_artifact()
    """Datasets & models stored as files, folders, or arrays.

    Artifacts manage data in local or remote storage.

    Some artifacts are array-like, e.g., when stored as `.parquet`, `.h5ad`,
    `.zarr`, or `.tiledb`.

    Args:
        data: `UPathStr` A path to a local or remote folder or file.
        kind: `Literal["dataset", "model"] | None = None` Distinguish models from datasets from other files & folders.
        key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
        description: `str | None = None` A description.
        revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
        run: `Run | None = None` The run that creates the artifact.

    .. dropdown:: Typical storage formats & their API accessors

        Arrays:

        - Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
        - Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
        - Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders

        Non-arrays:

        - Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
        - Fastq: `.fastq` ⟷ /
        - VCF: `.vcf` ⟷ /
        - QC: `.html` ⟷ /

        You'll find these values in the `suffix` & `accessor` fields.

        LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).

    See Also:
        :class:`~lamindb.Storage`
            Storage locations for artifacts.
        :class:`~lamindb.Collection`
            Collections of artifacts.
        :meth:`~lamindb.Artifact.from_df`
            Create an artifact from a `DataFrame`.
        :meth:`~lamindb.Artifact.from_anndata`
            Create an artifact from an `AnnData`.

    Examples:

        Create an artifact by passing `key`:

        >>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
        >>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()

        Calling `.save()` uploads the file to the default storage location of your lamindb instance.
        (If it's a local instance, the "upload" is a mere copy operation.)

        If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:

        >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()

        You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`

        >>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
        >>> artifact_v2.versions.df()  # see all versions

        .. dropdown:: Why does the API look this way?

            It's inspired by APIs building on AWS S3.

            Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.

            In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::

                # signature: S3.Bucket.upload_file(filepath, key)
                import boto3
                s3 = boto3.resource('s3')
                bucket = s3.Bucket('mybucket')
                bucket.upload_file('/tmp/hello.txt', 'hello.txt')

            In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::

                # signature: quilt3.Bucket.put_file(key, filepath)
                import quilt3
                bucket = quilt3.Bucket('mybucket')
                bucket.put_file('hello.txt', '/tmp/hello.txt')

        Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:

        >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
        >>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()

        Because you can then not use `key`-based versioning you have to pass `revises` to make a new artifact version:

        >>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()

        If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
        the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
        detects the duplication and will return the existing artifact.

    """

    class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _len_full_uid: int = 20
    _len_stem_uid: int = 16

    params: ParamManager = ParamManagerArtifact  # type: ignore
    """Param manager.

    Example::

        artifact.params.add_values({
            "hidden_size": 32,
            "bottleneck_size": 16,
            "batch_size": 32,
            "preprocess_params": {
                "normalization_type": "cool",
                "subset_highlyvariable": True,
            },
        })
    """

    features: FeatureManager = FeatureManager  # type: ignore
    """Feature manager.

    Features denote dataset dimensions, i.e., the variables that measure labels & numbers.

    Annotate with features & values::

        artifact.features.add_values({
            "species": organism,  # here, organism is an Organism record
            "scientist": ['Barbara McClintock', 'Edgar Anderson'],
            "temperature": 27.6,
            "study": "Candidate marker study"
        })

    Query for features & values::

        ln.Artifact.features.filter(scientist="Barbara McClintock")

    Features may or may not be part of the artifact content in storage. For
    instance, the :class:`~lamindb.Curator` flow validates the columns of a
    `DataFrame`-like artifact and annotates it with features corresponding to
    these columns. `artifact.features.add_values`, by contrast, does not
    validate the content of the artifact.
    """

    @property
    def labels(self) -> LabelManager:
        """Label manager.

        To annotate with labels, you typically use the registry-specific accessors,
        for instance :attr:`~lamindb.Artifact.ulabels`::

            candidate_marker_study = ln.ULabel(name="Candidate marker study").save()
            artifact.ulabels.add(candidate_marker_study)

        Similarly, you query based on these accessors::

            ln.Artifact.filter(ulabels__name="Candidate marker study").all()

        Unlike the registry-specific accessors, the `.labels` accessor provides
        a way of associating labels with features::

            study = ln.Feature(name="study", dtype="cat").save()
            artifact.labels.add(candidate_marker_study, feature=study)

        Note that the above is equivalent to::

            artifact.features.add_values({"study": candidate_marker_study})
        """
        from ._label_manager import LabelManager

        return LabelManager(self)

id: int = models.AutoField(primary_key=True)
|
1122
|
+
"""Internal id, valid only in one DB instance."""
|
1123
|
+
uid: str = CharField(
|
1124
|
+
editable=False, unique=True, db_index=True, max_length=_len_full_uid
|
1125
|
+
)
|
1126
|
+
"""A universal random id."""
|
1127
|
+
key: str | None = CharField(db_index=True, null=True)
|
1128
|
+
"""A (virtual) relative file path within the artifact's storage location.
|
1129
|
+
|
1130
|
+
Setting a `key` is useful to automatically group artifacts into a version family.
|
1131
|
+
|
1132
|
+
LaminDB defaults to a virtual file path to make renaming of data in object storage easy.
|
1133
|
+
|
1134
|
+
If you register existing files in a storage location, the `key` equals the
|
1135
|
+
actual filepath on the underyling filesytem or object store.
|
1136
|
+
"""
|
1137
|
+
description: str | None = CharField(db_index=True, null=True)
|
1138
|
+
"""A description."""
|
1139
|
+
storage: Storage = ForeignKey(
|
1140
|
+
Storage, PROTECT, related_name="artifacts", editable=False
|
1141
|
+
)
|
1142
|
+
"""Storage location, e.g. an S3 or GCP bucket or a local directory."""
|
1143
|
+
suffix: str = CharField(max_length=30, db_index=True, editable=False)
|
1144
|
+
# Initially, we thought about having this be nullable to indicate folders
|
1145
|
+
# But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
|
1146
|
+
"""Path suffix or empty string if no canonical suffix exists.
|
1147
|
+
|
1148
|
+
This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "".
|
1149
|
+
"""
|
1150
|
+
kind: ArtifactKind | None = CharField(
|
1151
|
+
max_length=20,
|
1152
|
+
db_index=True,
|
1153
|
+
null=True,
|
1154
|
+
)
|
1155
|
+
""":class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
|
1156
|
+
otype: str | None = CharField(
|
1157
|
+
max_length=64, db_index=True, null=True, editable=False
|
1158
|
+
)
|
1159
|
+
"""Default Python object type, e.g., DataFrame, AnnData."""
|
1160
|
+
size: int | None = BigIntegerField(
|
1161
|
+
null=True, db_index=True, default=None, editable=False
|
1162
|
+
)
|
1163
|
+
"""Size in bytes.
|
1164
|
+
|
1165
|
+
Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
|
1166
|
+
"""
|
1167
|
+
hash: str | None = CharField(
|
1168
|
+
max_length=HASH_LENGTH, db_index=True, null=True, unique=True, editable=False
|
1169
|
+
)
|
1170
|
+
"""Hash or pseudo-hash of artifact content.
|
1171
|
+
|
1172
|
+
Useful to ascertain integrity and avoid duplication.
|
1173
|
+
"""
|
1174
|
+
n_files: int | None = BigIntegerField(
|
1175
|
+
null=True, db_index=True, default=None, editable=False
|
1176
|
+
)
|
1177
|
+
"""Number of files for folder-like artifacts, `None` for file-like artifacts.
|
1178
|
+
|
1179
|
+
Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
|
1180
|
+
|
1181
|
+
.. versionchanged:: 1.0
|
1182
|
+
Renamed from `n_objects` to `n_files`.
|
1183
|
+
"""
|
1184
|
+
n_observations: int | None = BigIntegerField(
|
1185
|
+
null=True, db_index=True, default=None, editable=False
|
1186
|
+
)
|
1187
|
+
"""Number of observations.
|
1188
|
+
|
1189
|
+
Typically, this denotes the first array dimension.
|
1190
|
+
"""
|
1191
|
+
_hash_type: str | None = CharField(
|
1192
|
+
max_length=30, db_index=True, null=True, editable=False
|
1193
|
+
)
|
1194
|
+
"""Type of hash."""
|
1195
|
+
ulabels: ULabel = models.ManyToManyField(
|
1196
|
+
ULabel, through="ArtifactULabel", related_name="artifacts"
|
1197
|
+
)
|
1198
|
+
"""The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
|
1199
|
+
run: Run | None = ForeignKey(
|
1200
|
+
Run,
|
1201
|
+
PROTECT,
|
1202
|
+
related_name="output_artifacts",
|
1203
|
+
null=True,
|
1204
|
+
default=None,
|
1205
|
+
editable=False,
|
1206
|
+
)
|
1207
|
+
"""Run that created the artifact."""
|
1208
|
+
input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
|
1209
|
+
"""Runs that use this artifact as an input."""
|
1210
|
+
_subsequent_runs: Run = models.ManyToManyField(
|
1211
|
+
"Run",
|
1212
|
+
related_name="_recreated_artifacts",
|
1213
|
+
db_table="lamindb_artifact__previous_runs", # legacy name, change in lamindb v2
|
1214
|
+
)
|
1215
|
+
"""Runs that re-created the record after initial creation."""
|
1216
|
+
collections: Collection
|
1217
|
+
"""The collections that this artifact is part of."""
|
1218
|
+
schema: Schema | None = ForeignKey(
|
1219
|
+
Schema,
|
1220
|
+
PROTECT,
|
1221
|
+
null=True,
|
1222
|
+
default=None,
|
1223
|
+
related_name="validated_artifacts",
|
1224
|
+
)
|
1225
|
+
"""The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
|
1226
|
+
feature_sets: Schema = models.ManyToManyField(
|
1227
|
+
Schema, related_name="artifacts", through="ArtifactSchema"
|
1228
|
+
)
|
1229
|
+
"""The feature sets measured by the artifact."""
|
1230
|
+
_feature_values: FeatureValue = models.ManyToManyField(
|
1231
|
+
FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
|
1232
|
+
)
|
1233
|
+
"""Non-categorical feature values for annotation."""
|
1234
|
+
_param_values: ParamValue = models.ManyToManyField(
|
1235
|
+
ParamValue, through="ArtifactParamValue", related_name="artifacts"
|
1236
|
+
)
|
1237
|
+
"""Parameter values."""
|
1238
|
+
_key_is_virtual: bool = BooleanField()
|
1239
|
+
"""Indicates whether `key` is virtual or part of an actual file path."""
|
1240
|
+
# be mindful that below, passing related_name="+" leads to errors
|
1241
|
+
_actions: Artifact = models.ManyToManyField(
|
1242
|
+
"self", symmetrical=False, related_name="_action_targets"
|
1243
|
+
)
|
1244
|
+
"""Actions to attach for the UI."""
|
1245
|
+
created_by: User = ForeignKey(
|
1246
|
+
"lamindb.User",
|
1247
|
+
PROTECT,
|
1248
|
+
default=current_user_id,
|
1249
|
+
related_name="created_artifacts",
|
1250
|
+
editable=False,
|
1251
|
+
)
|
1252
|
+
"""Creator of record."""
|
1253
|
+
_overwrite_versions: bool = BooleanField(default=None)
|
1254
|
+
"""Indicates whether to store or overwrite versions.
|
1255
|
+
|
1256
|
+
It defaults to False for file-like artifacts and to True for folder-like artifacts.
|
1257
|
+
"""
|
1258
|
+
projects: Project
|
1259
|
+
"""Linked projects."""
|
1260
|
+
references: Reference
|
1261
|
+
"""Linked references."""
|
1262
|
+
|
1263
|
+
@overload
|
1264
|
+
def __init__(
|
1265
|
+
self,
|
1266
|
+
# we're not choosing the name "path" for this arg because
|
1267
|
+
# it'd be confusing with `artifact.path`, which is not the same
|
1268
|
+
# so "data" conveys better that this is input data that's ingested
|
1269
|
+
# and will be moved to a target path at `artifact.path`
|
1270
|
+
# also internally, we sometimes pass "data objects" like a DataFrame
|
1271
|
+
# here; and we might refactor this but we might also keep that internal
|
1272
|
+
# usage
|
1273
|
+
data: UPathStr,
|
1274
|
+
kind: ArtifactKind | None = None,
|
1275
|
+
key: str | None = None,
|
1276
|
+
description: str | None = None,
|
1277
|
+
revises: Artifact | None = None,
|
1278
|
+
run: Run | None = None,
|
1279
|
+
): ...
|
1280
|
+
|
1281
|
+
@overload
|
1282
|
+
def __init__(
|
1283
|
+
self,
|
1284
|
+
*db_args,
|
1285
|
+
): ...
|
1286
|
+
|
1287
|
+
def __init__(
|
1288
|
+
self,
|
1289
|
+
*args,
|
1290
|
+
**kwargs,
|
1291
|
+
):
|
1292
|
+
self.features = FeatureManager(self) # type: ignore
|
1293
|
+
self.params = ParamManager(self) # type: ignore
|
1294
|
+
# Below checks for the Django-internal call in from_db()
|
1295
|
+
# it'd be better if we could avoid this, but not being able to create an Artifact
|
1296
|
+
# from data with the default constructor renders the central class of the API
|
1297
|
+
# essentially useless
|
1298
|
+
# The danger below is not that a user might pass that many args (there are 12 of them), but rather
|
1299
|
+
# that at some point the Django API might change; on the other hand, this
|
1300
|
+
# condition for calling the constructor based on kwargs should always
|
1301
|
+
# stay robust
|
1302
|
+
if len(args) == len(self._meta.concrete_fields):
|
1303
|
+
super().__init__(*args, **kwargs)
|
1304
|
+
return None
|
1305
|
+
# now we proceed with the user-facing constructor
|
1306
|
+
if len(args) > 1:
|
1307
|
+
raise ValueError("Only one non-keyword arg allowed: data")
|
1308
|
+
data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
|
1309
|
+
kind: str = kwargs.pop("kind", None)
|
1310
|
+
key: str | None = kwargs.pop("key", None)
|
1311
|
+
run: Run | None = kwargs.pop("run", None)
|
1312
|
+
description: str | None = kwargs.pop("description", None)
|
1313
|
+
revises: Artifact | None = kwargs.pop("revises", None)
|
1314
|
+
version: str | None = kwargs.pop("version", None)
|
1315
|
+
if "visibility" in kwargs: # backward compat
|
1316
|
+
_branch_code = kwargs.pop("visibility")
|
1317
|
+
elif "_branch_code" in kwargs:
|
1318
|
+
_branch_code = kwargs.pop("_branch_code")
|
1319
|
+
else:
|
1320
|
+
_branch_code = 1
|
1321
|
+
format = kwargs.pop("format", None)
|
1322
|
+
_is_internal_call = kwargs.pop("_is_internal_call", False)
|
1323
|
+
skip_check_exists = kwargs.pop("skip_check_exists", False)
|
1324
|
+
if "default_storage" in kwargs:
|
1325
|
+
default_storage = kwargs.pop("default_storage")
|
1326
|
+
else:
|
1327
|
+
if setup_settings.instance.keep_artifacts_local:
|
1328
|
+
default_storage = setup_settings.instance.storage_local.record
|
1329
|
+
else:
|
1330
|
+
default_storage = setup_settings.instance.storage.record
|
1331
|
+
using_key = kwargs.pop("using_key", None)
|
1332
|
+
otype = kwargs.pop("otype") if "otype" in kwargs else None
|
1333
|
+
otype = _check_otype_artifact(data=data, otype=otype)
|
1334
|
+
if "type" in kwargs:
|
1335
|
+
logger.warning("`type` will be removed soon, please use `kind`")
|
1336
|
+
kind = kwargs.pop("type")
|
1337
|
+
if not len(kwargs) == 0:
|
1338
|
+
valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
|
1339
|
+
raise FieldValidationError(
|
1340
|
+
f"Only {valid_keywords} can be passed, you passed: {kwargs}"
|
1341
|
+
)
|
1342
|
+
if revises is not None and key is not None and revises.key != key:
|
1343
|
+
note = message_update_key_in_version_family(
|
1344
|
+
suid=revises.stem_uid,
|
1345
|
+
existing_key=revises.key,
|
1346
|
+
new_key=key,
|
1347
|
+
registry="Artifact",
|
1348
|
+
)
|
1349
|
+
raise ValueError(
|
1350
|
+
f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
|
1351
|
+
)
|
1352
|
+
if revises is not None:
|
1353
|
+
if not isinstance(revises, Artifact):
|
1354
|
+
raise TypeError("`revises` has to be of type `Artifact`")
|
1355
|
+
if description is None:
|
1356
|
+
description = revises.description
|
1357
|
+
if key is not None and AUTO_KEY_PREFIX in key:
|
1358
|
+
raise ValueError(
|
1359
|
+
f"Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
|
1360
|
+
)
|
1361
|
+
# below is for internal calls that require defining the storage location
|
1362
|
+
# ahead of constructing the Artifact
|
1363
|
+
if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
|
1364
|
+
if _is_internal_call:
|
1365
|
+
is_automanaged_path = True
|
1366
|
+
user_provided_key = key
|
1367
|
+
key = None
|
1368
|
+
else:
|
1369
|
+
raise ValueError(
|
1370
|
+
f"Do not pass path inside the `{AUTO_KEY_PREFIX}` directory."
|
1371
|
+
)
|
1372
|
+
else:
|
1373
|
+
is_automanaged_path = False
|
1374
|
+
provisional_uid, revises = create_uid(revises=revises, version=version)
|
1375
|
+
kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
|
1376
|
+
data=data,
|
1377
|
+
key=key,
|
1378
|
+
run=run,
|
1379
|
+
format=format,
|
1380
|
+
provisional_uid=provisional_uid,
|
1381
|
+
version=version,
|
1382
|
+
default_storage=default_storage,
|
1383
|
+
using_key=using_key,
|
1384
|
+
skip_check_exists=skip_check_exists,
|
1385
|
+
)
|
1386
|
+
|
1387
|
+
# an object with the same hash already exists
|
1388
|
+
if isinstance(kwargs_or_artifact, Artifact):
|
1389
|
+
from .record import init_self_from_db, update_attributes
|
1390
|
+
|
1391
|
+
init_self_from_db(self, kwargs_or_artifact)
|
1392
|
+
# adding "key" here is dangerous because key might be auto-populated
|
1393
|
+
attr_to_update = {"description": description}
|
1394
|
+
if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None:
|
1395
|
+
attr_to_update["key"] = key
|
1396
|
+
elif self.key != key and key is not None:
|
1397
|
+
logger.warning(
|
1398
|
+
f"key {self.key} on existing artifact differs from passed key {key}"
|
1399
|
+
)
|
1400
|
+
update_attributes(self, attr_to_update)
|
1401
|
+
return None
|
1402
|
+
else:
|
1403
|
+
kwargs = kwargs_or_artifact
|
1404
|
+
|
1405
|
+
if revises is None:
|
1406
|
+
revises = kwargs_or_artifact.pop("revises")
|
1407
|
+
|
1408
|
+
if data is not None:
|
1409
|
+
self._local_filepath = privates["local_filepath"]
|
1410
|
+
self._cloud_filepath = privates["cloud_filepath"]
|
1411
|
+
self._memory_rep = privates["memory_rep"]
|
1412
|
+
self._to_store = not privates["check_path_in_storage"]
|
1413
|
+
|
1414
|
+
if is_automanaged_path and _is_internal_call:
|
1415
|
+
kwargs["_key_is_virtual"] = True
|
1416
|
+
assert AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101
|
1417
|
+
uid = (
|
1418
|
+
kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
|
1419
|
+
)
|
1420
|
+
kwargs["key"] = user_provided_key
|
1421
|
+
if revises is not None:
|
1422
|
+
assert uid.startswith(revises.stem_uid) # noqa: S101
|
1423
|
+
if len(uid) == 16:
|
1424
|
+
if revises is None:
|
1425
|
+
uid += "0000"
|
1426
|
+
else:
|
1427
|
+
uid, revises = create_uid(revises=revises, version=version)
|
1428
|
+
kwargs["uid"] = uid
|
1429
|
+
|
1430
|
+
# only set key now so that we don't do a look-up on it in case revises is passed
|
1431
|
+
if revises is not None:
|
1432
|
+
kwargs["key"] = revises.key
|
1433
|
+
|
1434
|
+
kwargs["kind"] = kind
|
1435
|
+
kwargs["version"] = version
|
1436
|
+
kwargs["description"] = description
|
1437
|
+
kwargs["_branch_code"] = _branch_code
|
1438
|
+
kwargs["otype"] = otype
|
1439
|
+
kwargs["revises"] = revises
|
1440
|
+
# this check needs to come down here because key might be populated from an
|
1441
|
+
# existing file path during get_artifact_kwargs_from_data()
|
1442
|
+
if (
|
1443
|
+
kwargs["key"] is None
|
1444
|
+
and kwargs["description"] is None
|
1445
|
+
and kwargs["run"] is None
|
1446
|
+
):
|
1447
|
+
raise ValueError("Pass one of key, run or description as a parameter")
|
1448
|
+
|
1449
|
+
super().__init__(**kwargs)
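# Hedged usage sketch for the constructor above (path and key are hypothetical):
# create an artifact from a local file and register it under a storage key.
# One of `key`, `description`, or `run` must be provided.
import lamindb as ln

artifact = ln.Artifact(
    "./myfile.csv",             # local file; copied into managed storage on save
    key="examples/myfile.csv",  # virtual key within the default storage location
    description="an example file",
)
artifact.save()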
|
1450
|
+
|
1451
|
+
@property
|
1452
|
+
@deprecated("kind")
|
1453
|
+
def type(self) -> str:
|
1454
|
+
return self.kind
|
1455
|
+
|
1456
|
+
@property
|
1457
|
+
@deprecated("otype")
|
1458
|
+
def _accessor(self) -> str:
|
1459
|
+
return self.otype
|
1460
|
+
|
1461
|
+
@property
|
1462
|
+
def transform(self) -> Transform | None:
|
1463
|
+
"""Transform whose run created the artifact."""
|
1464
|
+
return self.run.transform if self.run is not None else None
|
1465
|
+
|
1466
|
+
@property
|
1467
|
+
@deprecated("n_files")
|
1468
|
+
def n_objects(self) -> int:
|
1469
|
+
return self.n_files
|
1470
|
+
|
1471
|
+
# add the below because this is what people will have in their code
|
1472
|
+
# if they implement the recommended migration strategy
|
1473
|
+
# - FeatureSet -> Schema
|
1474
|
+
# - featureset -> schema
|
1475
|
+
# - feature_set -> schema
|
1476
|
+
# @property
|
1477
|
+
# def schemas(self) -> QuerySet[Schema]:
|
1478
|
+
# """Schemas linked to artifact via many-to-many relationship.
|
1479
|
+
|
1480
|
+
# Is now mediating the private `.feature_sets` relationship during
|
1481
|
+
# a transition period to better schema management.
|
1482
|
+
|
1483
|
+
# .. versionchanged: 1.0
|
1484
|
+
# Was previously called `.feature_sets`.
|
1485
|
+
|
1486
|
+
# """
|
1487
|
+
# return self.feature_sets
|
1488
|
+
|
1489
|
+
@property
|
1490
|
+
def path(self) -> Path:
|
1491
|
+
"""Path.
|
1492
|
+
|
1493
|
+
File in cloud storage, here AWS S3:
|
1494
|
+
|
1495
|
+
>>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
|
1496
|
+
>>> artifact.path
|
1497
|
+
S3QueryPath('s3://my-bucket/my-file.csv')
|
1498
|
+
|
1499
|
+
File in local storage:
|
1500
|
+
|
1501
|
+
>>> ln.Artifact("./myfile.csv", key="myfile.csv").save()
|
1502
|
+
>>> artifact.path
|
1503
|
+
PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv')
|
1504
|
+
"""
|
1505
|
+
from lamindb import settings
|
1506
|
+
|
1507
|
+
filepath, _ = filepath_from_artifact(self, using_key=settings._using_key)
|
1508
|
+
return filepath
|
1509
|
+
|
1510
|
+
@property
|
1511
|
+
def _cache_path(self) -> UPath:
|
1512
|
+
from lamindb import settings
|
1513
|
+
|
1514
|
+
filepath, cache_key = filepath_cache_key_from_artifact(
|
1515
|
+
self, using_key=settings._using_key
|
1516
|
+
)
|
1517
|
+
if isinstance(filepath, LocalPathClasses):
|
1518
|
+
return filepath
|
1519
|
+
return setup_settings.paths.cloud_to_local_no_update(
|
1520
|
+
filepath, cache_key=cache_key
|
1521
|
+
)
|
1522
|
+
|
1523
|
+
@classmethod
|
1524
|
+
def from_df(
|
1525
|
+
cls,
|
1526
|
+
df: pd.DataFrame,
|
1527
|
+
*,
|
1528
|
+
key: str | None = None,
|
1529
|
+
description: str | None = None,
|
1530
|
+
run: Run | None = None,
|
1531
|
+
revises: Artifact | None = None,
|
1532
|
+
**kwargs,
|
1533
|
+
) -> Artifact:
|
1534
|
+
"""Create from `DataFrame`, validate & link features.
|
1535
|
+
|
1536
|
+
Args:
|
1537
|
+
df: A `DataFrame` object.
|
1538
|
+
key: A relative path within default storage,
|
1539
|
+
e.g., `"myfolder/myfile.parquet"`.
|
1540
|
+
description: A description.
|
1541
|
+
revises: An old version of the artifact.
|
1542
|
+
run: The run that creates the artifact.
|
1543
|
+
|
1544
|
+
See Also:
|
1545
|
+
:meth:`~lamindb.Collection`
|
1546
|
+
Track collections.
|
1547
|
+
:class:`~lamindb.Feature`
|
1548
|
+
Track features.
|
1549
|
+
|
1550
|
+
Examples:
|
1551
|
+
>>> df = ln.core.datasets.df_iris_in_meter_batch1()
|
1552
|
+
>>> df.head()
|
1553
|
+
sepal_length sepal_width petal_length petal_width iris_organism_code
|
1554
|
+
0 0.051 0.035 0.014 0.002 0
|
1555
|
+
1 0.049 0.030 0.014 0.002 0
|
1556
|
+
2 0.047 0.032 0.013 0.002 0
|
1557
|
+
3 0.046 0.031 0.015 0.002 0
|
1558
|
+
4 0.050 0.036 0.014 0.002 0
|
1559
|
+
>>> artifact = ln.Artifact.from_df(df, description="Iris flower collection batch1")
|
1560
|
+
>>> artifact.save()
|
1561
|
+
"""
|
1562
|
+
artifact = Artifact( # type: ignore
|
1563
|
+
data=df,
|
1564
|
+
key=key,
|
1565
|
+
run=run,
|
1566
|
+
description=description,
|
1567
|
+
revises=revises,
|
1568
|
+
otype="DataFrame",
|
1569
|
+
kind="dataset",
|
1570
|
+
**kwargs,
|
1571
|
+
)
|
1572
|
+
artifact.n_observations = len(df)
|
1573
|
+
return artifact
|
1574
|
+
|
1575
|
+
@classmethod
|
1576
|
+
def from_anndata(
|
1577
|
+
cls,
|
1578
|
+
adata: Union[AnnData, UPathStr],
|
1579
|
+
*,
|
1580
|
+
key: str | None = None,
|
1581
|
+
description: str | None = None,
|
1582
|
+
run: Run | None = None,
|
1583
|
+
revises: Artifact | None = None,
|
1584
|
+
**kwargs,
|
1585
|
+
) -> Artifact:
|
1586
|
+
"""Create from ``AnnData``, validate & link features.
|
1587
|
+
|
1588
|
+
Args:
|
1589
|
+
adata: An `AnnData` object or a path to an AnnData-like object.
|
1590
|
+
key: A relative path within default storage,
|
1591
|
+
e.g., `"myfolder/myfile.h5ad"`.
|
1592
|
+
description: A description.
|
1593
|
+
revises: An old version of the artifact.
|
1594
|
+
run: The run that creates the artifact.
|
1595
|
+
|
1596
|
+
See Also:
|
1597
|
+
|
1598
|
+
:meth:`~lamindb.Collection`
|
1599
|
+
Track collections.
|
1600
|
+
:class:`~lamindb.Feature`
|
1601
|
+
Track features.
|
1602
|
+
|
1603
|
+
Examples:
|
1604
|
+
>>> import bionty as bt
|
1605
|
+
>>> bt.settings.organism = "human"
|
1606
|
+
>>> adata = ln.core.datasets.anndata_with_obs()
|
1607
|
+
>>> artifact = ln.Artifact.from_anndata(adata, description="mini anndata with obs")
|
1608
|
+
>>> artifact.save()
|
1609
|
+
"""
|
1610
|
+
if not data_is_anndata(adata):
|
1611
|
+
raise ValueError(
|
1612
|
+
"data has to be an AnnData object or a path to AnnData-like"
|
1613
|
+
)
|
1614
|
+
_anndata_n_observations(adata)
|
1615
|
+
artifact = Artifact( # type: ignore
|
1616
|
+
data=adata,
|
1617
|
+
key=key,
|
1618
|
+
run=run,
|
1619
|
+
description=description,
|
1620
|
+
revises=revises,
|
1621
|
+
otype="AnnData",
|
1622
|
+
kind="dataset",
|
1623
|
+
**kwargs,
|
1624
|
+
)
|
1625
|
+
# this is done instead of _anndata_n_observations(adata)
|
1626
|
+
# because we need a proper path through create_path for cloud paths
|
1627
|
+
# for additional upath options etc that create_path adds
|
1628
|
+
obj_for_obs: AnnData | UPath
|
1629
|
+
if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
|
1630
|
+
obj_for_obs = artifact._memory_rep
|
1631
|
+
else:
|
1632
|
+
# returns ._local_filepath for local files
|
1633
|
+
# and the proper path through create_path for cloud paths
|
1634
|
+
obj_for_obs = artifact.path
|
1635
|
+
artifact.n_observations = _anndata_n_observations(obj_for_obs)
|
1636
|
+
return artifact
|
1637
|
+
|
1638
|
+
@classmethod
|
1639
|
+
def from_mudata(
|
1640
|
+
cls,
|
1641
|
+
mdata: Union[MuData, UPathStr],
|
1642
|
+
*,
|
1643
|
+
key: str | None = None,
|
1644
|
+
description: str | None = None,
|
1645
|
+
run: Run | None = None,
|
1646
|
+
revises: Artifact | None = None,
|
1647
|
+
**kwargs,
|
1648
|
+
) -> Artifact:
|
1649
|
+
"""Create from ``MuData``, validate & link features.
|
1650
|
+
|
1651
|
+
Args:
|
1652
|
+
mdata: A `MuData` object or a path to a MuData-like object.
|
1653
|
+
key: A relative path within default storage,
|
1654
|
+
e.g., `"myfolder/myfile.h5mu"`.
|
1655
|
+
description: A description.
|
1656
|
+
revises: An old version of the artifact.
|
1657
|
+
run: The run that creates the artifact.
|
1658
|
+
|
1659
|
+
See Also:
|
1660
|
+
:meth:`~lamindb.Collection`
|
1661
|
+
Track collections.
|
1662
|
+
:class:`~lamindb.Feature`
|
1663
|
+
Track features.
|
1664
|
+
|
1665
|
+
Examples:
|
1666
|
+
>>> import bionty as bt
|
1667
|
+
>>> bt.settings.organism = "human"
|
1668
|
+
>>> mdata = ln.core.datasets.mudata_papalexi21_subset()
|
1669
|
+
>>> artifact = ln.Artifact.from_mudata(mdata, description="a mudata object")
|
1670
|
+
>>> artifact.save()
|
1671
|
+
"""
|
1672
|
+
if not data_is_mudata(mdata):
|
1673
|
+
raise ValueError("data has to be a MuData object or a path to MuData-like")
|
1674
|
+
artifact = Artifact( # type: ignore
|
1675
|
+
data=mdata,
|
1676
|
+
key=key,
|
1677
|
+
run=run,
|
1678
|
+
description=description,
|
1679
|
+
revises=revises,
|
1680
|
+
otype="MuData",
|
1681
|
+
kind="dataset",
|
1682
|
+
**kwargs,
|
1683
|
+
)
|
1684
|
+
if not isinstance(mdata, UPathStr):
|
1685
|
+
artifact.n_observations = mdata.n_obs
|
1686
|
+
return artifact
|
1687
|
+
|
1688
|
+
@classmethod
|
1689
|
+
def from_spatialdata(
|
1690
|
+
cls,
|
1691
|
+
sdata: Union[SpatialData, UPathStr],
|
1692
|
+
*,
|
1693
|
+
key: str | None = None,
|
1694
|
+
description: str | None = None,
|
1695
|
+
run: Run | None = None,
|
1696
|
+
revises: Artifact | None = None,
|
1697
|
+
**kwargs,
|
1698
|
+
) -> Artifact:
|
1699
|
+
"""Create from ``SpatialData``, validate & link features.
|
1700
|
+
|
1701
|
+
Args:
|
1702
|
+
sdata: A `SpatialData` object or a path to a SpatialData-like object.
|
1703
|
+
key: A relative path within default storage,
|
1704
|
+
e.g., `"myfolder/myfile.zarr"`.
|
1705
|
+
description: A description.
|
1706
|
+
revises: An old version of the artifact.
|
1707
|
+
run: The run that creates the artifact.
|
1708
|
+
|
1709
|
+
See Also:
|
1710
|
+
:meth:`~lamindb.Collection`
|
1711
|
+
Track collections.
|
1712
|
+
:class:`~lamindb.Feature`
|
1713
|
+
Track features.
|
1714
|
+
|
1715
|
+
Examples:
|
1716
|
+
>>> artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr")
|
1717
|
+
"""
|
1718
|
+
if not data_is_spatialdata(sdata):
|
1719
|
+
raise ValueError(
|
1720
|
+
"data has to be a SpatialData object or a path to SpatialData-like"
|
1721
|
+
)
|
1722
|
+
artifact = Artifact( # type: ignore
|
1723
|
+
data=sdata,
|
1724
|
+
key=key,
|
1725
|
+
run=run,
|
1726
|
+
description=description,
|
1727
|
+
revises=revises,
|
1728
|
+
otype="SpatialData",
|
1729
|
+
kind="dataset",
|
1730
|
+
**kwargs,
|
1731
|
+
)
|
1732
|
+
# ill-defined https://scverse.zulipchat.com/#narrow/channel/315824-spatial/topic/How.20to.20calculate.20the.20number.20of.20observations.3F
|
1733
|
+
# artifact.n_observations = ...
|
1734
|
+
return artifact
|
1735
|
+
|
1736
|
+
@classmethod
|
1737
|
+
def from_tiledbsoma(
|
1738
|
+
cls,
|
1739
|
+
path: UPathStr,
|
1740
|
+
*,
|
1741
|
+
key: str | None = None,
|
1742
|
+
description: str | None = None,
|
1743
|
+
run: Run | None = None,
|
1744
|
+
revises: Artifact | None = None,
|
1745
|
+
**kwargs,
|
1746
|
+
) -> Artifact:
|
1747
|
+
"""Create from a tiledbsoma store.
|
1748
|
+
|
1749
|
+
Args:
|
1750
|
+
path: A tiledbsoma store with .tiledbsoma suffix.
|
1751
|
+
key: A relative path within default storage,
|
1752
|
+
e.g., `"myfolder/mystore.tiledbsoma"`.
|
1753
|
+
description: A description.
|
1754
|
+
revises: An old version of the artifact.
|
1755
|
+
run: The run that creates the artifact.
|
1756
|
+
|
1757
|
+
Examples:
|
1758
|
+
>>> artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store")
|
1759
|
+
>>> artifact.save()
|
1760
|
+
"""
|
1761
|
+
if UPath(path).suffix != ".tiledbsoma":
|
1762
|
+
raise ValueError(
|
1763
|
+
"A tiledbsoma store should have .tiledbsoma suffix to be registered."
|
1764
|
+
)
|
1765
|
+
artifact = Artifact( # type: ignore
|
1766
|
+
data=path,
|
1767
|
+
key=key,
|
1768
|
+
run=run,
|
1769
|
+
description=description,
|
1770
|
+
revises=revises,
|
1771
|
+
otype="tiledbsoma",
|
1772
|
+
kind="dataset",
|
1773
|
+
**kwargs,
|
1774
|
+
)
|
1775
|
+
artifact.n_observations = _soma_n_observations(artifact.path)
|
1776
|
+
return artifact
|
1777
|
+
|
1778
|
+
@classmethod
|
1779
|
+
def from_dir(
|
1780
|
+
cls,
|
1781
|
+
path: UPathStr,
|
1782
|
+
*,
|
1783
|
+
key: str | None = None,
|
1784
|
+
run: Run | None = None,
|
1785
|
+
) -> list[Artifact]:
|
1786
|
+
"""Create a list of artifact objects from a directory.
|
1787
|
+
|
1788
|
+
Hint:
|
1789
|
+
If you have a high number of files (several 100k) and don't want to
|
1790
|
+
track them individually, create a single :class:`~lamindb.Artifact` via
|
1791
|
+
``Artifact(path)`` for them. See, e.g., :doc:`docs:rxrx`.
|
1792
|
+
|
1793
|
+
Args:
|
1794
|
+
path: Source path of folder.
|
1795
|
+
key: Key for storage destination. If `None` and
|
1796
|
+
directory is in a registered location, the inferred `key` will
|
1797
|
+
reflect the relative position. If `None` and directory is outside
|
1798
|
+
of a registered storage location, the inferred key defaults to `path.name`.
|
1799
|
+
run: A `Run` object.
|
1800
|
+
|
1801
|
+
Examples:
|
1802
|
+
>>> dir_path = ln.core.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
|
1803
|
+
>>> artifacts = ln.Artifact.from_dir(dir_path)
|
1804
|
+
>>> ln.save(artifacts)
|
1805
|
+
"""
|
1806
|
+
from lamindb import settings
|
1807
|
+
|
1808
|
+
folderpath: UPath = create_path(path) # returns Path for local
|
1809
|
+
default_storage = settings.storage.record
|
1810
|
+
using_key = settings._using_key
|
1811
|
+
storage, use_existing_storage = process_pathlike(
|
1812
|
+
folderpath, default_storage, using_key
|
1813
|
+
)
|
1814
|
+
folder_key_path: PurePath | Path
|
1815
|
+
if key is None:
|
1816
|
+
if not use_existing_storage:
|
1817
|
+
logger.warning(
|
1818
|
+
"folder is outside existing storage location, will copy files from"
|
1819
|
+
f" {path} to {storage.root}/{folderpath.name}"
|
1820
|
+
)
|
1821
|
+
folder_key_path = Path(folderpath.name)
|
1822
|
+
else:
|
1823
|
+
# maintain the hierarchy within an existing storage location
|
1824
|
+
folder_key_path = get_relative_path_to_directory(
|
1825
|
+
folderpath, UPath(storage.root)
|
1826
|
+
)
|
1827
|
+
else:
|
1828
|
+
folder_key_path = Path(key)
|
1829
|
+
|
1830
|
+
folder_key = folder_key_path.as_posix()
|
1831
|
+
# silence fine-grained logging
|
1832
|
+
verbosity = settings.verbosity
|
1833
|
+
verbosity_int = settings._verbosity_int
|
1834
|
+
if verbosity_int >= 1:
|
1835
|
+
settings.verbosity = "warning"
|
1836
|
+
artifacts_dict = {}
|
1837
|
+
for filepath in folderpath.rglob("*"):
|
1838
|
+
if filepath.is_file():
|
1839
|
+
relative_path = get_relative_path_to_directory(filepath, folderpath)
|
1840
|
+
artifact_key = folder_key + "/" + relative_path.as_posix()
|
1841
|
+
# if creating from rglob, we don't need to check for existence
|
1842
|
+
artifact = Artifact(
|
1843
|
+
filepath, run=run, key=artifact_key, skip_check_exists=True
|
1844
|
+
)
|
1845
|
+
artifacts_dict[artifact.uid] = artifact
|
1846
|
+
settings.verbosity = verbosity
|
1847
|
+
|
1848
|
+
# run sanity check on hashes
|
1849
|
+
hashes = [
|
1850
|
+
artifact.hash
|
1851
|
+
for artifact in artifacts_dict.values()
|
1852
|
+
if artifact.hash is not None
|
1853
|
+
]
|
1854
|
+
uids = artifacts_dict.keys()
|
1855
|
+
n_unique_hashes = len(set(hashes))
|
1856
|
+
if n_unique_hashes == len(hashes):
|
1857
|
+
artifacts = list(artifacts_dict.values())
|
1858
|
+
else:
|
1859
|
+
# consider exact duplicates (same id, same hash)
|
1860
|
+
# below can't happen anymore because artifacts is a dict now
|
1861
|
+
# if len(set(uids)) == len(set(hashes)):
|
1862
|
+
# logger.warning("dropping duplicate records in list of artifact records")
|
1863
|
+
# artifacts = list(set(uids))
|
1864
|
+
# consider false duplicates (different id, same hash)
|
1865
|
+
if not len(set(uids)) == n_unique_hashes:
|
1866
|
+
seen_hashes = set()
|
1867
|
+
non_unique_artifacts = {
|
1868
|
+
hash: artifact
|
1869
|
+
for hash, artifact in artifacts_dict.items()
|
1870
|
+
if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore
|
1871
|
+
}
|
1872
|
+
display_non_unique = "\n ".join(
|
1873
|
+
f"{artifact}" for artifact in non_unique_artifacts
|
1874
|
+
)
|
1875
|
+
logger.warning(
|
1876
|
+
"there are multiple artifact uids with the same hashes, dropping"
|
1877
|
+
f" {len(non_unique_artifacts)} duplicates out of"
|
1878
|
+
f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
|
1879
|
+
)
|
1880
|
+
artifacts = [
|
1881
|
+
artifact
|
1882
|
+
for artifact in artifacts_dict.values()
|
1883
|
+
if artifact not in non_unique_artifacts.values()
|
1884
|
+
]
|
1885
|
+
logger.success(
|
1886
|
+
f"created {len(artifacts)} artifacts from directory using storage"
|
1887
|
+
f" {storage.root} and key = {folder_key}/"
|
1888
|
+
)
|
1889
|
+
return artifacts
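# Hedged sketch contrasting the two ways of registering a directory (the
# directory path and key are hypothetical): `from_dir` yields one artifact per
# file, whereas passing the directory to the constructor yields a single
# folder-like artifact.
import lamindb as ln

per_file_artifacts = ln.Artifact.from_dir("./sample_001/")  # list[Artifact]
ln.save(per_file_artifacts)

folder_artifact = ln.Artifact("./sample_001/", key="samples/sample_001")
folder_artifact.save()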
|
1890
|
+
|
1891
|
+
def replace(
|
1892
|
+
self,
|
1893
|
+
data: Union[UPathStr, pd.DataFrame, AnnData, MuData],
|
1894
|
+
run: Run | None = None,
|
1895
|
+
format: str | None = None,
|
1896
|
+
) -> None:
|
1897
|
+
"""Replace artifact content.
|
1898
|
+
|
1899
|
+
Args:
|
1900
|
+
data: A file path or an in-memory object such as a `DataFrame`, `AnnData`, or `MuData`.
|
1901
|
+
run: The run that replaces the artifact content; it gets
|
1902
|
+
auto-linked if ``ln.track()`` was called.
|
1903
|
+
|
1904
|
+
Examples:
|
1905
|
+
Say we made a change to the content of an artifact, e.g., edited the image
|
1906
|
+
`paradisi05_laminopathic_nuclei.jpg`.
|
1907
|
+
|
1908
|
+
This is how we replace the old file in storage with the new file:
|
1909
|
+
|
1910
|
+
>>> artifact.replace("paradisi05_laminopathic_nuclei.jpg")
|
1911
|
+
>>> artifact.save()
|
1912
|
+
|
1913
|
+
Note that this neither changes the storage key nor the filename.
|
1914
|
+
|
1915
|
+
However, it will update the suffix if it changes.
|
1916
|
+
"""
|
1917
|
+
from lamindb import settings
|
1918
|
+
|
1919
|
+
default_storage = settings.storage.record
|
1920
|
+
kwargs, privates = get_artifact_kwargs_from_data(
|
1921
|
+
provisional_uid=self.uid,
|
1922
|
+
data=data,
|
1923
|
+
key=self.key,
|
1924
|
+
run=run,
|
1925
|
+
format=format,
|
1926
|
+
default_storage=default_storage,
|
1927
|
+
version=None,
|
1928
|
+
is_replace=True,
|
1929
|
+
)
|
1930
|
+
|
1931
|
+
# this artifact already exists
|
1932
|
+
if privates is None:
|
1933
|
+
return kwargs
|
1934
|
+
|
1935
|
+
check_path_in_storage = privates["check_path_in_storage"]
|
1936
|
+
if check_path_in_storage:
|
1937
|
+
err_msg = (
|
1938
|
+
"Can only replace with a local path not in any Storage. "
|
1939
|
+
f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}."
|
1940
|
+
)
|
1941
|
+
raise ValueError(err_msg)
|
1942
|
+
|
1943
|
+
_overwrite_versions = kwargs["_overwrite_versions"]
|
1944
|
+
if self._overwrite_versions != _overwrite_versions:
|
1945
|
+
err_msg = "It is not allowed to replace "
|
1946
|
+
err_msg += "a folder" if self._overwrite_versions else "a file"
|
1947
|
+
err_msg += " with " + ("a folder." if _overwrite_versions else "a file.")
|
1948
|
+
raise ValueError(err_msg)
|
1949
|
+
|
1950
|
+
if self.key is not None and not self._key_is_virtual:
|
1951
|
+
key_path = PurePosixPath(self.key)
|
1952
|
+
new_filename = f"{key_path.stem}{kwargs['suffix']}"
|
1953
|
+
# the following will only be true if the suffix changes!
|
1954
|
+
if key_path.name != new_filename:
|
1955
|
+
self._clear_storagekey = self.key
|
1956
|
+
self.key = str(key_path.with_name(new_filename))
|
1957
|
+
# update old key with the new one so that checks in record pass
|
1958
|
+
self._old_key = self.key
|
1959
|
+
logger.warning(
|
1960
|
+
f"replacing the file will replace key '{key_path}' with '{self.key}'"
|
1961
|
+
f" and delete '{key_path}' upon `save()`"
|
1962
|
+
)
|
1963
|
+
else:
|
1964
|
+
old_storage = auto_storage_key_from_artifact(self)
|
1965
|
+
is_dir = self.n_files is not None
|
1966
|
+
new_storage = auto_storage_key_from_artifact_uid(
|
1967
|
+
self.uid, kwargs["suffix"], is_dir
|
1968
|
+
)
|
1969
|
+
if old_storage != new_storage:
|
1970
|
+
self._clear_storagekey = old_storage
|
1971
|
+
if self.key is not None:
|
1972
|
+
new_key_path = PurePosixPath(self.key).with_suffix(kwargs["suffix"])
|
1973
|
+
self.key = str(new_key_path)
|
1974
|
+
# update old key with the new one so that checks in record pass
|
1975
|
+
self._old_key = self.key
|
1976
|
+
|
1977
|
+
self.suffix = kwargs["suffix"]
|
1978
|
+
self.size = kwargs["size"]
|
1979
|
+
self.hash = kwargs["hash"]
|
1980
|
+
self._hash_type = kwargs["_hash_type"]
|
1981
|
+
self.run_id = kwargs["run_id"]
|
1982
|
+
self.run = kwargs["run"]
|
1983
|
+
self.n_files = kwargs["n_files"]
|
1984
|
+
|
1985
|
+
self._local_filepath = privates["local_filepath"]
|
1986
|
+
self._cloud_filepath = privates["cloud_filepath"]
|
1987
|
+
self._memory_rep = privates["memory_rep"]
|
1988
|
+
# no need to upload if new file is already in storage
|
1989
|
+
self._to_store = not check_path_in_storage
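# Hedged sketch of the suffix handling above (file names hypothetical): replacing
# a `.jpg` artifact with a `.png` file updates the suffix, and for non-virtual
# keys the storage key is renamed (with a warning) upon save().
import lamindb as ln

artifact = ln.Artifact.get(key="images/nuclei.jpg")
artifact.replace("./nuclei_corrected.png")
artifact.save()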
|
1990
|
+
|
1991
|
+
def open(
|
1992
|
+
self, mode: str = "r", is_run_input: bool | None = None, **kwargs
|
1993
|
+
) -> Union[
|
1994
|
+
AnnDataAccessor,
|
1995
|
+
BackedAccessor,
|
1996
|
+
SOMACollection,
|
1997
|
+
SOMAExperiment,
|
1998
|
+
SOMAMeasurement,
|
1999
|
+
PyArrowDataset,
|
2000
|
+
]:
|
2001
|
+
"""Return a cloud-backed data object.
|
2002
|
+
|
2003
|
+
Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
|
2004
|
+
`tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats.
|
2005
|
+
|
2006
|
+
Args:
|
2007
|
+
mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
|
2008
|
+
otherwise should always be `"r"` (read-only mode).
|
2009
|
+
|
2010
|
+
Notes:
|
2011
|
+
For more info, see tutorial: :doc:`/arrays`.
|
2012
|
+
|
2013
|
+
Examples:
|
2014
|
+
|
2015
|
+
Read AnnData in backed mode from cloud:
|
2016
|
+
|
2017
|
+
>>> artifact = ln.Artifact.get(key="lndb-storage/pbmc68k.h5ad")
|
2018
|
+
>>> artifact.open()
|
2019
|
+
AnnDataAccessor object with n_obs × n_vars = 70 × 765
|
2020
|
+
constructed for the AnnData object pbmc68k.h5ad
|
2021
|
+
...
|
2022
|
+
"""
|
2023
|
+
if self._overwrite_versions and not self.is_latest:
|
2024
|
+
raise ValueError(INCONSISTENT_STATE_MSG)
|
2025
|
+
# all hdf5 suffixes including gzipped
|
2026
|
+
h5_suffixes = [".h5", ".hdf5", ".h5ad"]
|
2027
|
+
h5_suffixes += [s + ".gz" for s in h5_suffixes]
|
2028
|
+
# ignore empty suffix for now
|
2029
|
+
suffixes = (
|
2030
|
+
(
|
2031
|
+
"",
|
2032
|
+
".zarr",
|
2033
|
+
".anndata.zarr",
|
2034
|
+
".tiledbsoma",
|
2035
|
+
)
|
2036
|
+
+ tuple(h5_suffixes)
|
2037
|
+
+ PYARROW_SUFFIXES
|
2038
|
+
+ tuple(
|
2039
|
+
s + ".gz" for s in PYARROW_SUFFIXES
|
2040
|
+
) # this doesn't work for externally gzipped files, REMOVE LATER
|
2041
|
+
)
|
2042
|
+
if self.suffix not in suffixes:
|
2043
|
+
raise ValueError(
|
2044
|
+
"Artifact should have a zarr, h5, tiledbsoma object"
|
2045
|
+
" or a compatible `pyarrow.dataset.dataset` directory"
|
2046
|
+
" as the underlying data, please use one of the following suffixes"
|
2047
|
+
f" for the object name: {', '.join(suffixes[1:])}."
|
2048
|
+
f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
|
2049
|
+
" (no mixing allowed)."
|
2050
|
+
)
|
2051
|
+
if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
|
2052
|
+
raise ValueError(
|
2053
|
+
"Only a tiledbsoma store can be openened with `mode!='r'`."
|
2054
|
+
)
|
2055
|
+
|
2056
|
+
from lamindb import settings
|
2057
|
+
from lamindb.core.storage._backed_access import (
|
2058
|
+
_track_writes_factory,
|
2059
|
+
backed_access,
|
2060
|
+
)
|
2061
|
+
|
2062
|
+
using_key = settings._using_key
|
2063
|
+
filepath, cache_key = filepath_cache_key_from_artifact(
|
2064
|
+
self, using_key=using_key
|
2065
|
+
)
|
2066
|
+
is_tiledbsoma_w = (
|
2067
|
+
filepath.name == "soma" or self.suffix == ".tiledbsoma"
|
2068
|
+
) and mode == "w"
|
2069
|
+
# consider the case where an object is already locally cached
|
2070
|
+
localpath = setup_settings.paths.cloud_to_local_no_update(
|
2071
|
+
filepath, cache_key=cache_key
|
2072
|
+
)
|
2073
|
+
if is_tiledbsoma_w:
|
2074
|
+
open_cache = False
|
2075
|
+
else:
|
2076
|
+
open_cache = not isinstance(
|
2077
|
+
filepath, LocalPathClasses
|
2078
|
+
) and not filepath.synchronize(localpath, just_check=True)
|
2079
|
+
if open_cache:
|
2080
|
+
try:
|
2081
|
+
access = backed_access(localpath, mode, using_key, **kwargs)
|
2082
|
+
except Exception as e:
|
2083
|
+
if isinstance(filepath, LocalPathClasses):
|
2084
|
+
raise e
|
2085
|
+
logger.warning(
|
2086
|
+
f"The cache might be corrupted: {e}. Trying to open directly."
|
2087
|
+
)
|
2088
|
+
access = backed_access(filepath, mode, using_key, **kwargs)
|
2089
|
+
# happens only if backed_access has been successful
|
2090
|
+
# delete the corrupted cache
|
2091
|
+
if localpath.is_dir():
|
2092
|
+
shutil.rmtree(localpath)
|
2093
|
+
else:
|
2094
|
+
localpath.unlink(missing_ok=True)
|
2095
|
+
else:
|
2096
|
+
access = backed_access(filepath, mode, using_key, **kwargs)
|
2097
|
+
if is_tiledbsoma_w:
|
2098
|
+
|
2099
|
+
def finalize():
|
2100
|
+
nonlocal self, filepath, localpath
|
2101
|
+
if not isinstance(filepath, LocalPathClasses):
|
2102
|
+
_, hash, _, _ = get_stat_dir_cloud(filepath)
|
2103
|
+
else:
|
2104
|
+
# this can be very slow
|
2105
|
+
_, hash, _, _ = hash_dir(filepath)
|
2106
|
+
if self.hash != hash:
|
2107
|
+
from .record import init_self_from_db
|
2108
|
+
|
2109
|
+
new_version = Artifact(
|
2110
|
+
filepath, revises=self, _is_internal_call=True
|
2111
|
+
).save()
|
2112
|
+
init_self_from_db(self, new_version)
|
2113
|
+
|
2114
|
+
if localpath != filepath and localpath.exists():
|
2115
|
+
shutil.rmtree(localpath)
|
2116
|
+
|
2117
|
+
access = _track_writes_factory(access, finalize)
|
2118
|
+
# only call if open is successful
|
2119
|
+
_track_run_input(self, is_run_input)
|
2120
|
+
return access
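# Hedged sketch of the write-mode branch above (store key hypothetical): only
# `.tiledbsoma` stores may be opened with mode="w"; closing the store re-hashes
# it and, if the content changed, `artifact` is updated to a new version.
import lamindb as ln

artifact = ln.Artifact.get(key="examples/store.tiledbsoma")
store = artifact.open(mode="w")  # SOMA object wrapped to track writes
# ... modify the store here ...
store.close()  # triggers the finalize() hook defined above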
|
2121
|
+
|
2122
|
+
def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
|
2123
|
+
"""Cache and load into memory.
|
2124
|
+
|
2125
|
+
See all :mod:`~lamindb.core.loaders`.
|
2126
|
+
|
2127
|
+
Examples:
|
2128
|
+
|
2129
|
+
Load a `DataFrame`-like artifact:
|
2130
|
+
|
2131
|
+
>>> artifact.load().head()
|
2132
|
+
sepal_length sepal_width petal_length petal_width iris_organism_code
|
2133
|
+
0 0.051 0.035 0.014 0.002 0
|
2134
|
+
1 0.049 0.030 0.014 0.002 0
|
2135
|
+
2 0.047 0.032 0.013 0.002 0
|
2136
|
+
3 0.046 0.031 0.015 0.002 0
|
2137
|
+
4 0.050 0.036 0.014 0.002 0
|
2138
|
+
|
2139
|
+
Load an `AnnData`-like artifact:
|
2140
|
+
|
2141
|
+
>>> artifact.load()
|
2142
|
+
AnnData object with n_obs × n_vars = 70 × 765
|
2143
|
+
|
2144
|
+
Fall back to :meth:`~lamindb.Artifact.cache` if no in-memory representation is configured:
|
2145
|
+
|
2146
|
+
>>> artifact.load()
|
2147
|
+
PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/.lamindb/jb7BY5UJoQVGMUOKiLcn.jpg')
|
2148
|
+
"""
|
2149
|
+
from lamindb import settings
|
2150
|
+
|
2151
|
+
if self._overwrite_versions and not self.is_latest:
|
2152
|
+
raise ValueError(INCONSISTENT_STATE_MSG)
|
2153
|
+
|
2154
|
+
if hasattr(self, "_memory_rep") and self._memory_rep is not None:
|
2155
|
+
access_memory = self._memory_rep
|
2156
|
+
else:
|
2157
|
+
filepath, cache_key = filepath_cache_key_from_artifact(
|
2158
|
+
self, using_key=settings._using_key
|
2159
|
+
)
|
2160
|
+
cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
|
2161
|
+
try:
|
2162
|
+
# cache_path is local so doesn't trigger any sync in load_to_memory
|
2163
|
+
access_memory = load_to_memory(cache_path, **kwargs)
|
2164
|
+
except Exception as e:
|
2165
|
+
# raise the exception if it comes from not having a correct loader
|
2166
|
+
# or if the original path is local
|
2167
|
+
if isinstance(e, NotImplementedError) or isinstance(
|
2168
|
+
filepath, LocalPathClasses
|
2169
|
+
):
|
2170
|
+
raise e
|
2171
|
+
logger.warning(
|
2172
|
+
f"The cache might be corrupted: {e}. Retrying to synchronize."
|
2173
|
+
)
|
2174
|
+
# delete the existing cache
|
2175
|
+
if cache_path.is_dir():
|
2176
|
+
shutil.rmtree(cache_path)
|
2177
|
+
else:
|
2178
|
+
cache_path.unlink(missing_ok=True)
|
2179
|
+
# download again and try to load into memory
|
2180
|
+
cache_path = _synchronize_cleanup_on_error(
|
2181
|
+
filepath, cache_key=cache_key
|
2182
|
+
)
|
2183
|
+
access_memory = load_to_memory(cache_path, **kwargs)
|
2184
|
+
# only call if load is successful
|
2185
|
+
_track_run_input(self, is_run_input)
|
2186
|
+
return access_memory
|
2187
|
+
|
2188
|
+
def cache(self, is_run_input: bool | None = None) -> Path:
|
2189
|
+
"""Download cloud artifact to local cache.
|
2190
|
+
|
2191
|
+
Follows syncing logic: only caches an artifact if it's outdated in the local cache.
|
2192
|
+
|
2193
|
+
Returns a path to a locally cached on-disk object (say a `.jpg` file).
|
2194
|
+
|
2195
|
+
Examples:
|
2196
|
+
|
2197
|
+
Sync file from cloud and return the local path of the cache:
|
2198
|
+
|
2199
|
+
>>> artifact.cache()
|
2200
|
+
PosixPath('/home/runner/work/Caches/lamindb/lamindb-ci/lndb-storage/pbmc68k.h5ad')
|
2201
|
+
"""
|
2202
|
+
from lamindb import settings
|
2203
|
+
|
2204
|
+
if self._overwrite_versions and not self.is_latest:
|
2205
|
+
raise ValueError(INCONSISTENT_STATE_MSG)
|
2206
|
+
|
2207
|
+
filepath, cache_key = filepath_cache_key_from_artifact(
|
2208
|
+
self, using_key=settings._using_key
|
2209
|
+
)
|
2210
|
+
cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
|
2211
|
+
# only call if sync is successful
|
2212
|
+
_track_run_input(self, is_run_input)
|
2213
|
+
return cache_path
|
2214
|
+
|
2215
|
+
def delete(
|
2216
|
+
self,
|
2217
|
+
permanent: bool | None = None,
|
2218
|
+
storage: bool | None = None,
|
2219
|
+
using_key: str | None = None,
|
2220
|
+
) -> None:
|
2221
|
+
"""Trash or permanently delete.
|
2222
|
+
|
2223
|
+
A first call to `.delete()` puts an artifact into the trash (sets `_branch_code` to `-1`).
|
2224
|
+
A second call permanently deletes the artifact.
|
2225
|
+
If it is a folder artifact with multiple versions, deleting a non-latest version
|
2226
|
+
will not delete the underlying storage (the `storage` argument is ignored in that case).
|
2227
|
+
Deleting the latest version will delete all the versions for folder artifacts.
|
2228
|
+
|
2229
|
+
FAQ: :doc:`docs:faq/storage`
|
2230
|
+
|
2231
|
+
Args:
|
2232
|
+
permanent: Permanently delete the artifact (skip trash).
|
2233
|
+
storage: Indicate whether you want to delete the artifact in storage.
|
2234
|
+
|
2235
|
+
Examples:
|
2236
|
+
|
2237
|
+
For an `Artifact` object `artifact`, call:
|
2238
|
+
|
2239
|
+
>>> artifact = ln.Artifact.filter(key="some.csv").one()
|
2240
|
+
>>> artifact.delete() # delete a single file artifact
|
2241
|
+
|
2242
|
+
>>> artifact = ln.Artifact.filter(key="some.tiledbsoma". is_latest=False).first()
|
2243
|
+
>>> artifact.delete() # delete an old version; the data will not be deleted
|
2244
|
+
|
2245
|
+
>>> artifact = ln.Artifact.filter(key="some.tiledbsoma". is_latest=True).one()
|
2246
|
+
>>> artifact.delete() # delete all versions; the data will be deleted or you will be prompted for deletion
|
2247
|
+
"""
|
2248
|
+
# this first check means an invalid delete fails fast rather than cascading through
|
2249
|
+
# database and storage permission errors
|
2250
|
+
if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
|
2251
|
+
isettings = setup_settings.instance
|
2252
|
+
if self.storage.instance_uid != isettings.uid and (
|
2253
|
+
storage or storage is None
|
2254
|
+
):
|
2255
|
+
raise IntegrityError(
|
2256
|
+
"Cannot simply delete artifacts outside of this instance's managed storage locations."
|
2257
|
+
"\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
|
2258
|
+
f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
|
2259
|
+
f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
|
2260
|
+
)
|
2261
|
+
# by default, we only move artifacts into the trash (_branch_code = -1)
|
2262
|
+
trash__branch_code = -1
|
2263
|
+
if self._branch_code > trash__branch_code and not permanent:
|
2264
|
+
if storage is not None:
|
2265
|
+
logger.warning("moving artifact to trash, storage arg is ignored")
|
2266
|
+
# move to trash
|
2267
|
+
self._branch_code = trash__branch_code
|
2268
|
+
self.save()
|
2269
|
+
logger.important(
|
2270
|
+
f"moved artifact to trash (_branch_code = {trash__branch_code})"
|
2271
|
+
)
|
2272
|
+
return
|
2273
|
+
|
2274
|
+
# if the artifact is already in the trash
|
2275
|
+
# permanent delete skips the trash
|
2276
|
+
if permanent is None:
|
2277
|
+
# ask for confirmation of permanent delete
|
2278
|
+
response = input(
|
2279
|
+
"Artifact record is already in trash! Are you sure you want to permanently"
|
2280
|
+
" delete it? (y/n) You can't undo this action."
|
2281
|
+
)
|
2282
|
+
delete_record = response == "y"
|
2283
|
+
else:
|
2284
|
+
assert permanent # noqa: S101
|
2285
|
+
delete_record = True
|
2286
|
+
|
2287
|
+
if delete_record:
|
2288
|
+
# need to grab file path before deletion
|
2289
|
+
try:
|
2290
|
+
path, _ = filepath_from_artifact(self, using_key)
|
2291
|
+
except OSError:
|
2292
|
+
# we can still delete the record
|
2293
|
+
logger.warning("Could not get path")
|
2294
|
+
storage = False
|
2295
|
+
# only delete in storage if DB delete is successful
|
2296
|
+
# DB delete might error because of a foreign key constraint violated etc.
|
2297
|
+
if self._overwrite_versions and self.is_latest:
|
2298
|
+
# includes self
|
2299
|
+
for version in self.versions.all():
|
2300
|
+
_delete_skip_storage(version)
|
2301
|
+
else:
|
2302
|
+
self._delete_skip_storage()
|
2303
|
+
# by default do not delete storage if deleting only a previous version
|
2304
|
+
# and the underlying store is mutable
|
2305
|
+
if self._overwrite_versions and not self.is_latest:
|
2306
|
+
delete_in_storage = False
|
2307
|
+
if storage:
|
2308
|
+
logger.warning(
|
2309
|
+
"Storage argument is ignored; can't delete storage on an previous version"
|
2310
|
+
)
|
2311
|
+
elif self.key is None or self._key_is_virtual:
|
2312
|
+
# do not ask for confirmation also if storage is None
|
2313
|
+
delete_in_storage = storage is None or storage
|
2314
|
+
else:
|
2315
|
+
# for artifacts with non-virtual semantic storage keys (key is not None)
|
2316
|
+
# ask for extra-confirmation
|
2317
|
+
if storage is None:
|
2318
|
+
response = input(
|
2319
|
+
f"Are you sure to want to delete {path}? (y/n) You can't undo"
|
2320
|
+
" this action."
|
2321
|
+
)
|
2322
|
+
delete_in_storage = response == "y"
|
2323
|
+
else:
|
2324
|
+
delete_in_storage = storage
|
2325
|
+
if not delete_in_storage:
|
2326
|
+
logger.important(f"a file/folder remains here: {path}")
|
2327
|
+
# we don't yet have logic to bring back the deleted metadata record
|
2328
|
+
# in case storage deletion fails - this is important for ACID down the road
|
2329
|
+
if delete_in_storage:
|
2330
|
+
delete_msg = delete_storage(path, raise_file_not_found_error=False)
|
2331
|
+
if delete_msg != "did-not-delete":
|
2332
|
+
logger.success(f"deleted {colors.yellow(f'{path}')}")
|
2333
|
+
|
2334
|
+
def save(self, upload: bool | None = None, **kwargs) -> Artifact:
|
2335
|
+
"""Save to database & storage.
|
2336
|
+
|
2337
|
+
Args:
|
2338
|
+
upload: Trigger upload to cloud storage in instances with hybrid storage mode.
|
2339
|
+
|
2340
|
+
Examples:
|
2341
|
+
>>> artifact = ln.Artifact("./myfile.csv", description="myfile")
|
2342
|
+
>>> artifact.save()
|
2343
|
+
"""
|
2344
|
+
state_was_adding = self._state.adding
|
2345
|
+
print_progress = kwargs.pop("print_progress", True)
|
2346
|
+
store_kwargs = kwargs.pop(
|
2347
|
+
"store_kwargs", {}
|
2348
|
+
) # kwargs for .upload_from in the end
|
2349
|
+
access_token = kwargs.pop("access_token", None)
|
2350
|
+
local_path = None
|
2351
|
+
if upload and setup_settings.instance.keep_artifacts_local:
|
2352
|
+
# switch local storage location to cloud
|
2353
|
+
local_path = self.path
|
2354
|
+
self.storage_id = setup_settings.instance.storage.id
|
2355
|
+
self._local_filepath = local_path
|
2356
|
+
# switch to virtual storage key upon upload
|
2357
|
+
# the local filepath is already cached at that point
|
2358
|
+
self._key_is_virtual = True
|
2359
|
+
# ensure that the artifact is uploaded
|
2360
|
+
self._to_store = True
|
2361
|
+
|
2362
|
+
self._save_skip_storage(**kwargs)
|
2363
|
+
|
2364
|
+
from .save import check_and_attempt_clearing, check_and_attempt_upload
|
2365
|
+
|
2366
|
+
using_key = None
|
2367
|
+
if "using" in kwargs:
|
2368
|
+
using_key = kwargs["using"]
|
2369
|
+
exception_upload = check_and_attempt_upload(
|
2370
|
+
self,
|
2371
|
+
using_key,
|
2372
|
+
access_token=access_token,
|
2373
|
+
print_progress=print_progress,
|
2374
|
+
**store_kwargs,
|
2375
|
+
)
|
2376
|
+
if exception_upload is not None:
|
2377
|
+
# we do not want to raise file not found on cleanup if upload of a file failed
|
2378
|
+
# often it is ACID in the filesystem itself
|
2379
|
+
# for example, s3 won't have the failed file, so just skip the delete in this case
|
2380
|
+
raise_file_not_found_error = False
|
2381
|
+
self._delete_skip_storage()
|
2382
|
+
else:
|
2383
|
+
# this is the case when it is cleaned on .replace
|
2384
|
+
raise_file_not_found_error = True
|
2385
|
+
# this is triggered by an exception in check_and_attempt_upload or by replace.
|
2386
|
+
exception_clear = check_and_attempt_clearing(
|
2387
|
+
self,
|
2388
|
+
raise_file_not_found_error=raise_file_not_found_error,
|
2389
|
+
using_key=using_key,
|
2390
|
+
)
|
2391
|
+
if exception_upload is not None:
|
2392
|
+
raise RuntimeError(exception_upload)
|
2393
|
+
if exception_clear is not None:
|
2394
|
+
raise RuntimeError(exception_clear)
|
2395
|
+
# this is only for keep_artifacts_local
|
2396
|
+
if local_path is not None and not state_was_adding:
|
2397
|
+
# only move the local artifact to cache if it was not newly created
|
2398
|
+
local_path_cache = ln_setup.settings.cache_dir / local_path.name
|
2399
|
+
# don't use Path.rename here because of cross-device link error
|
2400
|
+
# https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
|
2401
|
+
shutil.move(
|
2402
|
+
local_path, # type: ignore
|
2403
|
+
local_path_cache,
|
2404
|
+
)
|
2405
|
+
logger.important(f"moved local artifact to cache: {local_path_cache}")
|
2406
|
+
return self
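# Hedged sketch of the `upload` flag handled above for instances configured with
# keep_artifacts_local (path and key hypothetical): a later save(upload=True)
# switches the artifact from the local storage location to the cloud one.
import lamindb as ln

artifact = ln.Artifact("./results.parquet", key="results/results.parquet").save()
artifact.save(upload=True)  # uploads and moves the local copy into the cache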
|
2407
|
+
|
2408
|
+
def restore(self) -> None:
|
2409
|
+
"""Restore from trash.
|
2410
|
+
|
2411
|
+
Examples:
|
2412
|
+
>>> artifact.restore()
|
2413
|
+
"""
|
2414
|
+
self._branch_code = 1
|
2415
|
+
self.save()
|
2416
|
+
|
2417
|
+
def describe(self) -> None:
|
2418
|
+
"""Describe relations of record.
|
2419
|
+
|
2420
|
+
Examples:
|
2421
|
+
>>> artifact.describe()
|
2422
|
+
"""
|
2423
|
+
return describe_artifact_collection(self)
|
2424
|
+
|
2425
|
+
def _populate_subsequent_runs(self, run: Run) -> None:
|
2426
|
+
_populate_subsequent_runs_(self, run)
|
2427
|
+
|
2428
|
+
|
2429
|
+
# can't really just call .cache in .load because of double tracking
|
2430
|
+
def _synchronize_cleanup_on_error(
|
2431
|
+
filepath: UPath, cache_key: str | None = None
|
2432
|
+
) -> UPath:
|
2433
|
+
try:
|
2434
|
+
cache_path = setup_settings.paths.cloud_to_local(
|
2435
|
+
filepath, cache_key=cache_key, print_progress=True
|
2436
|
+
)
|
2437
|
+
except Exception as e:
|
2438
|
+
if not isinstance(filepath, LocalPathClasses):
|
2439
|
+
cache_path = setup_settings.paths.cloud_to_local_no_update(
|
2440
|
+
filepath, cache_key=cache_key
|
2441
|
+
)
|
2442
|
+
if cache_path.is_dir():
|
2443
|
+
shutil.rmtree(cache_path)
|
2444
|
+
else:
|
2445
|
+
cache_path.unlink(missing_ok=True)
|
2446
|
+
raise e
|
2447
|
+
return cache_path
|
2448
|
+
|
2449
|
+
|
2450
|
+
def _delete_skip_storage(artifact, *args, **kwargs) -> None:
|
2451
|
+
super(Artifact, artifact).delete(*args, **kwargs)
|
2452
|
+
|
2453
|
+
|
2454
|
+
def _save_skip_storage(artifact, **kwargs) -> None:
|
2455
|
+
save_staged_feature_sets(artifact)
|
2456
|
+
super(Artifact, artifact).save(**kwargs)
|
2457
|
+
save_schema_links(artifact)
|
2458
|
+
|
2459
|
+
|
2460
|
+
class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
|
2461
|
+
id: int = models.BigAutoField(primary_key=True)
|
2462
|
+
artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
|
2463
|
+
# we follow the lower() case convention rather than snake case for link models
|
2464
|
+
featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="+")
|
2465
|
+
|
2466
|
+
class Meta:
|
2467
|
+
unique_together = ("artifact", "featurevalue")
|
2468
|
+
|
2469
|
+
|
2470
|
+
class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
|
2471
|
+
id: int = models.BigAutoField(primary_key=True)
|
2472
|
+
artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
|
2473
|
+
# we follow the lower() case convention rather than snake case for link models
|
2474
|
+
paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
|
2475
|
+
|
2476
|
+
class Meta:
|
2477
|
+
unique_together = ("artifact", "paramvalue")
|
2478
|
+
|
2479
|
+
|
2480
|
+
def _track_run_input(
|
2481
|
+
data: Artifact
|
2482
|
+
| Iterable[Artifact], # can also be Collection | Iterable[Collection]
|
2483
|
+
is_run_input: bool | Run | None = None,
|
2484
|
+
run: Run | None = None,
|
2485
|
+
):
|
2486
|
+
from lamindb import settings
|
2487
|
+
|
2488
|
+
from .._tracked import get_current_tracked_run
|
2489
|
+
from ..core._context import context
|
2490
|
+
from .collection import Collection
|
2491
|
+
|
2492
|
+
if isinstance(is_run_input, Run):
|
2493
|
+
run = is_run_input
|
2494
|
+
is_run_input = True
|
2495
|
+
elif run is None:
|
2496
|
+
run = get_current_tracked_run()
|
2497
|
+
if run is None:
|
2498
|
+
run = context.run
|
2499
|
+
# consider that data is an iterable of Data
|
2500
|
+
data_iter: Iterable[Artifact] | Iterable[Collection] = (
|
2501
|
+
[data] if isinstance(data, (Artifact, Collection)) else data
|
2502
|
+
)
|
2503
|
+
track_run_input = False
|
2504
|
+
input_data = []
|
2505
|
+
if run is not None:
|
2506
|
+
# avoid cycles: data can't be both input and output
|
2507
|
+
def is_valid_input(data: Artifact | Collection):
|
2508
|
+
is_valid = False
|
2509
|
+
if data._state.db == "default":
|
2510
|
+
# things are OK if the record is on the default db
|
2511
|
+
is_valid = True
|
2512
|
+
elif data._state.db is None:
|
2513
|
+
# if a record is not yet saved, it can't be an input
|
2514
|
+
# we silently ignore because what likely happens is that
|
2515
|
+
# the user works with an object that's about to be saved
|
2516
|
+
# in the current Python session
|
2517
|
+
is_valid = False
|
2518
|
+
else:
|
2519
|
+
# record is on another db
|
2520
|
+
# we have to save the record into the current db with
|
2521
|
+
# the run being attached to a transfer transform
|
2522
|
+
logger.important(
|
2523
|
+
f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}') as input"
|
2524
|
+
)
|
2525
|
+
data.save()
|
2526
|
+
is_valid = True
|
2527
|
+
return (
|
2528
|
+
data.run_id != run.id
|
2529
|
+
and not data._state.adding # this seems duplicated with data._state.db is None
|
2530
|
+
and is_valid
|
2531
|
+
)
|
2532
|
+
|
2533
|
+
input_data = [data for data in data_iter if is_valid_input(data)]
|
2534
|
+
input_data_ids = [data.id for data in input_data]
|
2535
|
+
if input_data:
|
2536
|
+
data_class_name = input_data[0].__class__.__name__.lower()
|
2537
|
+
# let us first look at the case in which the user does not
|
2538
|
+
# provide a boolean value for `is_run_input`
|
2539
|
+
# hence, we need to determine whether we actually want to
|
2540
|
+
# track a run or not
|
2541
|
+
if is_run_input is None:
|
2542
|
+
# we don't have a run record
|
2543
|
+
if run is None:
|
2544
|
+
if settings.track_run_inputs:
|
2545
|
+
# here we check that this is not a read-only connection
|
2546
|
+
# normally for our connection strings the read-only role name has _read in it
|
2547
|
+
# not absolutely safe but the worst case is that the warning is not shown
|
2548
|
+
instance = setup_settings.instance
|
2549
|
+
if instance.dialect != "postgresql" or "_read" not in instance.db:
|
2550
|
+
logger.warning(WARNING_NO_INPUT)
|
2551
|
+
# assume we have a run record
|
2552
|
+
else:
|
2553
|
+
# assume there is non-cyclic candidate input data
|
2554
|
+
if input_data:
|
2555
|
+
if settings.track_run_inputs:
|
2556
|
+
transform_note = ""
|
2557
|
+
if len(input_data) == 1:
|
2558
|
+
if input_data[0].transform is not None:
|
2559
|
+
transform_note = (
|
2560
|
+
", adding parent transform"
|
2561
|
+
f" {input_data[0].transform.id}"
|
2562
|
+
)
|
2563
|
+
logger.info(
|
2564
|
+
f"adding {data_class_name} ids {input_data_ids} as inputs for run"
|
2565
|
+
f" {run.id}{transform_note}"
|
2566
|
+
)
|
2567
|
+
track_run_input = True
|
2568
|
+
else:
|
2569
|
+
logger.hint(
|
2570
|
+
"track these data as a run input by passing `is_run_input=True`"
|
2571
|
+
)
|
2572
|
+
else:
|
2573
|
+
track_run_input = is_run_input
|
2574
|
+
if track_run_input:
|
2575
|
+
if run is None:
|
2576
|
+
raise ValueError("No run context set. Call `ln.track()`.")
|
2577
|
+
# avoid adding the same run twice
|
2578
|
+
run.save()
|
2579
|
+
if data_class_name == "artifact":
|
2580
|
+
LinkORM = run.input_artifacts.through
|
2581
|
+
links = [
|
2582
|
+
LinkORM(run_id=run.id, artifact_id=data_id)
|
2583
|
+
for data_id in input_data_ids
|
2584
|
+
]
|
2585
|
+
else:
|
2586
|
+
LinkORM = run.input_collections.through
|
2587
|
+
links = [
|
2588
|
+
LinkORM(run_id=run.id, collection_id=data_id)
|
2589
|
+
for data_id in input_data_ids
|
2590
|
+
]
|
2591
|
+
LinkORM.objects.bulk_create(links, ignore_conflicts=True)
|
2592
|
+
# generalize below for more than one data batch
|
2593
|
+
if len(input_data) == 1:
|
2594
|
+
if input_data[0].transform is not None:
|
2595
|
+
run.transform.predecessors.add(input_data[0].transform)
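# Hedged sketch of how the helper above is typically exercised from user code
# (the key is hypothetical): with an active run context, loading an artifact
# links it as an input of that run; without one, only a hint/warning is logged.
import lamindb as ln

ln.track()  # establishes the current run context
artifact = ln.Artifact.get(key="examples/pbmc68k.h5ad")
adata = artifact.load()                   # linked as an input of the current run
adata = artifact.load(is_run_input=True)  # or force the link explicitly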
|
2596
|
+
|
2597
|
+
|
2598
|
+
# privates currently dealt with separately
|
2599
|
+
# mypy: ignore-errors
|
2600
|
+
Artifact._delete_skip_storage = _delete_skip_storage
|
2601
|
+
Artifact._save_skip_storage = _save_skip_storage
|
2602
|
+
Artifact.view_lineage = view_lineage
|