lamindb 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

Files changed (66)
  1. lamindb/__init__.py +30 -25
  2. lamindb/_tracked.py +1 -1
  3. lamindb/_view.py +2 -3
  4. lamindb/base/__init__.py +1 -1
  5. lamindb/base/ids.py +1 -10
  6. lamindb/core/__init__.py +7 -65
  7. lamindb/core/_compat.py +60 -0
  8. lamindb/core/_context.py +43 -20
  9. lamindb/core/_settings.py +6 -6
  10. lamindb/core/_sync_git.py +1 -1
  11. lamindb/core/loaders.py +30 -19
  12. lamindb/core/storage/_backed_access.py +4 -2
  13. lamindb/core/storage/_tiledbsoma.py +8 -6
  14. lamindb/core/storage/_zarr.py +104 -25
  15. lamindb/core/storage/objects.py +63 -28
  16. lamindb/core/storage/paths.py +4 -1
  17. lamindb/core/types.py +10 -0
  18. lamindb/curators/__init__.py +100 -85
  19. lamindb/errors.py +1 -1
  20. lamindb/integrations/_vitessce.py +4 -4
  21. lamindb/migrations/0089_subsequent_runs.py +159 -0
  22. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  23. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  24. lamindb/models/__init__.py +79 -0
  25. lamindb/{core → models}/_describe.py +3 -3
  26. lamindb/{core → models}/_django.py +8 -5
  27. lamindb/{core → models}/_feature_manager.py +103 -87
  28. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  29. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  30. lamindb/{core → models}/_label_manager.py +10 -17
  31. lamindb/{core/relations.py → models/_relations.py} +8 -1
  32. lamindb/models/artifact.py +2602 -0
  33. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  34. lamindb/models/collection.py +683 -0
  35. lamindb/models/core.py +135 -0
  36. lamindb/models/feature.py +643 -0
  37. lamindb/models/flextable.py +163 -0
  38. lamindb/{_parents.py → models/has_parents.py} +55 -49
  39. lamindb/models/project.py +384 -0
  40. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  41. lamindb/{_query_set.py → models/query_set.py} +40 -26
  42. lamindb/models/record.py +1762 -0
  43. lamindb/models/run.py +563 -0
  44. lamindb/{_save.py → models/save.py} +9 -7
  45. lamindb/models/schema.py +732 -0
  46. lamindb/models/transform.py +360 -0
  47. lamindb/models/ulabel.py +249 -0
  48. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
  49. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/RECORD +51 -51
  50. lamindb/_artifact.py +0 -1379
  51. lamindb/_collection.py +0 -440
  52. lamindb/_feature.py +0 -316
  53. lamindb/_is_versioned.py +0 -40
  54. lamindb/_record.py +0 -1064
  55. lamindb/_run.py +0 -60
  56. lamindb/_schema.py +0 -347
  57. lamindb/_storage.py +0 -15
  58. lamindb/_transform.py +0 -170
  59. lamindb/_ulabel.py +0 -56
  60. lamindb/_utils.py +0 -9
  61. lamindb/base/validation.py +0 -63
  62. lamindb/core/_data.py +0 -491
  63. lamindb/core/fields.py +0 -12
  64. lamindb/models.py +0 -4475
  65. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
  66. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
lamindb/models/artifact.py
@@ -0,0 +1,2602 @@
+ # ruff: noqa: TC004
+ from __future__ import annotations
+
+ import os
+ import shutil
+ from collections import defaultdict
+ from pathlib import Path, PurePath, PurePosixPath
+ from typing import TYPE_CHECKING, Any, Union, overload
+
+ import fsspec
+ import lamindb_setup as ln_setup
+ import pandas as pd
+ from anndata import AnnData
+ from django.db import connections, models
+ from django.db.models import CASCADE, PROTECT, Q
+ from lamin_utils import colors, logger
+ from lamindb_setup import settings as setup_settings
+ from lamindb_setup._init_instance import register_storage_in_instance
+ from lamindb_setup.core._settings_storage import init_storage
+ from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file
+ from lamindb_setup.core.types import UPathStr
+ from lamindb_setup.core.upath import (
+     create_path,
+     extract_suffix_from_path,
+     get_stat_dir_cloud,
+     get_stat_file_cloud,
+ )
+
+ from lamindb.base import deprecated
+ from lamindb.base.fields import (
+     BigIntegerField,
+     BooleanField,
+     CharField,
+     ForeignKey,
+ )
+ from lamindb.errors import FieldValidationError
+ from lamindb.models.query_set import QuerySet
+
+ from ..base.users import current_user_id
+ from ..core._compat import is_package_installed
+ from ..core.loaders import load_to_memory
+ from ..core.storage import (
+     LocalPathClasses,
+     UPath,
+     delete_storage,
+     infer_suffix,
+     write_to_disk,
+ )
+ from ..core.storage._anndata_accessor import _anndata_n_observations
+ from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES
+ from ..core.storage._tiledbsoma import _soma_n_observations
+ from ..core.storage.paths import (
+     AUTO_KEY_PREFIX,
+     auto_storage_key_from_artifact,
+     auto_storage_key_from_artifact_uid,
+     check_path_is_child_of_root,
+     filepath_cache_key_from_artifact,
+     filepath_from_artifact,
+ )
+ from ..errors import IntegrityError, InvalidArgument, ValidationError
+ from ..models._is_versioned import (
+     create_uid,
+     message_update_key_in_version_family,
+ )
+ from ._django import get_artifact_with_related
+ from ._feature_manager import (
+     FeatureManager,
+     ParamManager,
+     ParamManagerArtifact,
+     add_label_feature_links,
+     get_label_links,
+ )
+ from ._is_versioned import IsVersioned
+ from ._relations import (
+     dict_module_name_to_model_name,
+     dict_related_model_to_related_name,
+ )
+ from .core import Storage
+ from .feature import Feature, FeatureValue
+ from .has_parents import view_lineage
+ from .record import (
+     BasicRecord,
+     LinkORM,
+     Record,
+     _get_record_kwargs,
+     record_repr,
+ )
+ from .run import ParamValue, Run, TracksRun, TracksUpdates, User
+ from .schema import Schema
+ from .ulabel import ULabel
+
+ WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run"
+
+ WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
+
+ try:
+     from ..core.storage._zarr import identify_zarr_type
+ except ImportError:
+
+     def identify_zarr_type(storepath):  # type: ignore
+         raise ImportError("Please install zarr: pip install zarr<=2.18.4")
+
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterable
+
+     from mudata import MuData  # noqa: TC004
+     from pyarrow.dataset import Dataset as PyArrowDataset
+     from spatialdata import SpatialData  # noqa: TC004
+     from tiledbsoma import Collection as SOMACollection
+     from tiledbsoma import Experiment as SOMAExperiment
+     from tiledbsoma import Measurement as SOMAMeasurement
+
+     from lamindb.base.types import StrField
+     from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
+     from lamindb.core.types import ScverseDataStructures
+
+     from ..base.types import (
+         ArtifactKind,
+     )
+     from ._label_manager import LabelManager
+     from .collection import Collection
+     from .project import Project, Reference
+     from .transform import Transform
+
+
+ INCONSISTENT_STATE_MSG = (
+     "Trying to read a folder artifact from an outdated version, "
+     "this can result in an inconsistent state.\n"
+     "Read from the latest version: artifact.versions.filter(is_latest=True).one()"
+ )
+
+
+ def process_pathlike(
+     filepath: UPath,
+     default_storage: Storage,
+     using_key: str | None,
+     skip_existence_check: bool = False,
+ ) -> tuple[Storage, bool]:
+     """Determines the appropriate storage for a given path and whether to use an existing storage key."""
+     if not skip_existence_check:
+         try:  # check if file exists
+             if not filepath.exists():
+                 raise FileNotFoundError(filepath)
+         except PermissionError:
+             pass
+     if check_path_is_child_of_root(filepath, default_storage.root):
+         use_existing_storage_key = True
+         return default_storage, use_existing_storage_key
+     else:
+         # check whether the path is part of one of the existing
+         # already-registered storage locations
+         result = False
+         # within the hub, we don't want to perform check_path_in_existing_storage
+         if using_key is None:
+             result = check_path_in_existing_storage(filepath, using_key)
+         if isinstance(result, Storage):
+             use_existing_storage_key = True
+             return result, use_existing_storage_key
+         else:
+             # if the path is in the cloud, we have a good candidate
+             # for the storage root: the bucket
+             if not isinstance(filepath, LocalPathClasses):
+                 # for a cloud path, new_root is always the bucket name
+                 if filepath.protocol == "hf":
+                     hf_path = filepath.fs.resolve_path(filepath.as_posix())
+                     hf_path.path_in_repo = ""
+                     new_root = "hf://" + hf_path.unresolve()
+                 else:
+                     if filepath.protocol == "s3":
+                         # check that endpoint_url didn't propagate here
+                         # as a part of the path string
+                         assert "?" not in filepath.path  # noqa: S101
+                     new_root = list(filepath.parents)[-1]
+                 # do not register remote storage locations on hub if the current instance
+                 # is not managed on the hub
+                 storage_settings, _ = init_storage(
+                     new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
+                 )
+                 storage_record = register_storage_in_instance(storage_settings)
+                 use_existing_storage_key = True
+                 return storage_record, use_existing_storage_key
+             # if the filepath is local
+             else:
+                 use_existing_storage_key = False
+                 # if the default storage is local we'll throw an error if the user
+                 # doesn't provide a key
+                 if default_storage.type == "local":
+                     return default_storage, use_existing_storage_key
+                 # if the default storage is in the cloud (the file is going to
+                 # be uploaded upon saving it), we treat the filepath as a cache
+                 else:
+                     return default_storage, use_existing_storage_key
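
In words, `process_pathlike` makes a three-way decision. A minimal sketch of the
expected outcomes, assuming a default storage root of `s3://default-bucket` and no
other registered locations (all paths hypothetical):

    process_pathlike(UPath("s3://default-bucket/data/x.csv"), default_storage, None)
    # -> (default_storage, True): path already lives under the default root
    process_pathlike(UPath("s3://other-bucket/x.csv"), default_storage, None)
    # -> (new Storage record for "s3://other-bucket", True): the bucket gets registered
    process_pathlike(UPath("/tmp/x.csv"), default_storage, None)
    # -> (default_storage, False): local file is copied/uploaded upon save()
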
+
+
+ def process_data(
+     provisional_uid: str,
+     data: UPathStr | pd.DataFrame | AnnData,
+     format: str | None,
+     key: str | None,
+     default_storage: Storage,
+     using_key: str | None,
+     skip_existence_check: bool = False,
+     is_replace: bool = False,
+ ) -> tuple[Any, Path | UPath, str, Storage, bool]:
+     """Serialize a data object that's provided as file or in memory.
+
+     If not overwritten, data gets stored in default storage.
+     """
+     supported_data_types = [pd.DataFrame, AnnData]
+     if is_package_installed("mudata"):
+         from mudata import MuData
+
+         supported_data_types.append(MuData)
+     if is_package_installed("spatialdata"):
+         from spatialdata import SpatialData
+
+         supported_data_types.append(SpatialData)
+     supported_data_types = tuple(supported_data_types)  # type: ignore
+
+     if key is not None:
+         key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
+         # use suffix as the (adata) format if the format is not provided
+         if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
+             format = key_suffix[1:]
+     else:
+         key_suffix = None
+     if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
+         access_token = (
+             default_storage._access_token
+             if hasattr(default_storage, "_access_token")
+             else None
+         )
+         path = create_path(data, access_token=access_token)
+         # we don't resolve http links because they can resolve into a different domain
+         # for example into a temporary url
+         if path.protocol not in {"http", "https"}:
+             path = path.resolve()
+         storage, use_existing_storage_key = process_pathlike(
+             path,
+             default_storage=default_storage,
+             using_key=using_key,
+             skip_existence_check=skip_existence_check,
+         )
+         suffix = extract_suffix_from_path(path)
+         memory_rep = None
+     elif isinstance(data, supported_data_types):
+         storage = default_storage
+         memory_rep = data
+         suffix = infer_suffix(data, format)
+     else:
+         raise NotImplementedError(
+             f"Do not know how to create an artifact object from {data}, pass a path instead!"
+         )
+     if key_suffix is not None and key_suffix != suffix and not is_replace:
+         # consciously omitting a trailing period
+         if isinstance(data, (str, Path, UPath)):
+             message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
+         else:
+             message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
+         raise InvalidArgument(message)
+     # in case we have an in-memory representation, we need to write it to disk
+     from lamindb import settings
+
+     if isinstance(data, supported_data_types):
+         path = settings.cache_dir / f"{provisional_uid}{suffix}"
+         write_to_disk(data, path)
+         use_existing_storage_key = False
+     return memory_rep, path, suffix, storage, use_existing_storage_key
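
The suffix-consistency check above means a `key` has to agree with the data's
serialization format. A hedged example of what would raise (key hypothetical; a
`DataFrame` serializes to `.parquet` by default):

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"a": [1]})
    ln.Artifact.from_df(df, key="tables/my_table.csv")
    # InvalidArgument: The suffix '.csv' of the provided key is inconsistent,
    # it should be '.parquet'
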
+
+
+ def get_stat_or_artifact(
+     path: UPath,
+     key: str | None = None,
+     check_hash: bool = True,
+     is_replace: bool = False,
+     instance: str | None = None,
+ ) -> Union[tuple[int, str | None, str | None, int | None, Artifact | None], Artifact]:
+     """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
+     n_files = None
+     from lamindb import settings
+
+     if settings.creation.artifact_skip_size_hash:
+         return None, None, None, n_files, None
+     stat = path.stat()  # one network request
+     if not isinstance(path, LocalPathClasses):
+         size, hash, hash_type = None, None, None
+         if stat is not None:
+             # convert UPathStatResult to fsspec info dict
+             stat = stat.as_info()
+             if (store_type := stat["type"]) == "file":
+                 size, hash, hash_type = get_stat_file_cloud(stat)
+             elif store_type == "directory":
+                 size, hash, hash_type, n_files = get_stat_dir_cloud(path)
+         if hash is None:
+             logger.warning(f"did not add hash for {path}")
+             return size, hash, hash_type, n_files, None
+     else:
+         if path.is_dir():
+             size, hash, hash_type, n_files = hash_dir(path)
+         else:
+             hash, hash_type = hash_file(path)
+             size = stat.st_size
+     if not check_hash:
+         return size, hash, hash_type, n_files, None
+     previous_artifact_version = None
+     if key is None or is_replace:
+         result = Artifact.objects.using(instance).filter(hash=hash).all()
+         artifact_with_same_hash_exists = len(result) > 0
+     else:
+         storage_id = settings.storage.id
+         result = (
+             Artifact.objects.using(instance)
+             .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
+             .order_by("-created_at")
+             .all()
+         )
+         artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
+         if not artifact_with_same_hash_exists and len(result) > 0:
+             logger.important(
+                 f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
+             )
+             previous_artifact_version = result[0]
+     if artifact_with_same_hash_exists:
+         message = "returning existing artifact with same hash"
+         if result[0]._branch_code == -1:
+             result[0].restore()
+             message = "restored artifact with same hash from trash"
+         logger.important(
+             f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
+         )
+         return result[0]
+     else:
+         return size, hash, hash_type, n_files, previous_artifact_version
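
The practical effect of the hash lookup: constructing an `Artifact` from content
that is byte-identical to an already-registered artifact returns the existing
record instead of creating a duplicate. A sketch, assuming the two local files
below hash to the same value (paths hypothetical):

    a1 = ln.Artifact("./data.parquet", key="datasets/data.parquet").save()
    a2 = ln.Artifact("./copy_of_data.parquet", description="same bytes")
    # logs: "returning existing artifact with same hash: ..."
    assert a2.id == a1.id  # same underlying record
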
+
+
+ def check_path_in_existing_storage(
+     path: Path | UPath, using_key: str | None = None
+ ) -> Storage | bool:
+     for storage in Storage.objects.using(using_key).filter().all():
+         # if path is part of storage, return it
+         if check_path_is_child_of_root(path, root=storage.root):
+             return storage
+     return False
+
+
+ def get_relative_path_to_directory(
+     path: PurePath | Path | UPath, directory: PurePath | Path | UPath
+ ) -> PurePath | Path:
+     if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
+         # UPath.relative_to() is not behaving as it should (2023-04-07)
+         # need to lstrip otherwise inconsistent behavior across trailing slashes
+         # see test_artifact.py: test_get_relative_path_to_directory
+         relpath = PurePath(
+             path.as_posix().replace(directory.as_posix(), "").lstrip("/")
+         )
+     elif isinstance(directory, Path):
+         relpath = path.resolve().relative_to(directory.resolve())  # type: ignore
+     elif isinstance(directory, PurePath):
+         relpath = path.relative_to(directory)
+     else:
+         raise TypeError("Directory not of type Path or UPath")
+     return relpath
+
+
+ def get_artifact_kwargs_from_data(
+     *,
+     data: Path | UPath | str | pd.DataFrame | ScverseDataStructures,
+     key: str | None,
+     run: Run | None,
+     format: str | None,
+     provisional_uid: str,
+     version: str | None,
+     default_storage: Storage,
+     using_key: str | None = None,
+     is_replace: bool = False,
+     skip_check_exists: bool = False,
+ ):
+     from lamindb import settings
+
+     run = get_run(run)
+     memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
+         provisional_uid,
+         data,
+         format,
+         key,
+         default_storage,
+         using_key,
+         skip_check_exists,
+         is_replace=is_replace,
+     )
+     stat_or_artifact = get_stat_or_artifact(
+         path=path,
+         key=key,
+         instance=using_key,
+         is_replace=is_replace,
+     )
+     if isinstance(stat_or_artifact, Artifact):
+         existing_artifact = stat_or_artifact
+         if run is not None:
+             existing_artifact._populate_subsequent_runs(run)
+         return existing_artifact, None
+     else:
+         size, hash, hash_type, n_files, revises = stat_or_artifact
+
+     if revises is not None:  # update provisional_uid
+         provisional_uid, revises = create_uid(revises=revises, version=version)
+         if settings.cache_dir in path.parents:
+             path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
+
+     check_path_in_storage = False
+     if use_existing_storage_key:
+         inferred_key = get_relative_path_to_directory(
+             path=path, directory=UPath(storage.root)
+         ).as_posix()
+         if key is None:
+             key = inferred_key
+         else:
+             if not key == inferred_key:
+                 raise InvalidArgument(
+                     f"The path '{data}' is already in registered storage"
+                     f" '{storage.root}' with key '{inferred_key}'\nYou passed"
+                     f" conflicting key '{key}': please move the file before"
+                     " registering it."
+                 )
+         check_path_in_storage = True
+     else:
+         storage = default_storage
+
+     log_storage_hint(
+         check_path_in_storage=check_path_in_storage,
+         storage=storage,
+         key=key,
+         uid=provisional_uid,
+         suffix=suffix,
+         is_dir=n_files is not None,
+     )
+
+     # do we use a virtual or an actual storage key?
+     key_is_virtual = settings.creation._artifact_use_virtual_keys
+
+     # if the file is already in storage, independent of the default
+     # we use an actual storage key
+     if check_path_in_storage:
+         key_is_virtual = False
+
+     kwargs = {
+         "uid": provisional_uid,
+         "suffix": suffix,
+         "hash": hash,
+         "_hash_type": hash_type,
+         "key": key,
+         "size": size,
+         "storage_id": storage.id,
+         # passing both the id and the object
+         # to make them both available immediately
+         # after object creation
+         "n_files": n_files,
+         "_overwrite_versions": n_files is not None,  # True for folder, False for file
+         "n_observations": None,  # to implement
+         "run_id": run.id if run is not None else None,
+         "run": run,
+         "_key_is_virtual": key_is_virtual,
+         "revises": revises,
+     }
+     if not isinstance(path, LocalPathClasses):
+         local_filepath = None
+         cloud_filepath = path
+     else:
+         local_filepath = path
+         cloud_filepath = None
+     privates = {
+         "local_filepath": local_filepath,
+         "cloud_filepath": cloud_filepath,
+         "memory_rep": memory_rep,
+         "check_path_in_storage": check_path_in_storage,
+     }
+     return kwargs, privates
+
+
+ def log_storage_hint(
+     *,
+     check_path_in_storage: bool,
+     storage: Storage | None,
+     key: str | None,
+     uid: str,
+     suffix: str,
+     is_dir: bool,
+ ) -> None:
+     hint = ""
+     if check_path_in_storage:
+         display_root = storage.root  # type: ignore
+         # check whether path is local
+         if fsspec.utils.get_protocol(storage.root) == "file":  # type: ignore
+             # if it's a local path, check whether it's in the current working directory
+             root_path = Path(storage.root)  # type: ignore
+             if check_path_is_child_of_root(root_path, Path.cwd()):
+                 # only display the relative path, not the fully resolved path
+                 display_root = root_path.relative_to(Path.cwd())  # type: ignore
+         hint += f"path in storage '{display_root}'"  # type: ignore
+     else:
+         hint += "path content will be copied to default storage upon `save()`"
+     if key is None:
+         storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
+         hint += f" with key `None` ('{storage_key}')"
+     else:
+         hint += f" with key '{key}'"
+     logger.hint(hint)
+
+
+ def data_is_anndata(data: AnnData | UPathStr) -> bool:
+     if isinstance(data, AnnData):
+         return True
+     if isinstance(data, (str, Path, UPath)):
+         data_path = UPath(data)
+         if ".h5ad" in data_path.suffixes:  # ".h5ad.gz" is a valid suffix
+             return True
+         elif data_path.suffix == ".zarr":
+             # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
+             # TODO: the suffix based check should likely be moved to identify_zarr_type
+             if ".anndata" in data_path.suffixes:
+                 return True
+             # check only for local, expensive for cloud
+             if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
+                 return identify_zarr_type(data_path) == "anndata"
+             else:
+                 logger.warning("We do not check if cloud zarr is AnnData or not")
+                 return False
+     return False
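
A sketch of how the suffix-based detection behaves (paths hypothetical; the
suffix checks themselves do no I/O):

    data_is_anndata("pbmc.h5ad")            # True: ".h5ad" suffix
    data_is_anndata("pbmc.h5ad.gz")         # True: ".h5ad" among the suffixes
    data_is_anndata("store.anndata.zarr")   # True: recognized without opening the store
    data_is_anndata("local_store.zarr")     # falls back to identify_zarr_type()
    data_is_anndata("s3://bucket/x.zarr")   # False plus a warning: cloud zarr isn't inspected
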
+
+
+ def data_is_mudata(data: MuData | UPathStr) -> bool:
+     if is_package_installed("mudata"):
+         from mudata import MuData
+
+         if isinstance(data, MuData):
+             return True
+     if isinstance(data, (str, Path)):
+         return UPath(data).suffix == ".h5mu"
+     return False
+
+
+ def data_is_spatialdata(data: SpatialData | UPathStr) -> bool:
+     if is_package_installed("spatialdata"):
+         from spatialdata import SpatialData
+
+         if isinstance(data, SpatialData):
+             return True
+     if isinstance(data, (str, Path)):
+         if UPath(data).suffix == ".zarr":
+             # TODO: inconsistent with anndata, where we run the storage
+             # check only for local, expensive for cloud
+             return identify_zarr_type(data, check=False) == "spatialdata"
+     return False
+
+
+ def _check_otype_artifact(
+     data: UPathStr | pd.DataFrame | ScverseDataStructures,
+     otype: str | None = None,
+ ) -> str:
+     if otype is None:
+         if isinstance(data, pd.DataFrame):
+             logger.warning("data is a DataFrame, please use .from_df()")
+             otype = "DataFrame"
+             return otype
+
+         data_is_path = isinstance(data, (str, Path))
+         if data_is_anndata(data):
+             if not data_is_path:
+                 logger.warning("data is an AnnData, please use .from_anndata()")
+             otype = "AnnData"
+         elif data_is_mudata(data):
+             if not data_is_path:
+                 logger.warning("data is a MuData, please use .from_mudata()")
+             otype = "MuData"
+         elif data_is_spatialdata(data):
+             if not data_is_path:
+                 logger.warning("data is a SpatialData, please use .from_spatialdata()")
+             otype = "SpatialData"
+         elif not data_is_path:  # UPath is a subclass of Path
+             raise TypeError("data has to be a string, Path, or UPath")
+     return otype
+
+
+ def _populate_subsequent_runs_(record: Union[Artifact, Collection], run: Run):
+     if record.run is None:
+         record.run = run
+     elif record.run != run:
+         record._subsequent_runs.add(run)
+
+
+ # also see current_run() in core._data
+ def get_run(run: Run | None) -> Run | None:
+     from lamindb import settings
+
+     from .._tracked import get_current_tracked_run
+     from ..core._context import context
+
+     if run is None:
+         run = get_current_tracked_run()
+         if run is None:
+             run = context.run
+         if run is None and not settings.creation.artifact_silence_missing_run_warning:
+             # here we check that this is not a read-only connection
+             # normally for our connection strings the read-only role name has _read in it
+             # not absolutely safe but the worst case is that the warning is not shown
+             instance = setup_settings.instance
+             if instance.dialect != "postgresql" or "_read" not in instance.db:
+                 logger.warning(WARNING_RUN_TRANSFORM)
+     # suppress run by passing False
+     elif not run:
+         run = None
+     return run
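
The resolution order, as a hedged sketch (`my_run` hypothetical):

    get_run(my_run)  # an explicit Run is returned unchanged
    get_run(None)    # falls back to an active @ln.tracked() run, then to ln.track()'s
                     # context.run; if neither exists, warns "no run & transform got linked, ..."
    get_run(False)   # a falsy non-None value suppresses run tracking entirely
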
+
+
+ def save_staged_feature_sets(self: Artifact) -> None:
+     if hasattr(self, "_staged_feature_sets"):
+         from lamindb.models._feature_manager import get_schema_by_slot_
+
+         existing_staged_feature_sets = get_schema_by_slot_(self)
+         saved_staged_feature_sets = {}
+         for key, schema in self._staged_feature_sets.items():
+             if isinstance(schema, Schema) and schema._state.adding:
+                 schema.save()
+                 saved_staged_feature_sets[key] = schema
+             if key in existing_staged_feature_sets:
+                 # remove existing feature set on the same slot
+                 self.feature_sets.remove(existing_staged_feature_sets[key])
+         if len(saved_staged_feature_sets) > 0:
+             s = "s" if len(saved_staged_feature_sets) > 1 else ""
+             display_schema_keys = ",".join(
+                 f"'{key}'" for key in saved_staged_feature_sets.keys()
+             )
+             logger.save(
+                 f"saved {len(saved_staged_feature_sets)} feature set{s} for slot{s}:"
+                 f" {display_schema_keys}"
+             )
+
+
+ def save_schema_links(self: Artifact) -> None:
+     from lamindb.models.save import bulk_create
+
+     if hasattr(self, "_staged_feature_sets"):
+         links = []
+         for slot, schema in self._staged_feature_sets.items():
+             kwargs = {
+                 "artifact_id": self.id,
+                 "schema_id": schema.id,
+                 "slot": slot,
+             }
+             links.append(Artifact.feature_sets.through(**kwargs))
+         bulk_create(links, ignore_conflicts=True)
+
+
+ # can restore later if needed
+ # def format_provenance(self, fk_data, print_types):
+ #     type_str = lambda attr: (
+ #         f": {get_related_model(self.__class__, attr).__name__}" if print_types else ""
+ #     )
+
+ #     return "".join(
+ #         [
+ #             f" .{field_name}{type_str(field_name)} = {format_field_value(value.get('name'))}\n"
+ #             for field_name, value in fk_data.items()
+ #             if value.get("name")
+ #         ]
+ #     )
+
+ # can restore later if needed
+ # def format_input_of_runs(self, print_types):
+ #     if self.id is not None and self.input_of_runs.exists():
+ #         values = [format_field_value(i.started_at) for i in self.input_of_runs.all()]
+ #         type_str = ": Run" if print_types else ""  # type: ignore
+ #         return f" .input_of_runs{type_str} = {', '.join(values)}\n"
+ #     return ""
+
+
+ def _describe_postgres(self):  # for Artifact & Collection
+     from ._describe import describe_general
+     from ._feature_manager import describe_features
+
+     model_name = self.__class__.__name__
+     msg = f"{colors.green(model_name)}{record_repr(self, include_foreign_keys=False).lstrip(model_name)}\n"
+     if self._state.db is not None and self._state.db != "default":
+         msg += f" {colors.italic('Database instance')}\n"
+         msg += f" slug: {self._state.db}\n"
+
+     if model_name == "Artifact":
+         result = get_artifact_with_related(
+             self,
+             include_feature_link=True,
+             include_fk=True,
+             include_m2m=True,
+             include_schema=True,
+         )
+     else:
+         result = get_artifact_with_related(self, include_fk=True, include_m2m=True)
+     related_data = result.get("related_data", {})
+     # TODO: fk_data = related_data.get("fk", {})
+
+     tree = describe_general(self)
+     if model_name == "Artifact":
+         return describe_features(
+             self,
+             tree=tree,
+             related_data=related_data,
+             with_labels=True,
+             print_params=hasattr(self, "kind") and self.kind == "model",
+         )
+     else:
+         return tree
+
+
+ def _describe_sqlite(self, print_types: bool = False):  # for artifact & collection
+     from ._describe import describe_general
+     from ._feature_manager import describe_features
+     from .collection import Collection
+
+     model_name = self.__class__.__name__
+     msg = f"{colors.green(model_name)}{record_repr(self, include_foreign_keys=False).lstrip(model_name)}\n"
+     if self._state.db is not None and self._state.db != "default":
+         msg += f" {colors.italic('Database instance')}\n"
+         msg += f" slug: {self._state.db}\n"
+
+     fields = self._meta.fields
+     direct_fields = []
+     foreign_key_fields = []
+     for f in fields:
+         if f.is_relation:
+             foreign_key_fields.append(f.name)
+         else:
+             direct_fields.append(f.name)
+     if not self._state.adding:
+         # prefetch foreign key relationships
+         self = (
+             self.__class__.objects.using(self._state.db)
+             .select_related(*foreign_key_fields)
+             .get(id=self.id)
+         )
+         # prefetch m-2-m relationships
+         many_to_many_fields = []
+         if isinstance(self, (Collection, Artifact)):
+             many_to_many_fields.append("input_of_runs")
+         if isinstance(self, Artifact):
+             many_to_many_fields.append("feature_sets")
+         self = (
+             self.__class__.objects.using(self._state.db)
+             .prefetch_related(*many_to_many_fields)
+             .get(id=self.id)
+         )
+     tree = describe_general(self)
+     if model_name == "Artifact":
+         return describe_features(
+             self,
+             tree=tree,
+             with_labels=True,
+             print_params=hasattr(self, "kind") and self.kind == "model",
+         )
+     else:
+         return tree
+
+
+ def describe_artifact_collection(self):  # for artifact & collection
+     from ._describe import print_rich_tree
+
+     if not self._state.adding and connections[self._state.db].vendor == "postgresql":
+         tree = _describe_postgres(self)
+     else:
+         tree = _describe_sqlite(self)
+
+     print_rich_tree(tree)
+
+
+ def validate_feature(feature: Feature, records: list[Record]) -> None:
+     """Validate feature record, adjust feature.dtype based on labels records."""
+     if not isinstance(feature, Feature):
+         raise TypeError("feature has to be of type Feature")
+     if feature._state.adding:
+         registries = {record.__class__.__get_name_with_module__() for record in records}
+         registries_str = "|".join(registries)
+         msg = f"ln.Feature(name='{feature.name}', type='cat[{registries_str}]').save()"
+         raise ValidationError(f"Feature not validated. If it looks correct: {msg}")
+
+
+ def get_labels(
+     self,
+     feature: Feature,
+     mute: bool = False,
+     flat_names: bool = False,
+ ) -> QuerySet | dict[str, QuerySet] | list:
+     """{}"""  # noqa: D415
+     if not isinstance(feature, Feature):
+         raise TypeError("feature has to be of type Feature")
+     if feature.dtype is None or not feature.dtype.startswith("cat["):
+         raise ValueError("feature does not have linked labels")
+     registries_to_check = feature.dtype.replace("cat[", "").rstrip("]").split("|")
+     if len(registries_to_check) > 1 and not mute:
+         logger.warning("labels come from multiple registries!")
+     # return an empty query set if self.id is still None
+     if self.id is None:
+         return QuerySet(self.__class__)
+     qs_by_registry = {}
+     for registry in registries_to_check:
+         # currently need to distinguish between ULabel and non-ULabel, because
+         # we only have the feature information for Label
+         if registry == "ULabel":
+             links_to_labels = get_label_links(self, registry, feature)
+             label_ids = [link.ulabel_id for link in links_to_labels]
+             qs_by_registry[registry] = ULabel.objects.using(self._state.db).filter(
+                 id__in=label_ids
+             )
+         elif registry in self.features._accessor_by_registry:
+             qs_by_registry[registry] = getattr(
+                 self, self.features._accessor_by_registry[registry]
+             ).all()
+     if flat_names:
+         # returns a flat list of names
+         from .record import get_name_field
+
+         values = []
+         for v in qs_by_registry.values():
+             values += v.list(get_name_field(v))
+         return values
+     if len(registries_to_check) == 1 and registry in qs_by_registry:
+         return qs_by_registry[registry]
+     else:
+         return qs_by_registry
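
A usage sketch for the `.labels` accessor that wraps this function (records
hypothetical; `LabelManager.get()` delegates here):

    study = ln.Feature(name="study", dtype="cat[ULabel]").save()
    artifact.labels.add(candidate_marker_study, feature=study)
    artifact.labels.get(study)                   # QuerySet of ULabels linked under "study"
    artifact.labels.get(study, flat_names=True)  # ["Candidate marker study"]
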
+
+
+ def add_labels(
+     self,
+     records: Record | list[Record] | QuerySet | Iterable,
+     feature: Feature | None = None,
+     *,
+     field: StrField | None = None,
+     feature_ref_is_name: bool | None = None,
+     label_ref_is_name: bool | None = None,
+     from_curator: bool = False,
+ ) -> None:
+     """{}"""  # noqa: D415
+     if self._state.adding:
+         raise ValueError("Please save the artifact/collection before adding a label!")
+
+     if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
+         records = records.list()
+     if isinstance(records, (str, Record)):
+         records = [records]
+     if not isinstance(records, list):  # avoids warning for pd Series
+         records = list(records)
+     # create records from values
+     if len(records) == 0:
+         return None
+     if isinstance(records[0], str):  # type: ignore
+         records_validated = []
+         # feature is needed if we want to create records from values
+         if feature is None:
+             raise ValueError(
+                 "Please pass a feature, e.g., via: label = ln.ULabel(name='my_label',"
+                 " feature=ln.Feature(name='my_feature'))"
+             )
+         if feature.dtype.startswith("cat["):
+             orm_dict = dict_module_name_to_model_name(Artifact)
+             for reg in feature.dtype.replace("cat[", "").rstrip("]").split("|"):
+                 registry = orm_dict.get(reg)
+                 records_validated += registry.from_values(records, field=field)
+
+         # feature doesn't have registries and therefore can't create records from values
+         # ask users to pass records
+         if len(records_validated) == 0:
+             raise ValueError(
+                 "Please pass a record (a `Record` object), not a string, e.g., via:"
+                 " label"
+                 f" = ln.ULabel(name='{records[0]}')"  # type: ignore
+             )
+         records = records_validated
+
+     for record in records:
+         if record._state.adding:
+             raise ValidationError(
+                 f"{record} not validated. If it looks correct: record.save()"
+             )
+
+     if feature is None:
+         d = dict_related_model_to_related_name(self.__class__)
+         # strategy: group records by registry to reduce number of transactions
+         records_by_related_name: dict = {}
+         for record in records:
+             related_name = d.get(record.__class__.__get_name_with_module__())
+             if related_name is None:
+                 raise ValueError(f"Can't add labels to {record.__class__} record!")
+             if related_name not in records_by_related_name:
+                 records_by_related_name[related_name] = []
+             records_by_related_name[related_name].append(record)
+         for related_name, records in records_by_related_name.items():
+             getattr(self, related_name).add(*records)
+     else:
+         validate_feature(feature, records)  # type:ignore
+         records_by_registry = defaultdict(list)
+         feature_sets = self.feature_sets.filter(itype="Feature").all()
+         internal_features = set()  # type: ignore
+         if len(feature_sets) > 0:
+             for schema in feature_sets:
+                 internal_features = internal_features.union(
+                     set(schema.members.values_list("name", flat=True))
+                 )  # type: ignore
+         for record in records:
+             records_by_registry[record.__class__.__get_name_with_module__()].append(
+                 record
+             )
+         for registry_name, records in records_by_registry.items():
+             if not from_curator and feature.name in internal_features:
+                 raise ValidationError(
+                     "Cannot manually annotate internal feature with label. Please use ln.Curator"
+                 )
+             if registry_name not in feature.dtype:
+                 if not feature.dtype.startswith("cat"):
+                     raise ValidationError(
+                         f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{feature.dtype}'"
+                     )
+                 if feature.dtype == "cat":
+                     feature.dtype = f"cat[{registry_name}]"  # type: ignore
+                     feature.save()
+                 elif registry_name not in feature.dtype:
+                     new_dtype = feature.dtype.rstrip("]") + f"|{registry_name}]"
+                     raise ValidationError(
+                         f"Label type {registry_name} is not valid for Feature(name='{feature.name}', dtype='{feature.dtype}'), consider updating to dtype='{new_dtype}'"
+                     )
+
+             if registry_name not in self.features._accessor_by_registry:
+                 logger.warning(f"skipping {registry_name}")
+                 continue
+             if len(records) == 0:
+                 continue
+             features_labels = {
+                 registry_name: [(feature, label_record) for label_record in records]
+             }
+             add_label_feature_links(
+                 self.features,
+                 features_labels,
+                 feature_ref_is_name=feature_ref_is_name,
+                 label_ref_is_name=label_ref_is_name,
+             )
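
A hedged sketch of the two modes above (records hypothetical). Without a feature,
labels attach via the registry-specific accessor; with a feature, the label is
linked under that feature and a bare `dtype="cat"` gets narrowed:

    heart = ln.ULabel(name="heart").save()
    artifact.labels.add(heart)                 # no feature: a plain .ulabels link
    organ = ln.Feature(name="organ", dtype="cat").save()
    artifact.labels.add(heart, feature=organ)  # links the label under "organ"
    organ.dtype                                # -> "cat[ULabel]" after the call
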
943
+
944
+
945
+ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
946
+ # Note that this docstring has to be consistent with Curator.save_artifact()
947
+ """Datasets & models stored as files, folders, or arrays.
948
+
949
+ Artifacts manage data in local or remote storage.
950
+
951
+ Some artifacts are array-like, e.g., when stored as `.parquet`, `.h5ad`,
952
+ `.zarr`, or `.tiledb`.
953
+
954
+ Args:
955
+ data: `UPathStr` A path to a local or remote folder or file.
956
+ kind: `Literal["dataset", "model"] | None = None` Distinguish models from datasets from other files & folders.
957
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
958
+ description: `str | None = None` A description.
959
+ revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
960
+ run: `Run | None = None` The run that creates the artifact.
961
+
962
+ .. dropdown:: Typical storage formats & their API accessors
963
+
964
+ Arrays:
965
+
966
+ - Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
967
+ - Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
968
+ - Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
969
+
970
+ Non-arrays:
971
+
972
+ - Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
973
+ - Fastq: `.fastq` ⟷ /
974
+ - VCF: `.vcf` ⟷ /
975
+ - QC: `.html` ⟷ /
976
+
977
+ You'll find these values in the `suffix` & `accessor` fields.
978
+
979
+ LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).
980
+
981
+ See Also:
982
+ :class:`~lamindb.Storage`
983
+ Storage locations for artifacts.
984
+ :class:`~lamindb.Collection`
985
+ Collections of artifacts.
986
+ :meth:`~lamindb.Artifact.from_df`
987
+ Create an artifact from a `DataFrame`.
988
+ :meth:`~lamindb.Artifact.from_anndata`
989
+ Create an artifact from an `AnnData`.
990
+
991
+ Examples:
992
+
993
+ Create an artifact by passing `key`:
994
+
995
+ >>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
996
+ >>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
997
+
998
+ Calling `.save()` uploads the file to the default storage location of your lamindb instance.
999
+ (If it's a local instance, the "upload" is a mere copy operation.)
1000
+
1001
+ If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:
1002
+
1003
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
1004
+
1005
+ You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`
1006
+
1007
+ >>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
1008
+ >>> artifact_v2.versions.df() # see all versions
1009
+
1010
+ .. dropdown:: Why does the API look this way?
1011
+
1012
+ It's inspired by APIs building on AWS S3.
1013
+
1014
+ Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
1015
+
1016
+ In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
1017
+
1018
+ # signature: S3.Bucket.upload_file(filepath, key)
1019
+ import boto3
1020
+ s3 = boto3.resource('s3')
1021
+ bucket = s3.Bucket('mybucket')
1022
+ bucket.upload_file('/tmp/hello.txt', 'hello.txt')
1023
+
1024
+ In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
1025
+
1026
+ # signature: quilt3.Bucket.put_file(key, filepath)
1027
+ import quilt3
1028
+ bucket = quilt3.Bucket('mybucket')
1029
+ bucket.put_file('hello.txt', '/tmp/hello.txt')
1030
+
1031
+ Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:
1032
+
1033
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
1034
+ >>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()
1035
+
1036
+ Because you can then not use `key`-based versioning you have to pass `revises` to make a new artifact version:
1037
+
1038
+ >>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()
1039
+
1040
+ If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
1041
+ the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
1042
+ detects the duplication and will return the existing artifact.
1043
+
1044
+ """
1045
+
1046
+ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
1047
+ abstract = False
1048
+
1049
+ _len_full_uid: int = 20
1050
+ _len_stem_uid: int = 16
1051
+
1052
+ params: ParamManager = ParamManagerArtifact # type: ignore
1053
+ """Param manager.
1054
+
1055
+ Example::
1056
+
1057
+ artifact.params.add_values({
1058
+ "hidden_size": 32,
1059
+ "bottleneck_size": 16,
1060
+ "batch_size": 32,
1061
+ "preprocess_params": {
1062
+ "normalization_type": "cool",
1063
+ "subset_highlyvariable": True,
1064
+ },
1065
+ })
1066
+ """
1067
+
1068
+ features: FeatureManager = FeatureManager # type: ignore
1069
+ """Feature manager.
1070
+
1071
+ Features denote dataset dimensions, i.e., the variables that measure labels & numbers.
1072
+
1073
+ Annotate with features & values::
1074
+
1075
+ artifact.features.add_values({
1076
+ "species": organism, # here, organism is an Organism record
1077
+ "scientist": ['Barbara McClintock', 'Edgar Anderson'],
1078
+ "temperature": 27.6,
1079
+ "study": "Candidate marker study"
1080
+ })
1081
+
1082
+ Query for features & values::
1083
+
1084
+ ln.Artifact.features.filter(scientist="Barbara McClintock")
1085
+
1086
+ Features may or may not be part of the artifact content in storage. For
1087
+ instance, the :class:`~lamindb.Curator` flow validates the columns of a
1088
+ `DataFrame`-like artifact and annotates it with features corresponding to
1089
+ these columns. `artifact.features.add_values`, by contrast, does not
1090
+ validate the content of the artifact.
1091
+ """
1092
+
1093
+ @property
1094
+ def labels(self) -> LabelManager:
1095
+ """Label manager.
1096
+
1097
+ To annotate with labels, you typically use the registry-specific accessors,
1098
+ for instance :attr:`~lamindb.Artifact.ulabels`::
1099
+
1100
+ candidate_marker_study = ln.ULabel(name="Candidate marker study").save()
1101
+ artifact.ulabels.add(candidate_marker_study)
1102
+
1103
+ Similarly, you query based on these accessors::
1104
+
1105
+ ln.Artifact.filter(ulabels__name="Candidate marker study").all()
1106
+
1107
+ Unlike the registry-specific accessors, the `.labels` accessor provides
1108
+ a way of associating labels with features::
1109
+
1110
+ study = ln.Feature(name="study", dtype="cat").save()
1111
+ artifact.labels.add(candidate_marker_study, feature=study)
1112
+
1113
+ Note that the above is equivalent to::
1114
+
1115
+ artifact.features.add_values({"study": candidate_marker_study})
1116
+ """
1117
+ from ._label_manager import LabelManager
1118
+
1119
+ return LabelManager(self)
1120
+
1121
+ id: int = models.AutoField(primary_key=True)
1122
+ """Internal id, valid only in one DB instance."""
1123
+ uid: str = CharField(
1124
+ editable=False, unique=True, db_index=True, max_length=_len_full_uid
1125
+ )
1126
+ """A universal random id."""
1127
+ key: str | None = CharField(db_index=True, null=True)
1128
+ """A (virtual) relative file path within the artifact's storage location.
1129
+
1130
+ Setting a `key` is useful to automatically group artifacts into a version family.
1131
+
1132
+ LaminDB defaults to a virtual file path to make renaming of data in object storage easy.
1133
+
1134
+ If you register existing files in a storage location, the `key` equals the
1135
+ actual filepath on the underyling filesytem or object store.
1136
+ """
1137
+ description: str | None = CharField(db_index=True, null=True)
1138
+ """A description."""
1139
+ storage: Storage = ForeignKey(
1140
+ Storage, PROTECT, related_name="artifacts", editable=False
1141
+ )
1142
+ """Storage location, e.g. an S3 or GCP bucket or a local directory."""
1143
+ suffix: str = CharField(max_length=30, db_index=True, editable=False)
1144
+ # Initially, we thought about having this be nullable to indicate folders
1145
+ # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
1146
+ """Path suffix or empty string if no canonical suffix exists.
1147
+
1148
+ This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "".
1149
+ """
1150
+ kind: ArtifactKind | None = CharField(
1151
+ max_length=20,
1152
+ db_index=True,
1153
+ null=True,
1154
+ )
1155
+ """:class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
1156
+ otype: str | None = CharField(
1157
+ max_length=64, db_index=True, null=True, editable=False
1158
+ )
1159
+ """Default Python object type, e.g., DataFrame, AnnData."""
1160
+ size: int | None = BigIntegerField(
1161
+ null=True, db_index=True, default=None, editable=False
1162
+ )
1163
+ """Size in bytes.
1164
+
1165
+ Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
1166
+ """
1167
+ hash: str | None = CharField(
1168
+ max_length=HASH_LENGTH, db_index=True, null=True, unique=True, editable=False
1169
+ )
1170
+ """Hash or pseudo-hash of artifact content.
1171
+
1172
+ Useful to ascertain integrity and avoid duplication.
1173
+ """
1174
+ n_files: int | None = BigIntegerField(
1175
+ null=True, db_index=True, default=None, editable=False
1176
+ )
1177
+ """Number of files for folder-like artifacts, `None` for file-like artifacts.
1178
+
1179
+ Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
1180
+
1181
+ .. versionchanged:: 1.0
1182
+ Renamed from `n_objects` to `n_files`.
1183
+ """
1184
+ n_observations: int | None = BigIntegerField(
1185
+ null=True, db_index=True, default=None, editable=False
1186
+ )
1187
+ """Number of observations.
1188
+
1189
+ Typically, this denotes the first array dimension.
1190
+ """
1191
+ _hash_type: str | None = CharField(
1192
+ max_length=30, db_index=True, null=True, editable=False
1193
+ )
1194
+ """Type of hash."""
1195
+ ulabels: ULabel = models.ManyToManyField(
1196
+ ULabel, through="ArtifactULabel", related_name="artifacts"
1197
+ )
1198
+ """The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
1199
+ run: Run | None = ForeignKey(
1200
+ Run,
1201
+ PROTECT,
1202
+ related_name="output_artifacts",
1203
+ null=True,
1204
+ default=None,
1205
+ editable=False,
1206
+ )
1207
+ """Run that created the artifact."""
1208
+ input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
1209
+ """Runs that use this artifact as an input."""
1210
+ _subsequent_runs: Run = models.ManyToManyField(
1211
+ "Run",
1212
+ related_name="_recreated_artifacts",
1213
+ db_table="lamindb_artifact__previous_runs", # legacy name, change in lamindb v2
1214
+ )
1215
+ """Runs that re-created the record after initial creation."""
1216
+ collections: Collection
1217
+ """The collections that this artifact is part of."""
1218
+ schema: Schema | None = ForeignKey(
1219
+ Schema,
1220
+ PROTECT,
1221
+ null=True,
1222
+ default=None,
1223
+ related_name="validated_artifacts",
1224
+ )
1225
+ """The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
1226
+ feature_sets: Schema = models.ManyToManyField(
1227
+ Schema, related_name="artifacts", through="ArtifactSchema"
1228
+ )
1229
+ """The feature sets measured by the artifact."""
1230
+ _feature_values: FeatureValue = models.ManyToManyField(
1231
+ FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
1232
+ )
1233
+ """Non-categorical feature values for annotation."""
1234
+ _param_values: ParamValue = models.ManyToManyField(
1235
+ ParamValue, through="ArtifactParamValue", related_name="artifacts"
1236
+ )
1237
+ """Parameter values."""
1238
+ _key_is_virtual: bool = BooleanField()
1239
+ """Indicates whether `key` is virtual or part of an actual file path."""
1240
+ # be mindful that below, passing related_name="+" leads to errors
1241
+ _actions: Artifact = models.ManyToManyField(
1242
+ "self", symmetrical=False, related_name="_action_targets"
1243
+ )
1244
+ """Actions to attach for the UI."""
1245
+ created_by: User = ForeignKey(
1246
+ "lamindb.User",
1247
+ PROTECT,
1248
+ default=current_user_id,
1249
+ related_name="created_artifacts",
1250
+ editable=False,
1251
+ )
1252
+ """Creator of record."""
1253
+ _overwrite_versions: bool = BooleanField(default=None)
1254
+ """Indicates whether to store or overwrite versions.
1255
+
1256
+ It defaults to False for file-like artifacts and to True for folder-like artifacts.
1257
+ """
1258
+ projects: Project
1259
+ """Linked projects."""
1260
+ references: Reference
1261
+ """Linked references."""
1262
+
1263
+ @overload
1264
+ def __init__(
1265
+ self,
1266
+ # we're not choosing the name "path" for this arg because
1267
+ # it'd be confusing with `artifact.path`, which is not the same
1268
+ # so "data" conveys better that this is input data that's ingested
1269
+ # and will be moved to a target path at `artifact.path`
1270
+ # also internally, we sometimes pass "data objects" like a DataFrame
1271
+ # here; and we might refactor this but we might also keep that internal
1272
+ # usage
1273
+ data: UPathStr,
1274
+ kind: ArtifactKind | None = None,
1275
+ key: str | None = None,
1276
+ description: str | None = None,
1277
+ revises: Artifact | None = None,
1278
+ run: Run | None = None,
1279
+ ): ...
1280
+
1281
+ @overload
1282
+ def __init__(
1283
+ self,
1284
+ *db_args,
1285
+ ): ...
1286
+
1287
+ def __init__(
1288
+ self,
1289
+ *args,
1290
+ **kwargs,
1291
+ ):
1292
+ self.features = FeatureManager(self) # type: ignore
1293
+ self.params = ParamManager(self) # type: ignore
1294
+ # Below checks for the Django-internal call in from_db()
1295
+ # it'd be better if we could avoid this, but not being able to create a Artifact
1296
+ # from data with the default constructor renders the central class of the API
1297
+ # essentially useless
1298
+ # The danger below is not that a user might pass as many args (12 of it), but rather
1299
+ # that at some point the Django API might change; on the other hand, this
1300
+ # condition of for calling the constructor based on kwargs should always
1301
+ # stay robust
1302
+ if len(args) == len(self._meta.concrete_fields):
1303
+ super().__init__(*args, **kwargs)
1304
+ return None
1305
+ # now we proceed with the user-facing constructor
1306
+ if len(args) > 1:
1307
+ raise ValueError("Only one non-keyword arg allowed: data")
1308
+ data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
1309
+ kind: str = kwargs.pop("kind", None)
1310
+ key: str | None = kwargs.pop("key", None)
1311
+ run: Run | None = kwargs.pop("run", None)
1312
+ description: str | None = kwargs.pop("description", None)
1313
+ revises: Artifact | None = kwargs.pop("revises", None)
1314
+ version: str | None = kwargs.pop("version", None)
1315
+ if "visibility" in kwargs: # backward compat
1316
+ _branch_code = kwargs.pop("visibility")
1317
+ elif "_branch_code" in kwargs:
1318
+ _branch_code = kwargs.pop("_branch_code")
1319
+ else:
1320
+ _branch_code = 1
1321
+ format = kwargs.pop("format", None)
1322
+ _is_internal_call = kwargs.pop("_is_internal_call", False)
1323
+ skip_check_exists = kwargs.pop("skip_check_exists", False)
1324
+ if "default_storage" in kwargs:
1325
+ default_storage = kwargs.pop("default_storage")
1326
+ else:
1327
+ if setup_settings.instance.keep_artifacts_local:
1328
+ default_storage = setup_settings.instance.storage_local.record
1329
+ else:
1330
+ default_storage = setup_settings.instance.storage.record
1331
+ using_key = kwargs.pop("using_key", None)
1332
+ otype = kwargs.pop("otype") if "otype" in kwargs else None
1333
+ otype = _check_otype_artifact(data=data, otype=otype)
1334
+ if "type" in kwargs:
1335
+ logger.warning("`type` will be removed soon, please use `kind`")
1336
+ kind = kwargs.pop("type")
1337
+ if not len(kwargs) == 0:
1338
+ valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
1339
+ raise FieldValidationError(
1340
+ f"Only {valid_keywords} can be passed, you passed: {kwargs}"
1341
+ )
1342
+ if revises is not None and key is not None and revises.key != key:
1343
+ note = message_update_key_in_version_family(
1344
+ suid=revises.stem_uid,
1345
+ existing_key=revises.key,
1346
+ new_key=key,
1347
+ registry="Artifact",
1348
+ )
1349
+ raise ValueError(
1350
+ f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
1351
+ )
1352
+ if revises is not None:
1353
+ if not isinstance(revises, Artifact):
1354
+ raise TypeError("`revises` has to be of type `Artifact`")
1355
+ if description is None:
1356
+ description = revises.description
1357
+ if key is not None and AUTO_KEY_PREFIX in key:
1358
+ raise ValueError(
1359
+ f"Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
1360
+ )
1361
+ # below is for internal calls that require defining the storage location
1362
+ # ahead of constructing the Artifact
1363
+ if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
1364
+ if _is_internal_call:
1365
+ is_automanaged_path = True
1366
+ user_provided_key = key
1367
+ key = None
1368
+ else:
1369
+ raise ValueError(
1370
+ f"Do not pass path inside the `{AUTO_KEY_PREFIX}` directory."
1371
+ )
1372
+ else:
1373
+ is_automanaged_path = False
1374
+ provisional_uid, revises = create_uid(revises=revises, version=version)
1375
+ kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
1376
+ data=data,
1377
+ key=key,
1378
+ run=run,
1379
+ format=format,
1380
+ provisional_uid=provisional_uid,
1381
+ version=version,
1382
+ default_storage=default_storage,
1383
+ using_key=using_key,
1384
+ skip_check_exists=skip_check_exists,
1385
+ )
1386
+
1387
+ # an object with the same hash already exists
1388
+ if isinstance(kwargs_or_artifact, Artifact):
1389
+ from .record import init_self_from_db, update_attributes
1390
+
1391
+ init_self_from_db(self, kwargs_or_artifact)
1392
+ # adding "key" here is dangerous because key might be auto-populated
1393
+ attr_to_update = {"description": description}
1394
+ if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None:
1395
+ attr_to_update["key"] = key
1396
+ elif self.key != key and key is not None:
1397
+ logger.warning(
1398
+ f"key {self.key} on existing artifact differs from passed key {key}"
1399
+ )
1400
+ update_attributes(self, attr_to_update)
1401
+ return None
1402
+ else:
1403
+ kwargs = kwargs_or_artifact
1404
+
1405
+ if revises is None:
1406
+ revises = kwargs_or_artifact.pop("revises")
1407
+
1408
+ if data is not None:
1409
+ self._local_filepath = privates["local_filepath"]
1410
+ self._cloud_filepath = privates["cloud_filepath"]
1411
+ self._memory_rep = privates["memory_rep"]
1412
+ self._to_store = not privates["check_path_in_storage"]
1413
+
1414
+ if is_automanaged_path and _is_internal_call:
1415
+ kwargs["_key_is_virtual"] = True
1416
+ assert AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101
1417
+ uid = (
1418
+ kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
1419
+ )
1420
+ kwargs["key"] = user_provided_key
1421
+ if revises is not None:
1422
+ assert uid.startswith(revises.stem_uid) # noqa: S101
1423
+ if len(uid) == 16:
1424
+ if revises is None:
1425
+ uid += "0000"
1426
+ else:
1427
+ uid, revises = create_uid(revises=revises, version=version)
1428
+ kwargs["uid"] = uid
1429
+
1430
+ # only set key now so that we don't do a look-up on it in case revises is passed
1431
+ if revises is not None:
1432
+ kwargs["key"] = revises.key
1433
+
1434
+ kwargs["kind"] = kind
1435
+ kwargs["version"] = version
1436
+ kwargs["description"] = description
1437
+ kwargs["_branch_code"] = _branch_code
1438
+ kwargs["otype"] = otype
1439
+ kwargs["revises"] = revises
1440
+ # this check needs to come down here because key might be populated from an
1441
+ # existing file path during get_artifact_kwargs_from_data()
1442
+ if (
1443
+ kwargs["key"] is None
1444
+ and kwargs["description"] is None
1445
+ and kwargs["run"] is None
1446
+ ):
1447
+ raise ValueError("Pass one of key, run or description as a parameter")
1448
+
1449
+ super().__init__(**kwargs)
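The constructor enforces that a version family shares one `key`: when `revises` is passed, `key` must either be omitted (it is then inherited, together with `description`) or match `revises.key`. A minimal usage sketch, assuming an initialized lamindb instance and local files `data.csv` / `data_v2.csv` (both hypothetical):

```python
import lamindb as ln

# one of key, run, or description must be passed
v1 = ln.Artifact("data.csv", key="datasets/data.csv").save()

# a new version: key and description are inherited from `revises`
v2 = ln.Artifact("data_v2.csv", revises=v1).save()

# a key that differs from revises.key raises a ValueError
# ln.Artifact("data_v2.csv", key="other.csv", revises=v1)
```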
1450
+
1451
+ @property
1452
+ @deprecated("kind")
1453
+ def type(self) -> str:
1454
+ return self.kind
1455
+
1456
+ @property
1457
+ @deprecated("otype")
1458
+ def _accessor(self) -> str:
1459
+ return self.otype
1460
+
1461
+ @property
1462
+ def transform(self) -> Transform | None:
1463
+ """Transform whose run created the artifact."""
1464
+ return self.run.transform if self.run is not None else None
1465
+
1466
+ @property
1467
+ @deprecated("n_files")
1468
+ def n_objects(self) -> int:
1469
+ return self.n_files
1470
+
1471
+ # add the below because this is what people will have in their code
1472
+ # if they implement the recommended migration strategy
1473
+ # - FeatureSet -> Schema
1474
+ # - featureset -> schema
1475
+ # - feature_set -> schema
1476
+ # @property
1477
+ # def schemas(self) -> QuerySet[Schema]:
1478
+ # """Schemas linked to artifact via many-to-many relationship.
1479
+
1480
+ # Is now mediating the private `.feature_sets` relationship during
1481
+ # a transition period to better schema management.
1482
+
1483
+ # .. versionchanged: 1.0
1484
+ # Was previously called `.feature_sets`.
1485
+
1486
+ # """
1487
+ # return self.feature_sets
1488
+
1489
+ @property
1490
+ def path(self) -> Path:
1491
+ """Path.
1492
+
1493
+ File in cloud storage, here AWS S3:
1494
+
1495
+ >>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
1496
+ >>> artifact.path
1497
+ S3QueryPath('s3://my-bucket/my-file.csv')
1498
+
1499
+ File in local storage:
1500
+
1501
+ >>> ln.Artifact("./myfile.csv", key="myfile.csv").save()
1502
+ >>> artifact.path
1503
+ PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv')
1504
+ """
1505
+ from lamindb import settings
1506
+
1507
+ filepath, _ = filepath_from_artifact(self, using_key=settings._using_key)
1508
+ return filepath
1509
+
1510
+ @property
1511
+ def _cache_path(self) -> UPath:
1512
+ from lamindb import settings
1513
+
1514
+ filepath, cache_key = filepath_cache_key_from_artifact(
1515
+ self, using_key=settings._using_key
1516
+ )
1517
+ if isinstance(filepath, LocalPathClasses):
1518
+ return filepath
1519
+ return setup_settings.paths.cloud_to_local_no_update(
1520
+ filepath, cache_key=cache_key
1521
+ )
1522
+
1523
+ @classmethod
1524
+ def from_df(
1525
+ cls,
1526
+ df: pd.DataFrame,
1527
+ *,
1528
+ key: str | None = None,
1529
+ description: str | None = None,
1530
+ run: Run | None = None,
1531
+ revises: Artifact | None = None,
1532
+ **kwargs,
1533
+ ) -> Artifact:
1534
+ """Create from `DataFrame`, validate & link features.
1535
+
1536
+ Args:
1537
+ df: A `DataFrame` object.
1538
+ key: A relative path within default storage,
1539
+ e.g., `"myfolder/myfile.parquet"`.
1540
+ description: A description.
1541
+ revises: An old version of the artifact.
1542
+ run: The run that creates the artifact.
1543
+
1544
+ See Also:
1545
+ :meth:`~lamindb.Collection`
1546
+ Track collections.
1547
+ :class:`~lamindb.Feature`
1548
+ Track features.
1549
+
1550
+ Examples:
1551
+ >>> df = ln.core.datasets.df_iris_in_meter_batch1()
1552
+ >>> df.head()
1553
+ sepal_length sepal_width petal_length petal_width iris_organism_code
1554
+ 0 0.051 0.035 0.014 0.002 0
1555
+ 1 0.049 0.030 0.014 0.002 0
1556
+ 2 0.047 0.032 0.013 0.002 0
1557
+ 3 0.046 0.031 0.015 0.002 0
1558
+ 4 0.050 0.036 0.014 0.002 0
1559
+ >>> artifact = ln.Artifact.from_df(df, description="Iris flower collection batch1")
1560
+ >>> artifact.save()
1561
+ """
1562
+ artifact = Artifact( # type: ignore
1563
+ data=df,
1564
+ key=key,
1565
+ run=run,
1566
+ description=description,
1567
+ revises=revises,
1568
+ otype="DataFrame",
1569
+ kind="dataset",
1570
+ **kwargs,
1571
+ )
1572
+ artifact.n_observations = len(df)
1573
+ return artifact
1574
+
1575
+ @classmethod
1576
+ def from_anndata(
1577
+ cls,
1578
+ adata: Union[AnnData, UPathStr],
1579
+ *,
1580
+ key: str | None = None,
1581
+ description: str | None = None,
1582
+ run: Run | None = None,
1583
+ revises: Artifact | None = None,
1584
+ **kwargs,
1585
+ ) -> Artifact:
1586
+ """Create from ``AnnData``, validate & link features.
1587
+
1588
+ Args:
1589
+ adata: An `AnnData` object or a path to an AnnData-like object.
1590
+ key: A relative path within default storage,
1591
+ e.g., `"myfolder/myfile.h5ad"`.
1592
+ description: A description.
1593
+ revises: An old version of the artifact.
1594
+ run: The run that creates the artifact.
1595
+
1596
+ See Also:
1597
+
1598
+ :meth:`~lamindb.Collection`
1599
+ Track collections.
1600
+ :class:`~lamindb.Feature`
1601
+ Track features.
1602
+
1603
+ Examples:
1604
+ >>> import bionty as bt
1605
+ >>> bt.settings.organism = "human"
1606
+ >>> adata = ln.core.datasets.anndata_with_obs()
1607
+ >>> artifact = ln.Artifact.from_anndata(adata, description="mini anndata with obs")
1608
+ >>> artifact.save()
1609
+ """
1610
+ if not data_is_anndata(adata):
1611
+ raise ValueError(
1612
+ "data has to be an AnnData object or a path to AnnData-like"
1613
+ )
1614
+ _anndata_n_observations(adata)
1615
+ artifact = Artifact( # type: ignore
1616
+ data=adata,
1617
+ key=key,
1618
+ run=run,
1619
+ description=description,
1620
+ revises=revises,
1621
+ otype="AnnData",
1622
+ kind="dataset",
1623
+ **kwargs,
1624
+ )
1625
+ # this is done instead of _anndata_n_observations(adata)
1626
+ # because we need a proper path through create_path for cloud paths
1627
+ # with the additional upath options etc. that create_path adds
1628
+ obj_for_obs: AnnData | UPath
1629
+ if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
1630
+ obj_for_obs = artifact._memory_rep
1631
+ else:
1632
+ # returns ._local_filepath for local files
1633
+ # and the proper path through create_path for cloud paths
1634
+ obj_for_obs = artifact.path
1635
+ artifact.n_observations = _anndata_n_observations(obj_for_obs)
1636
+ return artifact
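Because `adata` may also be a path, `from_anndata` can register an on-disk or cloud AnnData store without loading it; `n_observations` is then computed from the resolved path as shown above. A sketch, with the bucket path being hypothetical:

```python
import lamindb as ln

# from an in-memory AnnData
adata = ln.core.datasets.anndata_with_obs()
art = ln.Artifact.from_anndata(adata, key="examples/with_obs.h5ad").save()

# from a cloud path, without loading into memory (hypothetical bucket)
art2 = ln.Artifact.from_anndata("s3://my-bucket/atlas.h5ad", key="atlas.h5ad").save()
print(art.n_observations, art.otype, art.kind)  # otype == "AnnData", kind == "dataset"
```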
1637
+
1638
+ @classmethod
1639
+ def from_mudata(
1640
+ cls,
1641
+ mdata: Union[MuData, UPathStr],
1642
+ *,
1643
+ key: str | None = None,
1644
+ description: str | None = None,
1645
+ run: Run | None = None,
1646
+ revises: Artifact | None = None,
1647
+ **kwargs,
1648
+ ) -> Artifact:
1649
+ """Create from ``MuData``, validate & link features.
1650
+
1651
+ Args:
1652
+ mdata: A `MuData` object or a path to a MuData-like object.
1653
+ key: A relative path within default storage,
1654
+ e.g., `"myfolder/myfile.h5mu"`.
1655
+ description: A description.
1656
+ revises: An old version of the artifact.
1657
+ run: The run that creates the artifact.
1658
+
1659
+ See Also:
1660
+ :meth:`~lamindb.Collection`
1661
+ Track collections.
1662
+ :class:`~lamindb.Feature`
1663
+ Track features.
1664
+
1665
+ Examples:
1666
+ >>> import bionty as bt
1667
+ >>> bt.settings.organism = "human"
1668
+ >>> mdata = ln.core.datasets.mudata_papalexi21_subset()
1669
+ >>> artifact = ln.Artifact.from_mudata(mdata, description="a mudata object")
1670
+ >>> artifact.save()
1671
+ """
1672
+ if not data_is_mudata(mdata):
1673
+ raise ValueError("data has to be a MuData object or a path to MuData-like")
1674
+ artifact = Artifact( # type: ignore
1675
+ data=mdata,
1676
+ key=key,
1677
+ run=run,
1678
+ description=description,
1679
+ revises=revises,
1680
+ otype="MuData",
1681
+ kind="dataset",
1682
+ **kwargs,
1683
+ )
1684
+ if not isinstance(mdata, UPathStr):
1685
+ artifact.n_observations = mdata.n_obs
1686
+ return artifact
1687
+
1688
+ @classmethod
1689
+ def from_spatialdata(
1690
+ cls,
1691
+ sdata: Union[SpatialData, UPathStr],
1692
+ *,
1693
+ key: str | None = None,
1694
+ description: str | None = None,
1695
+ run: Run | None = None,
1696
+ revises: Artifact | None = None,
1697
+ **kwargs,
1698
+ ) -> Artifact:
1699
+ """Create from ``SpatialData``, validate & link features.
1700
+
1701
+ Args:
1702
+ sdata: A `SpatialData` object or a path to a SpatialData-like object.
1703
+ key: A relative path within default storage,
1704
+ e.g., `"myfolder/myfile.zarr"`.
1705
+ description: A description.
1706
+ revises: An old version of the artifact.
1707
+ run: The run that creates the artifact.
1708
+
1709
+ See Also:
1710
+ :meth:`~lamindb.Collection`
1711
+ Track collections.
1712
+ :class:`~lamindb.Feature`
1713
+ Track features.
1714
+
1715
+ Examples:
1716
+ >>> artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr")
1717
+ """
1718
+ if not data_is_spatialdata(sdata):
1719
+ raise ValueError(
1720
+ "data has to be a SpatialData object or a path to SpatialData-like"
1721
+ )
1722
+ artifact = Artifact( # type: ignore
1723
+ data=sdata,
1724
+ key=key,
1725
+ run=run,
1726
+ description=description,
1727
+ revises=revises,
1728
+ otype="SpatialData",
1729
+ kind="dataset",
1730
+ **kwargs,
1731
+ )
1732
+ # ill-defined https://scverse.zulipchat.com/#narrow/channel/315824-spatial/topic/How.20to.20calculate.20the.20number.20of.20observations.3F
1733
+ # artifact.n_observations = ...
1734
+ return artifact
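As with the other constructors, `sdata` may be an in-memory object or a path. A sketch assuming the `spatialdata` package and a local `.zarr` store (hypothetical path):

```python
import lamindb as ln
import spatialdata as sd

sdata = sd.read_zarr("my_dataset.zarr")  # hypothetical local store
artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr").save()
# n_observations stays unset: it is ill-defined for SpatialData (see comment above)
```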
1735
+
1736
+ @classmethod
1737
+ def from_tiledbsoma(
1738
+ cls,
1739
+ path: UPathStr,
1740
+ *,
1741
+ key: str | None = None,
1742
+ description: str | None = None,
1743
+ run: Run | None = None,
1744
+ revises: Artifact | None = None,
1745
+ **kwargs,
1746
+ ) -> Artifact:
1747
+ """Create from a tiledbsoma store.
1748
+
1749
+ Args:
1750
+ path: A tiledbsoma store with .tiledbsoma suffix.
1751
+ key: A relative path within default storage,
1752
+ e.g., `"myfolder/mystore.tiledbsoma"`.
1753
+ description: A description.
1754
+ revises: An old version of the artifact.
1755
+ run: The run that creates the artifact.
1756
+
1757
+ Examples:
1758
+ >>> artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store")
1759
+ >>> artifact.save()
1760
+ """
1761
+ if UPath(path).suffix != ".tiledbsoma":
1762
+ raise ValueError(
1763
+ "A tiledbsoma store should have .tiledbsoma suffix to be registered."
1764
+ )
1765
+ artifact = Artifact( # type: ignore
1766
+ data=path,
1767
+ key=key,
1768
+ run=run,
1769
+ description=description,
1770
+ revises=revises,
1771
+ otype="tiledbsoma",
1772
+ kind="dataset",
1773
+ **kwargs,
1774
+ )
1775
+ artifact.n_observations = _soma_n_observations(artifact.path)
1776
+ return artifact
1777
+
1778
+ @classmethod
1779
+ def from_dir(
1780
+ cls,
1781
+ path: UPathStr,
1782
+ *,
1783
+ key: str | None = None,
1784
+ run: Run | None = None,
1785
+ ) -> list[Artifact]:
1786
+ """Create a list of artifact objects from a directory.
1787
+
1788
+ Hint:
1789
+ If you have a high number of files (several hundred thousand) and don't want to
1790
+ track them individually, create a single :class:`~lamindb.Artifact` via
1791
+ ``Artifact(path)`` for them. See, e.g., :doc:`docs:rxrx`.
1792
+
1793
+ Args:
1794
+ path: Source path of folder.
1795
+ key: Key for storage destination. If `None` and
1796
+ the directory is in a registered storage location, the inferred `key` will
1797
+ reflect its relative position. If `None` and the directory is outside
1798
+ a registered storage location, the inferred key defaults to `path.name`.
1799
+ run: A `Run` object.
1800
+
1801
+ Examples:
1802
+ >>> dir_path = ln.core.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
1803
+ >>> artifacts = ln.Artifact.from_dir(dir_path)
1804
+ >>> ln.save(artifacts)
1805
+ """
1806
+ from lamindb import settings
1807
+
1808
+ folderpath: UPath = create_path(path) # returns Path for local
1809
+ default_storage = settings.storage.record
1810
+ using_key = settings._using_key
1811
+ storage, use_existing_storage = process_pathlike(
1812
+ folderpath, default_storage, using_key
1813
+ )
1814
+ folder_key_path: PurePath | Path
1815
+ if key is None:
1816
+ if not use_existing_storage:
1817
+ logger.warning(
1818
+ "folder is outside existing storage location, will copy files from"
1819
+ f" {path} to {storage.root}/{folderpath.name}"
1820
+ )
1821
+ folder_key_path = Path(folderpath.name)
1822
+ else:
1823
+ # maintain the hierarchy within an existing storage location
1824
+ folder_key_path = get_relative_path_to_directory(
1825
+ folderpath, UPath(storage.root)
1826
+ )
1827
+ else:
1828
+ folder_key_path = Path(key)
1829
+
1830
+ folder_key = folder_key_path.as_posix()
1831
+ # silence fine-grained logging
1832
+ verbosity = settings.verbosity
1833
+ verbosity_int = settings._verbosity_int
1834
+ if verbosity_int >= 1:
1835
+ settings.verbosity = "warning"
1836
+ artifacts_dict = {}
1837
+ for filepath in folderpath.rglob("*"):
1838
+ if filepath.is_file():
1839
+ relative_path = get_relative_path_to_directory(filepath, folderpath)
1840
+ artifact_key = folder_key + "/" + relative_path.as_posix()
1841
+ # if creating from rglob, we don't need to check for existence
1842
+ artifact = Artifact(
1843
+ filepath, run=run, key=artifact_key, skip_check_exists=True
1844
+ )
1845
+ artifacts_dict[artifact.uid] = artifact
1846
+ settings.verbosity = verbosity
1847
+
1848
+ # run sanity check on hashes
1849
+ hashes = [
1850
+ artifact.hash
1851
+ for artifact in artifacts_dict.values()
1852
+ if artifact.hash is not None
1853
+ ]
1854
+ uids = artifacts_dict.keys()
1855
+ n_unique_hashes = len(set(hashes))
1856
+ if n_unique_hashes == len(hashes):
1857
+ artifacts = list(artifacts_dict.values())
1858
+ else:
1859
+ # consider exact duplicates (same id, same hash)
1860
+ # below can't happen anymore because artifacts is a dict now
1861
+ # if len(set(uids)) == len(set(hashes)):
1862
+ # logger.warning("dropping duplicate records in list of artifact records")
1863
+ # artifacts = list(set(uids))
1864
+ # consider false duplicates (different id, same hash)
1865
+ if not len(set(uids)) == n_unique_hashes:
1866
+ seen_hashes = set()
1867
+ non_unique_artifacts = {
1868
+ uid: artifact
1869
+ for uid, artifact in artifacts_dict.items()
1870
+ if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore
1871
+ }
1872
+ display_non_unique = "\n ".join(
1873
+ f"{artifact}" for artifact in non_unique_artifacts.values()
1874
+ )
1875
+ logger.warning(
1876
+ "there are multiple artifact uids with the same hashes, dropping"
1877
+ f" {len(non_unique_artifacts)} duplicates out of"
1878
+ f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
1879
+ )
1880
+ artifacts = [
1881
+ artifact
1882
+ for artifact in artifacts_dict.values()
1883
+ if artifact not in non_unique_artifacts.values()
1884
+ ]
1885
+ logger.success(
1886
+ f"created {len(artifacts)} artifacts from directory using storage"
1887
+ f" {storage.root} and key = {folder_key}/"
1888
+ )
1889
+ return artifacts
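In short, `from_dir` creates one artifact per file, keys mirror the folder layout, and files whose hashes collide with an already-seen file are dropped with a warning. A sketch with a hypothetical folder:

```python
import lamindb as ln

artifacts = ln.Artifact.from_dir("./fastqs")  # hypothetical folder
ln.save(artifacts)  # bulk-save; hash duplicates were already dropped with a warning
```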
1890
+
1891
+ def replace(
1892
+ self,
1893
+ data: Union[UPathStr, pd.DataFrame, AnnData, MuData],
1894
+ run: Run | None = None,
1895
+ format: str | None = None,
1896
+ ) -> None:
1897
+ """Replace artifact content.
1898
+
1899
+ Args:
1900
+ data: A file path or an in-memory object (`DataFrame`, `AnnData`, `MuData`).
1902
+ run: The run that updates the artifact; gets
1903
+ auto-linked if ``ln.track()`` was called.
1903
+
1904
+ Examples:
1905
+ Say we made a change to the content of an artifact, e.g., edited the image
1906
+ `paradisi05_laminopathic_nuclei.jpg`.
1907
+
1908
+ This is how we replace the old file in storage with the new file:
1909
+
1910
+ >>> artifact.replace("paradisi05_laminopathic_nuclei.jpg")
1911
+ >>> artifact.save()
1912
+
1913
+ Note that this neither changes the storage key nor the filename stem.
1914
+
1915
+ However, the suffix is updated if it changes.
1916
+ """
1917
+ from lamindb import settings
1918
+
1919
+ default_storage = settings.storage.record
1920
+ kwargs, privates = get_artifact_kwargs_from_data(
1921
+ provisional_uid=self.uid,
1922
+ data=data,
1923
+ key=self.key,
1924
+ run=run,
1925
+ format=format,
1926
+ default_storage=default_storage,
1927
+ version=None,
1928
+ is_replace=True,
1929
+ )
1930
+
1931
+ # this artifact already exists
1932
+ if privates is None:
1933
+ return kwargs
1934
+
1935
+ check_path_in_storage = privates["check_path_in_storage"]
1936
+ if check_path_in_storage:
1937
+ err_msg = (
1938
+ "Can only replace with a local path not in any Storage. "
1939
+ f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}."
1940
+ )
1941
+ raise ValueError(err_msg)
1942
+
1943
+ _overwrite_versions = kwargs["_overwrite_versions"]
1944
+ if self._overwrite_versions != _overwrite_versions:
1945
+ err_msg = "It is not allowed to replace "
1946
+ err_msg += "a folder" if self._overwrite_versions else "a file"
1947
+ err_msg += " with " + ("a folder." if _overwrite_versions else "a file.")
1948
+ raise ValueError(err_msg)
1949
+
1950
+ if self.key is not None and not self._key_is_virtual:
1951
+ key_path = PurePosixPath(self.key)
1952
+ new_filename = f"{key_path.stem}{kwargs['suffix']}"
1953
+ # the following will only be true if the suffix changes!
1954
+ if key_path.name != new_filename:
1955
+ self._clear_storagekey = self.key
1956
+ self.key = str(key_path.with_name(new_filename))
1957
+ # update old key with the new one so that checks in record pass
1958
+ self._old_key = self.key
1959
+ logger.warning(
1960
+ f"replacing the file will replace key '{key_path}' with '{self.key}'"
1961
+ f" and delete '{key_path}' upon `save()`"
1962
+ )
1963
+ else:
1964
+ old_storage = auto_storage_key_from_artifact(self)
1965
+ is_dir = self.n_files is not None
1966
+ new_storage = auto_storage_key_from_artifact_uid(
1967
+ self.uid, kwargs["suffix"], is_dir
1968
+ )
1969
+ if old_storage != new_storage:
1970
+ self._clear_storagekey = old_storage
1971
+ if self.key is not None:
1972
+ new_key_path = PurePosixPath(self.key).with_suffix(kwargs["suffix"])
1973
+ self.key = str(new_key_path)
1974
+ # update old key with the new one so that checks in record pass
1975
+ self._old_key = self.key
1976
+
1977
+ self.suffix = kwargs["suffix"]
1978
+ self.size = kwargs["size"]
1979
+ self.hash = kwargs["hash"]
1980
+ self._hash_type = kwargs["_hash_type"]
1981
+ self.run_id = kwargs["run_id"]
1982
+ self.run = kwargs["run"]
1983
+ self.n_files = kwargs["n_files"]
1984
+
1985
+ self._local_filepath = privates["local_filepath"]
1986
+ self._cloud_filepath = privates["cloud_filepath"]
1987
+ self._memory_rep = privates["memory_rep"]
1988
+ # no need to upload if new file is already in storage
1989
+ self._to_store = not check_path_in_storage
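`replace` keeps the artifact's `uid` and `key` but updates `suffix`, `size`, and `hash`; if the suffix changes for a non-virtual key, the old file is scheduled for deletion upon `save()`. A sketch with hypothetical keys and paths:

```python
import lamindb as ln

artifact = ln.Artifact.get(key="figures/nuclei.jpg")  # hypothetical key
artifact.replace("./nuclei.png")  # same uid and key stem; suffix becomes .png
artifact.save()  # uploads the new file and clears the old storage key
```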
1990
+
1991
+ def open(
1992
+ self, mode: str = "r", is_run_input: bool | None = None, **kwargs
1993
+ ) -> Union[
1994
+ AnnDataAccessor,
1995
+ BackedAccessor,
1996
+ SOMACollection,
1997
+ SOMAExperiment,
1998
+ SOMAMeasurement,
1999
+ PyArrowDataset,
2000
+ ]:
2001
+ """Return a cloud-backed data object.
2002
+
2003
+ Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
2004
+ `tiledbsoma` objects (`.tiledbsoma`), and `pyarrow`-compatible formats.
2005
+
2006
+ Args:
2007
+ mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
2008
+ otherwise must always be `"r"` (read-only mode).
2009
+
2010
+ Notes:
2011
+ For more info, see tutorial: :doc:`/arrays`.
2012
+
2013
+ Examples:
2014
+
2015
+ Read AnnData in backed mode from cloud:
2016
+
2017
+ >>> artifact = ln.Artifact.get(key="lndb-storage/pbmc68k.h5ad")
2018
+ >>> artifact.open()
2019
+ AnnDataAccessor object with n_obs × n_vars = 70 × 765
2020
+ constructed for the AnnData object pbmc68k.h5ad
2021
+ ...
2022
+ """
2023
+ if self._overwrite_versions and not self.is_latest:
2024
+ raise ValueError(INCONSISTENT_STATE_MSG)
2025
+ # all hdf5 suffixes including gzipped
2026
+ h5_suffixes = [".h5", ".hdf5", ".h5ad"]
2027
+ h5_suffixes += [s + ".gz" for s in h5_suffixes]
2028
+ # ignore empty suffix for now
2029
+ suffixes = (
2030
+ (
2031
+ "",
2032
+ ".zarr",
2033
+ ".anndata.zarr",
2034
+ ".tiledbsoma",
2035
+ )
2036
+ + tuple(h5_suffixes)
2037
+ + PYARROW_SUFFIXES
2038
+ + tuple(
2039
+ s + ".gz" for s in PYARROW_SUFFIXES
2040
+ ) # this doesn't work for externally gzipped files, REMOVE LATER
2041
+ )
2042
+ if self.suffix not in suffixes:
2043
+ raise ValueError(
2044
+ "Artifact should have a zarr, h5, tiledbsoma object"
2045
+ " or a compatible `pyarrow.dataset.dataset` directory"
2046
+ " as the underlying data, please use one of the following suffixes"
2047
+ f" for the object name: {', '.join(suffixes[1:])}."
2048
+ f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
2049
+ " (no mixing allowed)."
2050
+ )
2051
+ if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
2052
+ raise ValueError(
2053
+ "Only a tiledbsoma store can be openened with `mode!='r'`."
2054
+ )
2055
+
2056
+ from lamindb import settings
2057
+ from lamindb.core.storage._backed_access import (
2058
+ _track_writes_factory,
2059
+ backed_access,
2060
+ )
2061
+
2062
+ using_key = settings._using_key
2063
+ filepath, cache_key = filepath_cache_key_from_artifact(
2064
+ self, using_key=using_key
2065
+ )
2066
+ is_tiledbsoma_w = (
2067
+ filepath.name == "soma" or self.suffix == ".tiledbsoma"
2068
+ ) and mode == "w"
2069
+ # consider the case where an object is already locally cached
2070
+ localpath = setup_settings.paths.cloud_to_local_no_update(
2071
+ filepath, cache_key=cache_key
2072
+ )
2073
+ if is_tiledbsoma_w:
2074
+ open_cache = False
2075
+ else:
2076
+ open_cache = not isinstance(
2077
+ filepath, LocalPathClasses
2078
+ ) and not filepath.synchronize(localpath, just_check=True)
2079
+ if open_cache:
2080
+ try:
2081
+ access = backed_access(localpath, mode, using_key, **kwargs)
2082
+ except Exception as e:
2083
+ if isinstance(filepath, LocalPathClasses):
2084
+ raise e
2085
+ logger.warning(
2086
+ f"The cache might be corrupted: {e}. Trying to open directly."
2087
+ )
2088
+ access = backed_access(filepath, mode, using_key, **kwargs)
2089
+ # happens only if backed_access has been successful
2090
+ # delete the corrupted cache
2091
+ if localpath.is_dir():
2092
+ shutil.rmtree(localpath)
2093
+ else:
2094
+ localpath.unlink(missing_ok=True)
2095
+ else:
2096
+ access = backed_access(filepath, mode, using_key, **kwargs)
2097
+ if is_tiledbsoma_w:
2098
+
2099
+ def finalize():
2100
+ nonlocal self, filepath, localpath
2101
+ if not isinstance(filepath, LocalPathClasses):
2102
+ _, hash, _, _ = get_stat_dir_cloud(filepath)
2103
+ else:
2104
+ # this can be very slow
2105
+ _, hash, _, _ = hash_dir(filepath)
2106
+ if self.hash != hash:
2107
+ from .record import init_self_from_db
2108
+
2109
+ new_version = Artifact(
2110
+ filepath, revises=self, _is_internal_call=True
2111
+ ).save()
2112
+ init_self_from_db(self, new_version)
2113
+
2114
+ if localpath != filepath and localpath.exists():
2115
+ shutil.rmtree(localpath)
2116
+
2117
+ access = _track_writes_factory(access, finalize)
2118
+ # only call if open is successful
2119
+ _track_run_input(self, is_run_input)
2120
+ return access
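Read access works for all supported suffixes, while `mode="w"` is reserved for tiledbsoma stores: as the `finalize` callback above shows, closing a write-mode store whose hash changed registers a new artifact version. A sketch with hypothetical keys:

```python
import lamindb as ln

# read-only backed access, e.g. an AnnDataAccessor for an .h5ad artifact
artifact = ln.Artifact.get(key="lndb-storage/pbmc68k.h5ad")
backed = artifact.open()  # nothing is loaded into memory

# write mode, tiledbsoma only; a new version is created if the hash changes
soma_artifact = ln.Artifact.get(key="store.tiledbsoma")  # hypothetical key
with soma_artifact.open(mode="w") as store:
    ...  # append to the store; on close, finalize() re-hashes and re-versions
```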
2121
+
2122
+ def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
2123
+ """Cache and load into memory.
2124
+
2125
+ See all :mod:`~lamindb.core.loaders`.
2126
+
2127
+ Examples:
2128
+
2129
+ Load a `DataFrame`-like artifact:
2130
+
2131
+ >>> artifact.load().head()
2132
+ sepal_length sepal_width petal_length petal_width iris_organism_code
2133
+ 0 0.051 0.035 0.014 0.002 0
2134
+ 1 0.049 0.030 0.014 0.002 0
2135
+ 2 0.047 0.032 0.013 0.002 0
2136
+ 3 0.046 0.031 0.015 0.002 0
2137
+ 4 0.050 0.036 0.014 0.002 0
2138
+
2139
+ Load an `AnnData`-like artifact:
2140
+
2141
+ >>> artifact.load()
2142
+ AnnData object with n_obs × n_vars = 70 × 765
2143
+
2144
+ Fall back to :meth:`~lamindb.Artifact.cache` if no in-memory representation is configured:
2145
+
2146
+ >>> artifact.load()
2147
+ PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/.lamindb/jb7BY5UJoQVGMUOKiLcn.jpg')
2148
+ """
2149
+ from lamindb import settings
2150
+
2151
+ if self._overwrite_versions and not self.is_latest:
2152
+ raise ValueError(INCONSISTENT_STATE_MSG)
2153
+
2154
+ if hasattr(self, "_memory_rep") and self._memory_rep is not None:
2155
+ access_memory = self._memory_rep
2156
+ else:
2157
+ filepath, cache_key = filepath_cache_key_from_artifact(
2158
+ self, using_key=settings._using_key
2159
+ )
2160
+ cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
2161
+ try:
2162
+ # cache_path is local so doesn't trigger any sync in load_to_memory
2163
+ access_memory = load_to_memory(cache_path, **kwargs)
2164
+ except Exception as e:
2165
+ # raise the exception if it comes from not having a correct loader
2166
+ # or if the original path is local
2167
+ if isinstance(e, NotImplementedError) or isinstance(
2168
+ filepath, LocalPathClasses
2169
+ ):
2170
+ raise e
2171
+ logger.warning(
2172
+ f"The cache might be corrupted: {e}. Retrying to synchronize."
2173
+ )
2174
+ # delete the existing cache
2175
+ if cache_path.is_dir():
2176
+ shutil.rmtree(cache_path)
2177
+ else:
2178
+ cache_path.unlink(missing_ok=True)
2179
+ # download again and try to load into memory
2180
+ cache_path = _synchronize_cleanup_on_error(
2181
+ filepath, cache_key=cache_key
2182
+ )
2183
+ access_memory = load_to_memory(cache_path, **kwargs)
2184
+ # only call if load is successful
2185
+ _track_run_input(self, is_run_input)
2186
+ return access_memory
2187
+
2188
+ def cache(self, is_run_input: bool | None = None) -> Path:
2189
+ """Download cloud artifact to local cache.
2190
+
2191
+ Follows syncing logic: only caches an artifact if it's outdated in the local cache.
2192
+
2193
+ Returns a path to a locally cached on-disk object (say a `.jpg` file).
2194
+
2195
+ Examples:
2196
+
2197
+ Sync file from cloud and return the local path of the cache:
2198
+
2199
+ >>> artifact.cache()
2200
+ PosixPath('/home/runner/work/Caches/lamindb/lamindb-ci/lndb-storage/pbmc68k.h5ad')
2201
+ """
2202
+ from lamindb import settings
2203
+
2204
+ if self._overwrite_versions and not self.is_latest:
2205
+ raise ValueError(INCONSISTENT_STATE_MSG)
2206
+
2207
+ filepath, cache_key = filepath_cache_key_from_artifact(
2208
+ self, using_key=settings._using_key
2209
+ )
2210
+ cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
2211
+ # only call if sync is successful
2212
+ _track_run_input(self, is_run_input)
2213
+ return cache_path
2214
+
2215
+ def delete(
2216
+ self,
2217
+ permanent: bool | None = None,
2218
+ storage: bool | None = None,
2219
+ using_key: str | None = None,
2220
+ ) -> None:
2221
+ """Trash or permanently delete.
2222
+
2223
+ A first call to `.delete()` puts an artifact into the trash (sets `_branch_code` to `-1`).
2224
+ A second call permanently deletes the artifact.
2225
+ For a folder artifact with multiple versions, deleting a non-latest version
2226
+ will not delete the underlying storage unless `storage=True` is specified.
2227
+ Deleting the latest version deletes all versions of a folder artifact.
2228
+
2229
+ FAQ: :doc:`docs:faq/storage`
2230
+
2231
+ Args:
2232
+ permanent: Permanently delete the artifact (skip trash).
2233
+ storage: Indicate whether you want to delete the artifact in storage.
2234
+
2235
+ Examples:
2236
+
2237
+ For an `Artifact` object `artifact`, call:
2238
+
2239
+ >>> artifact = ln.Artifact.filter(key="some.csv").one()
2240
+ >>> artifact.delete() # delete a single file artifact
2241
+
2242
+ >>> artifact = ln.Artifact.filter(key="some.tiledbsoma". is_latest=False).first()
2243
+ >>> artiact.delete() # delete an old version, the data will not be deleted
2244
+
2245
+ >>> artifact = ln.Artifact.filter(key="some.tiledbsoma". is_latest=True).one()
2246
+ >>> artiact.delete() # delete all versions, the data will be deleted or prompted for deletion.
2247
+ """
2248
+ # this first check means an invalid delete fails fast rather than cascading through
2249
+ # database and storage permission errors
2250
+ if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
2251
+ isettings = setup_settings.instance
2252
+ if self.storage.instance_uid != isettings.uid and (
2253
+ storage or storage is None
2254
+ ):
2255
+ raise IntegrityError(
2256
+ "Cannot simply delete artifacts outside of this instance's managed storage locations."
2257
+ "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
2258
+ f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
2259
+ f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
2260
+ )
2261
+ # by default, we only move artifacts into the trash (_branch_code = -1)
2262
+ trash__branch_code = -1
2263
+ if self._branch_code > trash__branch_code and not permanent:
2264
+ if storage is not None:
2265
+ logger.warning("moving artifact to trash, storage arg is ignored")
2266
+ # move to trash
2267
+ self._branch_code = trash__branch_code
2268
+ self.save()
2269
+ logger.important(
2270
+ f"moved artifact to trash (_branch_code = {trash__branch_code})"
2271
+ )
2272
+ return
2273
+
2274
+ # if the artifact is already in the trash
2275
+ # permanent delete skips the trash
2276
+ if permanent is None:
2277
+ # ask for confirmation of permanent delete
2278
+ response = input(
2279
+ "Artifact record is already in trash! Are you sure you want to permanently"
2280
+ " delete it? (y/n) You can't undo this action."
2281
+ )
2282
+ delete_record = response == "y"
2283
+ else:
2284
+ assert permanent # noqa: S101
2285
+ delete_record = True
2286
+
2287
+ if delete_record:
2288
+ # need to grab file path before deletion
2289
+ try:
2290
+ path, _ = filepath_from_artifact(self, using_key)
2291
+ except OSError:
2292
+ # we can still delete the record
2293
+ logger.warning("Could not get path")
2294
+ storage = False
2295
+ # only delete in storage if DB delete is successful
2296
+ # DB delete might error because of a foreign key constraint violated etc.
2297
+ if self._overwrite_versions and self.is_latest:
2298
+ # includes self
2299
+ for version in self.versions.all():
2300
+ _delete_skip_storage(version)
2301
+ else:
2302
+ self._delete_skip_storage()
2303
+ # by default do not delete storage if deleting only a previous version
2304
+ # and the underlying store is mutable
2305
+ if self._overwrite_versions and not self.is_latest:
2306
+ delete_in_storage = False
2307
+ if storage:
2308
+ logger.warning(
2309
+ "Storage argument is ignored; can't delete storage on an previous version"
2310
+ )
2311
+ elif self.key is None or self._key_is_virtual:
2312
+ # do not ask for confirmation also if storage is None
2313
+ delete_in_storage = storage is None or storage
2314
+ else:
2315
+ # for artifacts with non-virtual semantic storage keys (key is not None)
2316
+ # ask for extra-confirmation
2317
+ if storage is None:
2318
+ response = input(
2319
+ f"Are you sure to want to delete {path}? (y/n) You can't undo"
2320
+ " this action."
2321
+ )
2322
+ delete_in_storage = response == "y"
2323
+ else:
2324
+ delete_in_storage = storage
2325
+ if not delete_in_storage:
2326
+ logger.important(f"a file/folder remains here: {path}")
2327
+ # we don't yet have logic to bring back the deleted metadata record
2328
+ # in case storage deletion fails - this is important for ACID down the road
2329
+ if delete_in_storage:
2330
+ delete_msg = delete_storage(path, raise_file_not_found_error=False)
2331
+ if delete_msg != "did-not-delete":
2332
+ logger.success(f"deleted {colors.yellow(f'{path}')}")
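A sketch of the two-step deletion semantics described in the docstring (hypothetical key):

```python
import lamindb as ln

artifact = ln.Artifact.get(key="some.csv")  # hypothetical key
artifact.delete()                # 1st call: move to trash (_branch_code = -1)
artifact.delete(permanent=True)  # 2nd call: delete the record and storage for good

# delete only the metadata record and keep the file in storage:
# artifact.delete(permanent=True, storage=False)
```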
2333
+
2334
+ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
2335
+ """Save to database & storage.
2336
+
2337
+ Args:
2338
+ upload: Trigger upload to cloud storage in instances with hybrid storage mode.
2339
+
2340
+ Examples:
2341
+ >>> artifact = ln.Artifact("./myfile.csv", description="myfile")
2342
+ >>> artifact.save()
2343
+ """
2344
+ state_was_adding = self._state.adding
2345
+ print_progress = kwargs.pop("print_progress", True)
2346
+ store_kwargs = kwargs.pop(
2347
+ "store_kwargs", {}
2348
+ ) # kwargs for .upload_from in the end
2349
+ access_token = kwargs.pop("access_token", None)
2350
+ local_path = None
2351
+ if upload and setup_settings.instance.keep_artifacts_local:
2352
+ # switch local storage location to cloud
2353
+ local_path = self.path
2354
+ self.storage_id = setup_settings.instance.storage.id
2355
+ self._local_filepath = local_path
2356
+ # switch to virtual storage key upon upload
2357
+ # the local filepath is already cached at that point
2358
+ self._key_is_virtual = True
2359
+ # ensure that the artifact is uploaded
2360
+ self._to_store = True
2361
+
2362
+ self._save_skip_storage(**kwargs)
2363
+
2364
+ from .save import check_and_attempt_clearing, check_and_attempt_upload
2365
+
2366
+ using_key = None
2367
+ if "using" in kwargs:
2368
+ using_key = kwargs["using"]
2369
+ exception_upload = check_and_attempt_upload(
2370
+ self,
2371
+ using_key,
2372
+ access_token=access_token,
2373
+ print_progress=print_progress,
2374
+ **store_kwargs,
2375
+ )
2376
+ if exception_upload is not None:
2377
+ # we do not want to raise file not found on cleanup if upload of a file failed
2378
+ # often it is ACID in the filesystem itself
2379
+ # for example, s3 won't have the failed file, so just skip the delete in this case
2380
+ raise_file_not_found_error = False
2381
+ self._delete_skip_storage()
2382
+ else:
2383
+ # this is the case when it is cleaned on .replace
2384
+ raise_file_not_found_error = True
2385
+ # this is triggered by an exception in check_and_attempt_upload or by replace.
2386
+ exception_clear = check_and_attempt_clearing(
2387
+ self,
2388
+ raise_file_not_found_error=raise_file_not_found_error,
2389
+ using_key=using_key,
2390
+ )
2391
+ if exception_upload is not None:
2392
+ raise RuntimeError(exception_upload)
2393
+ if exception_clear is not None:
2394
+ raise RuntimeError(exception_clear)
2395
+ # this is only for keep_artifacts_local
2396
+ if local_path is not None and not state_was_adding:
2397
+ # only move the local artifact to cache if it was not newly created
2398
+ local_path_cache = ln_setup.settings.cache_dir / local_path.name
2399
+ # don't use Path.rename here because of cross-device link error
2400
+ # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
2401
+ shutil.move(
2402
+ local_path, # type: ignore
2403
+ local_path_cache,
2404
+ )
2405
+ logger.important(f"moved local artifact to cache: {local_path_cache}")
2406
+ return self
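For instances configured with `keep_artifacts_local`, `save(upload=True)` switches the storage location from the local default to the cloud default and triggers the upload, as the branch at the top of `save` shows. A minimal sketch:

```python
import lamindb as ln

# saved to the local storage location if keep_artifacts_local is enabled
artifact = ln.Artifact("./myfile.csv", key="myfile.csv").save()

# later: promote to the cloud storage location and upload
artifact.save(upload=True)
```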
2407
+
2408
+ def restore(self) -> None:
2409
+ """Restore from trash.
2410
+
2411
+ Examples:
2412
+ >>> artifact.restore()
2413
+ """
2414
+ self._branch_code = 1
2415
+ self.save()
2416
+
2417
+ def describe(self) -> None:
2418
+ """Describe relations of record.
2419
+
2420
+ Examples:
2421
+ >>> artifact.describe()
2422
+ """
2423
+ return describe_artifact_collection(self)
2424
+
2425
+ def _populate_subsequent_runs(self, run: Run) -> None:
2426
+ _populate_subsequent_runs_(self, run)
2427
+
2428
+
2429
+ # can't really just call .cache in .load because of double tracking
2430
+ def _synchronize_cleanup_on_error(
2431
+ filepath: UPath, cache_key: str | None = None
2432
+ ) -> UPath:
2433
+ try:
2434
+ cache_path = setup_settings.paths.cloud_to_local(
2435
+ filepath, cache_key=cache_key, print_progress=True
2436
+ )
2437
+ except Exception as e:
2438
+ if not isinstance(filepath, LocalPathClasses):
2439
+ cache_path = setup_settings.paths.cloud_to_local_no_update(
2440
+ filepath, cache_key=cache_key
2441
+ )
2442
+ if cache_path.is_dir():
2443
+ shutil.rmtree(cache_path)
2444
+ else:
2445
+ cache_path.unlink(missing_ok=True)
2446
+ raise e
2447
+ return cache_path
2448
+
2449
+
2450
+ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
2451
+ super(Artifact, artifact).delete(*args, **kwargs)
2452
+
2453
+
2454
+ def _save_skip_storage(artifact, **kwargs) -> None:
2455
+ save_staged_feature_sets(artifact)
2456
+ super(Artifact, artifact).save(**kwargs)
2457
+ save_schema_links(artifact)
2458
+
2459
+
2460
+ class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
2461
+ id: int = models.BigAutoField(primary_key=True)
2462
+ artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
2463
+ # we follow the lower() case convention rather than snake case for link models
2464
+ featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="+")
2465
+
2466
+ class Meta:
2467
+ unique_together = ("artifact", "featurevalue")
2468
+
2469
+
2470
+ class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
2471
+ id: int = models.BigAutoField(primary_key=True)
2472
+ artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
2473
+ # we follow the lower() case convention rather than snake case for link models
2474
+ paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
2475
+
2476
+ class Meta:
2477
+ unique_together = ("artifact", "paramvalue")
2478
+
2479
+
2480
+ def _track_run_input(
2481
+ data: Artifact
2482
+ | Iterable[Artifact], # can also be Collection | Iterable[Collection]
2483
+ is_run_input: bool | Run | None = None,
2484
+ run: Run | None = None,
2485
+ ):
2486
+ from lamindb import settings
2487
+
2488
+ from .._tracked import get_current_tracked_run
2489
+ from ..core._context import context
2490
+ from .collection import Collection
2491
+
2492
+ if isinstance(is_run_input, Run):
2493
+ run = is_run_input
2494
+ is_run_input = True
2495
+ elif run is None:
2496
+ run = get_current_tracked_run()
2497
+ if run is None:
2498
+ run = context.run
2499
+ # consider that data is an iterable of artifacts or collections
2500
+ data_iter: Iterable[Artifact] | Iterable[Collection] = (
2501
+ [data] if isinstance(data, (Artifact, Collection)) else data
2502
+ )
2503
+ track_run_input = False
2504
+ input_data = []
2505
+ if run is not None:
2506
+ # avoid cycles: data can't be both input and output
2507
+ def is_valid_input(data: Artifact | Collection):
2508
+ is_valid = False
2509
+ if data._state.db == "default":
2510
+ # things are OK if the record is on the default db
2511
+ is_valid = True
2512
+ elif data._state.db is None:
2513
+ # if a record is not yet saved, it can't be an input
2514
+ # we silently ignore because what likely happens is that
2515
+ # the user works with an object that's about to be saved
2516
+ # in the current Python session
2517
+ is_valid = False
2518
+ else:
2519
+ # record is on another db
2520
+ # we have to save the record into the current db with
2521
+ # the run being attached to a transfer transform
2522
+ logger.important(
2523
+ f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}') as input"
2524
+ )
2525
+ data.save()
2526
+ is_valid = True
2527
+ return (
2528
+ data.run_id != run.id
2529
+ and not data._state.adding # this seems duplicated with data._state.db is None
2530
+ and is_valid
2531
+ )
2532
+
2533
+ input_data = [data for data in data_iter if is_valid_input(data)]
2534
+ input_data_ids = [data.id for data in input_data]
2535
+ if input_data:
2536
+ data_class_name = input_data[0].__class__.__name__.lower()
2537
+ # let us first look at the case in which the user does not
2538
+ # provide a boolean value for `is_run_input`
2539
+ # hence, we need to determine whether we actually want to
2540
+ # track a run or not
2541
+ if is_run_input is None:
2542
+ # we don't have a run record
2543
+ if run is None:
2544
+ if settings.track_run_inputs:
2545
+ # here we check that this is not a read-only connection
2546
+ # normally for our connection strings the read-only role name has _read in it
2547
+ # not absolutely safe but the worst case is that the warning is not shown
2548
+ instance = setup_settings.instance
2549
+ if instance.dialect != "postgresql" or "_read" not in instance.db:
2550
+ logger.warning(WARNING_NO_INPUT)
2551
+ # assume we have a run record
2552
+ else:
2553
+ # assume there is non-cyclic candidate input data
2554
+ if input_data:
2555
+ if settings.track_run_inputs:
2556
+ transform_note = ""
2557
+ if len(input_data) == 1:
2558
+ if input_data[0].transform is not None:
2559
+ transform_note = (
2560
+ ", adding parent transform"
2561
+ f" {input_data[0].transform.id}"
2562
+ )
2563
+ logger.info(
2564
+ f"adding {data_class_name} ids {input_data_ids} as inputs for run"
2565
+ f" {run.id}{transform_note}"
2566
+ )
2567
+ track_run_input = True
2568
+ else:
2569
+ logger.hint(
2570
+ "track these data as a run input by passing `is_run_input=True`"
2571
+ )
2572
+ else:
2573
+ track_run_input = is_run_input
2574
+ if track_run_input:
2575
+ if run is None:
2576
+ raise ValueError("No run context set. Call `ln.track()`.")
2577
+ # avoid adding the same run twice
2578
+ run.save()
2579
+ if data_class_name == "artifact":
2580
+ LinkORM = run.input_artifacts.through
2581
+ links = [
2582
+ LinkORM(run_id=run.id, artifact_id=data_id)
2583
+ for data_id in input_data_ids
2584
+ ]
2585
+ else:
2586
+ LinkORM = run.input_collections.through
2587
+ links = [
2588
+ LinkORM(run_id=run.id, collection_id=data_id)
2589
+ for data_id in input_data_ids
2590
+ ]
2591
+ LinkORM.objects.bulk_create(links, ignore_conflicts=True)
2592
+ # generalize below for more than one data batch
2593
+ if len(input_data) == 1:
2594
+ if input_data[0].transform is not None:
2595
+ run.transform.predecessors.add(input_data[0].transform)
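From the user's side, input tracking is driven by the run context and the `is_run_input` flag on `load`, `cache`, and `open`. A sketch (hypothetical key):

```python
import lamindb as ln

ln.track()  # set a run context so that inputs can be linked to the run

artifact = ln.Artifact.get(key="datasets/data.csv")  # hypothetical key
df = artifact.load()                   # tracked if settings.track_run_inputs is set
df = artifact.load(is_run_input=True)  # force tracking regardless of the setting
```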
2596
+
2597
+
2598
+ # privates currently dealt with separately
2599
+ # mypy: ignore-errors
2600
+ Artifact._delete_skip_storage = _delete_skip_storage
2601
+ Artifact._save_skip_storage = _save_skip_storage
2602
+ Artifact.view_lineage = view_lineage