lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. lamindb/__init__.py +31 -26
  2. lamindb/_finish.py +9 -1
  3. lamindb/_tracked.py +26 -3
  4. lamindb/_view.py +2 -3
  5. lamindb/base/__init__.py +1 -1
  6. lamindb/base/ids.py +1 -10
  7. lamindb/base/users.py +1 -4
  8. lamindb/core/__init__.py +7 -65
  9. lamindb/core/_context.py +41 -10
  10. lamindb/core/_mapped_collection.py +4 -2
  11. lamindb/core/_settings.py +6 -6
  12. lamindb/core/_sync_git.py +1 -1
  13. lamindb/core/_track_environment.py +2 -1
  14. lamindb/core/datasets/_small.py +3 -3
  15. lamindb/core/loaders.py +22 -9
  16. lamindb/core/storage/_anndata_accessor.py +8 -3
  17. lamindb/core/storage/_backed_access.py +14 -7
  18. lamindb/core/storage/_pyarrow_dataset.py +24 -9
  19. lamindb/core/storage/_tiledbsoma.py +6 -4
  20. lamindb/core/storage/_zarr.py +32 -11
  21. lamindb/core/storage/objects.py +59 -26
  22. lamindb/core/storage/paths.py +16 -13
  23. lamindb/curators/__init__.py +173 -145
  24. lamindb/errors.py +1 -1
  25. lamindb/integrations/_vitessce.py +4 -4
  26. lamindb/migrations/0089_subsequent_runs.py +159 -0
  27. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  28. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  29. lamindb/models/__init__.py +79 -0
  30. lamindb/{core → models}/_describe.py +3 -3
  31. lamindb/{core → models}/_django.py +8 -5
  32. lamindb/{core → models}/_feature_manager.py +103 -87
  33. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  34. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  35. lamindb/{core → models}/_label_manager.py +10 -17
  36. lamindb/{core/relations.py → models/_relations.py} +8 -1
  37. lamindb/models/artifact.py +2601 -0
  38. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  39. lamindb/models/collection.py +683 -0
  40. lamindb/models/core.py +135 -0
  41. lamindb/models/feature.py +643 -0
  42. lamindb/models/flextable.py +163 -0
  43. lamindb/{_parents.py → models/has_parents.py} +55 -49
  44. lamindb/models/project.py +384 -0
  45. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  46. lamindb/{_query_set.py → models/query_set.py} +52 -30
  47. lamindb/models/record.py +1757 -0
  48. lamindb/models/run.py +563 -0
  49. lamindb/{_save.py → models/save.py} +18 -8
  50. lamindb/models/schema.py +732 -0
  51. lamindb/models/transform.py +360 -0
  52. lamindb/models/ulabel.py +249 -0
  53. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
  54. lamindb-1.2a2.dist-info/RECORD +94 -0
  55. lamindb/_artifact.py +0 -1361
  56. lamindb/_collection.py +0 -440
  57. lamindb/_feature.py +0 -316
  58. lamindb/_is_versioned.py +0 -40
  59. lamindb/_record.py +0 -1065
  60. lamindb/_run.py +0 -60
  61. lamindb/_schema.py +0 -347
  62. lamindb/_storage.py +0 -15
  63. lamindb/_transform.py +0 -170
  64. lamindb/_ulabel.py +0 -56
  65. lamindb/_utils.py +0 -9
  66. lamindb/base/validation.py +0 -63
  67. lamindb/core/_data.py +0 -491
  68. lamindb/core/fields.py +0 -12
  69. lamindb/models.py +0 -4435
  70. lamindb-1.1.0.dist-info/RECORD +0 -95
  71. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
  72. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
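Most of the churn above is a restructuring: top-level implementation modules such as `lamindb/_artifact.py` move into the new `lamindb/models/` subpackage (e.g., `lamindb/models/artifact.py`). Assuming the public namespace keeps re-exporting these classes (which the `lamindb/__init__.py` changes suggest, though this is not verified here), user code is unaffected; a hypothetical sketch:

    import lamindb as ln

    # public entry points stay the same; only internal module paths moved
    artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet")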
@@ -0,0 +1,2601 @@
+ # ruff: noqa: TC004
+ from __future__ import annotations
+
+ import os
+ import shutil
+ from collections import defaultdict
+ from pathlib import Path, PurePath, PurePosixPath
+ from typing import TYPE_CHECKING, Any, Union, overload
+
+ import fsspec
+ import lamindb_setup as ln_setup
+ import pandas as pd
+ from anndata import AnnData
+ from django.db import connections, models
+ from django.db.models import CASCADE, PROTECT, Q
+ from lamin_utils import colors, logger
+ from lamindb_setup import settings as setup_settings
+ from lamindb_setup._init_instance import register_storage_in_instance
+ from lamindb_setup.core._settings_storage import init_storage
+ from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file
+ from lamindb_setup.core.types import UPathStr
+ from lamindb_setup.core.upath import (
+     create_path,
+     extract_suffix_from_path,
+     get_stat_dir_cloud,
+     get_stat_file_cloud,
+ )
+
+ from lamindb.base import deprecated
+ from lamindb.base.fields import (
+     BigIntegerField,
+     BooleanField,
+     CharField,
+     ForeignKey,
+ )
+ from lamindb.errors import FieldValidationError
+ from lamindb.models.query_set import QuerySet
+
+ from ..base.users import current_user_id
+ from ..core.loaders import load_to_memory
+ from ..core.storage import (
+     LocalPathClasses,
+     UPath,
+     delete_storage,
+     infer_suffix,
+     write_to_disk,
+ )
+ from ..core.storage._anndata_accessor import _anndata_n_observations
+ from ..core.storage._pyarrow_dataset import PYARROW_SUFFIXES
+ from ..core.storage._tiledbsoma import _soma_n_observations
+ from ..core.storage.objects import is_package_installed
+ from ..core.storage.paths import (
+     AUTO_KEY_PREFIX,
+     auto_storage_key_from_artifact,
+     auto_storage_key_from_artifact_uid,
+     check_path_is_child_of_root,
+     filepath_cache_key_from_artifact,
+     filepath_from_artifact,
+ )
+ from ..errors import IntegrityError, InvalidArgument, ValidationError
+ from ..models._is_versioned import (
+     create_uid,
+     message_update_key_in_version_family,
+ )
+ from ._django import get_artifact_with_related
+ from ._feature_manager import (
+     FeatureManager,
+     ParamManager,
+     ParamManagerArtifact,
+     add_label_feature_links,
+     get_label_links,
+ )
+ from ._is_versioned import IsVersioned
+ from ._relations import (
+     dict_module_name_to_model_name,
+     dict_related_model_to_related_name,
+ )
+ from .core import Storage
+ from .feature import Feature, FeatureValue
+ from .has_parents import view_lineage
+ from .record import (
+     BasicRecord,
+     LinkORM,
+     Record,
+     _get_record_kwargs,
+     record_repr,
+ )
+ from .run import ParamValue, Run, TracksRun, TracksUpdates, User
+ from .schema import Schema
+ from .ulabel import ULabel
+
+ WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-run"
+
+ WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
+
+ try:
+     from ..core.storage._zarr import identify_zarr_type
+ except ImportError:
+
+     def identify_zarr_type(storepath):  # type: ignore
+         raise ImportError("Please install zarr: pip install zarr<=2.18.4")
+
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterable
+
+     from mudata import MuData  # noqa: TC004
+     from pyarrow.dataset import Dataset as PyArrowDataset
+     from spatialdata import SpatialData  # noqa: TC004
+     from tiledbsoma import Collection as SOMACollection
+     from tiledbsoma import Experiment as SOMAExperiment
+     from tiledbsoma import Measurement as SOMAMeasurement
+
+     from lamindb.base.types import StrField
+     from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
+
+     from ..base.types import (
+         ArtifactKind,
+     )
+     from ._label_manager import LabelManager
+     from .collection import Collection
+     from .project import Project, Reference
+     from .transform import Transform
+
+
+ INCONSISTENT_STATE_MSG = (
+     "Trying to read a folder artifact from an outdated version, "
+     "this can result in an inconsistent state.\n"
+     "Read from the latest version: artifact.versions.filter(is_latest=True).one()"
+ )
+
+
+ def process_pathlike(
+     filepath: UPath,
+     default_storage: Storage,
+     using_key: str | None,
+     skip_existence_check: bool = False,
+ ) -> tuple[Storage, bool]:
+     """Determines the appropriate storage for a given path and whether to use an existing storage key."""
+     if not skip_existence_check:
+         try:  # check if file exists
+             if not filepath.exists():
+                 raise FileNotFoundError(filepath)
+         except PermissionError:
+             pass
+     if check_path_is_child_of_root(filepath, default_storage.root):
+         use_existing_storage_key = True
+         return default_storage, use_existing_storage_key
+     else:
+         # check whether the path is part of one of the existing
+         # already-registered storage locations
+         result = False
+         # within the hub, we don't want to perform check_path_in_existing_storage
+         if using_key is None:
+             result = check_path_in_existing_storage(filepath, using_key)
+         if isinstance(result, Storage):
+             use_existing_storage_key = True
+             return result, use_existing_storage_key
+         else:
+             # if the path is in the cloud, we have a good candidate
+             # for the storage root: the bucket
+             if not isinstance(filepath, LocalPathClasses):
+                 # for a cloud path, new_root is always the bucket name
+                 if filepath.protocol == "hf":
+                     hf_path = filepath.fs.resolve_path(filepath.as_posix())
+                     hf_path.path_in_repo = ""
+                     new_root = "hf://" + hf_path.unresolve()
+                 else:
+                     if filepath.protocol == "s3":
+                         # check that endpoint_url didn't propagate here
+                         # as a part of the path string
+                         assert "?" not in filepath.path  # noqa: S101
+                     new_root = list(filepath.parents)[-1]
+                 # do not register remote storage locations on hub if the current instance
+                 # is not managed on the hub
+                 storage_settings, _ = init_storage(
+                     new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
+                 )
+                 storage_record = register_storage_in_instance(storage_settings)
+                 use_existing_storage_key = True
+                 return storage_record, use_existing_storage_key
+             # if the filepath is local
+             else:
+                 use_existing_storage_key = False
+                 # if the default storage is local we'll throw an error if the user
+                 # doesn't provide a key
+                 if default_storage.type == "local":
+                     return default_storage, use_existing_storage_key
+                 # if the default storage is in the cloud (the file is going to
+                 # be uploaded upon saving it), we treat the filepath as a cache
+                 else:
+                     return default_storage, use_existing_storage_key
+
+
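In summary, a path resolves to a storage location in this order: default storage first, then any already-registered location, then (for cloud paths) a freshly registered bucket; a local path outside every location falls through to a copy into default storage on `save()`. A minimal sketch of that decision order (helper name and roots are hypothetical, not the library API):

    def storage_branch(path: str, default_root: str, registered_roots: list[str]) -> str:
        # hypothetical illustration of process_pathlike's branch order
        if path.startswith(default_root):
            return "default storage, keep existing key"
        for root in registered_roots:
            if path.startswith(root):
                return "registered storage, keep existing key"
        if path.startswith(("s3://", "gs://", "hf://")):
            return "register the bucket as a new storage location"
        return "local path outside any storage: copy to default storage on save()"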
+ def process_data(
+     provisional_uid: str,
+     data: UPathStr | pd.DataFrame | AnnData,
+     format: str | None,
+     key: str | None,
+     default_storage: Storage,
+     using_key: str | None,
+     skip_existence_check: bool = False,
+     is_replace: bool = False,
+ ) -> tuple[Any, Path | UPath, str, Storage, bool]:
+     """Serialize a data object that's provided as file or in memory.
+
+     If not overwritten, data gets stored in default storage.
+     """
+     supported_data_types = [pd.DataFrame, AnnData]
+     if is_package_installed("mudata"):
+         from mudata import MuData
+
+         supported_data_types.append(MuData)
+     if is_package_installed("spatialdata"):
+         from spatialdata import SpatialData
+
+         supported_data_types.append(SpatialData)
+     supported_data_types = tuple(supported_data_types)  # type: ignore
+
+     if key is not None:
+         key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
+         # use suffix as the (adata) format if the format is not provided
+         if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
+             format = key_suffix[1:]
+     else:
+         key_suffix = None
+     if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
+         access_token = (
+             default_storage._access_token
+             if hasattr(default_storage, "_access_token")
+             else None
+         )
+         path = create_path(data, access_token=access_token)
+         # we don't resolve http links because they can resolve into a different domain
+         # for example into a temporary url
+         if path.protocol not in {"http", "https"}:
+             path = path.resolve()
+         storage, use_existing_storage_key = process_pathlike(
+             path,
+             default_storage=default_storage,
+             using_key=using_key,
+             skip_existence_check=skip_existence_check,
+         )
+         suffix = extract_suffix_from_path(path)
+         memory_rep = None
+     elif isinstance(data, supported_data_types):
+         storage = default_storage
+         memory_rep = data
+         suffix = infer_suffix(data, format)
+     else:
+         raise NotImplementedError(
+             f"Do not know how to create an artifact object from {data}, pass a path instead!"
+         )
+     if key_suffix is not None and key_suffix != suffix and not is_replace:
+         # consciously omitting a trailing period
+         if isinstance(data, (str, Path, UPath)):
+             message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
+         else:
+             message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
+         raise InvalidArgument(message)
+     # in case we have an in-memory representation, we need to write it to disk
+     from lamindb import settings
+
+     if isinstance(data, supported_data_types):
+         path = settings.cache_dir / f"{provisional_uid}{suffix}"
+         write_to_disk(data, path)
+         use_existing_storage_key = False
+     return memory_rep, path, suffix, storage, use_existing_storage_key
+
+
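The two branches above correspond to the path-based and the in-memory ingestion routes of the public API; a minimal sketch (file names hypothetical), using the constructors documented on the Artifact class below:

    import lamindb as ln
    import pandas as pd

    # path branch: the suffix is read off the file, content is registered or copied
    artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()

    # in-memory branch: the object is first serialized into the cache directory
    df = pd.DataFrame({"a": [1, 2]})
    artifact = ln.Artifact.from_df(df, key="examples/my_df.parquet").save()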
+ def get_stat_or_artifact(
+     path: UPath,
+     key: str | None = None,
+     check_hash: bool = True,
+     is_replace: bool = False,
+     instance: str | None = None,
+ ) -> Union[tuple[int, str | None, str | None, int | None, Artifact | None], Artifact]:
+     """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
+     n_files = None
+     from lamindb import settings
+
+     if settings.creation.artifact_skip_size_hash:
+         return None, None, None, n_files, None
+     stat = path.stat()  # one network request
+     if not isinstance(path, LocalPathClasses):
+         size, hash, hash_type = None, None, None
+         if stat is not None:
+             # convert UPathStatResult to fsspec info dict
+             stat = stat.as_info()
+             if (store_type := stat["type"]) == "file":
+                 size, hash, hash_type = get_stat_file_cloud(stat)
+             elif store_type == "directory":
+                 size, hash, hash_type, n_files = get_stat_dir_cloud(path)
+         if hash is None:
+             logger.warning(f"did not add hash for {path}")
+             return size, hash, hash_type, n_files, None
+     else:
+         if path.is_dir():
+             size, hash, hash_type, n_files = hash_dir(path)
+         else:
+             hash, hash_type = hash_file(path)
+             size = stat.st_size
+     if not check_hash:
+         return size, hash, hash_type, n_files, None
+     previous_artifact_version = None
+     if key is None or is_replace:
+         result = Artifact.objects.using(instance).filter(hash=hash).all()
+         artifact_with_same_hash_exists = len(result) > 0
+     else:
+         storage_id = settings.storage.id
+         result = (
+             Artifact.objects.using(instance)
+             .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
+             .order_by("-created_at")
+             .all()
+         )
+         artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
+         if not artifact_with_same_hash_exists and len(result) > 0:
+             logger.important(
+                 f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
+             )
+             previous_artifact_version = result[0]
+     if artifact_with_same_hash_exists:
+         message = "returning existing artifact with same hash"
+         if result[0]._branch_code == -1:
+             result[0].restore()
+             message = "restored artifact with same hash from trash"
+         logger.important(
+             f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
+         )
+         return result[0]
+     else:
+         return size, hash, hash_type, n_files, previous_artifact_version
+
+
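Because the lookup keys on content hash before anything is uploaded, re-registering identical content is idempotent; a short sketch of the observable behavior (path hypothetical):

    import lamindb as ln

    a1 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
    a2 = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet")
    assert a2.uid == a1.uid  # the existing artifact is returned, not duplicated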
+ def check_path_in_existing_storage(
+     path: Path | UPath, using_key: str | None = None
+ ) -> Storage | bool:
+     for storage in Storage.objects.using(using_key).filter().all():
+         # if path is part of storage, return it
+         if check_path_is_child_of_root(path, root=storage.root):
+             return storage
+     return False
+
+
+ def get_relative_path_to_directory(
+     path: PurePath | Path | UPath, directory: PurePath | Path | UPath
+ ) -> PurePath | Path:
+     if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
+         # UPath.relative_to() is not behaving as it should (2023-04-07)
+         # need to lstrip otherwise inconsistent behavior across trailing slashes
+         # see test_artifact.py: test_get_relative_path_to_directory
+         relpath = PurePath(
+             path.as_posix().replace(directory.as_posix(), "").lstrip("/")
+         )
+     elif isinstance(directory, Path):
+         relpath = path.resolve().relative_to(directory.resolve())  # type: ignore
+     elif isinstance(directory, PurePath):
+         relpath = path.relative_to(directory)
+     else:
+         raise TypeError("Directory not of type Path or UPath")
+     return relpath
+
+
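A quick illustration of the expected behavior for the plain `PurePath` branch:

    from pathlib import PurePath

    relpath = get_relative_path_to_directory(
        PurePath("bucket/folder/file.csv"), directory=PurePath("bucket")
    )
    assert relpath == PurePath("folder/file.csv")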
+ def get_artifact_kwargs_from_data(
+     *,
+     data: Path | UPath | str | pd.DataFrame | AnnData | MuData,
+     key: str | None,
+     run: Run | None,
+     format: str | None,
+     provisional_uid: str,
+     version: str | None,
+     default_storage: Storage,
+     using_key: str | None = None,
+     is_replace: bool = False,
+     skip_check_exists: bool = False,
+ ):
+     from lamindb import settings
+
+     run = get_run(run)
+     memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
+         provisional_uid,
+         data,
+         format,
+         key,
+         default_storage,
+         using_key,
+         skip_check_exists,
+         is_replace=is_replace,
+     )
+     stat_or_artifact = get_stat_or_artifact(
+         path=path,
+         key=key,
+         instance=using_key,
+         is_replace=is_replace,
+     )
+     if isinstance(stat_or_artifact, Artifact):
+         existing_artifact = stat_or_artifact
+         if run is not None:
+             existing_artifact._populate_subsequent_runs(run)
+         return existing_artifact, None
+     else:
+         size, hash, hash_type, n_files, revises = stat_or_artifact
+
+     if revises is not None:  # update provisional_uid
+         provisional_uid, revises = create_uid(revises=revises, version=version)
+         if settings.cache_dir in path.parents:
+             path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
+
+     check_path_in_storage = False
+     if use_existing_storage_key:
+         inferred_key = get_relative_path_to_directory(
+             path=path, directory=UPath(storage.root)
+         ).as_posix()
+         if key is None:
+             key = inferred_key
+         else:
+             if not key == inferred_key:
+                 raise InvalidArgument(
+                     f"The path '{data}' is already in registered storage"
+                     f" '{storage.root}' with key '{inferred_key}'\nYou passed"
+                     f" conflicting key '{key}': please move the file before"
+                     " registering it."
+                 )
+         check_path_in_storage = True
+     else:
+         storage = default_storage
+
+     log_storage_hint(
+         check_path_in_storage=check_path_in_storage,
+         storage=storage,
+         key=key,
+         uid=provisional_uid,
+         suffix=suffix,
+         is_dir=n_files is not None,
+     )
+
+     # do we use a virtual or an actual storage key?
+     key_is_virtual = settings.creation._artifact_use_virtual_keys
+
+     # if the file is already in storage, independent of the default
+     # we use an actual storage key
+     if check_path_in_storage:
+         key_is_virtual = False
+
+     kwargs = {
+         "uid": provisional_uid,
+         "suffix": suffix,
+         "hash": hash,
+         "_hash_type": hash_type,
+         "key": key,
+         "size": size,
+         "storage_id": storage.id,
+         # passing both the id and the object
+         # to make them both available immediately
+         # after object creation
+         "n_files": n_files,
+         "_overwrite_versions": n_files is not None,  # True for folder, False for file
+         "n_observations": None,  # to implement
+         "run_id": run.id if run is not None else None,
+         "run": run,
+         "_key_is_virtual": key_is_virtual,
+         "revises": revises,
+     }
+     if not isinstance(path, LocalPathClasses):
+         local_filepath = None
+         cloud_filepath = path
+     else:
+         local_filepath = path
+         cloud_filepath = None
+     privates = {
+         "local_filepath": local_filepath,
+         "cloud_filepath": cloud_filepath,
+         "memory_rep": memory_rep,
+         "check_path_in_storage": check_path_in_storage,
+     }
+     return kwargs, privates
+
+
+ def log_storage_hint(
+     *,
+     check_path_in_storage: bool,
+     storage: Storage | None,
+     key: str | None,
+     uid: str,
+     suffix: str,
+     is_dir: bool,
+ ) -> None:
+     hint = ""
+     if check_path_in_storage:
+         display_root = storage.root  # type: ignore
+         # check whether path is local
+         if fsspec.utils.get_protocol(storage.root) == "file":  # type: ignore
+             # if it's a local path, check whether it's in the current working directory
+             root_path = Path(storage.root)  # type: ignore
+             if check_path_is_child_of_root(root_path, Path.cwd()):
+                 # only display the relative path, not the fully resolved path
+                 display_root = root_path.relative_to(Path.cwd())  # type: ignore
+         hint += f"path in storage '{display_root}'"  # type: ignore
+     else:
+         hint += "path content will be copied to default storage upon `save()`"
+     if key is None:
+         storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
+         hint += f" with key `None` ('{storage_key}')"
+     else:
+         hint += f" with key '{key}'"
+     logger.hint(hint)
+
+
+ def data_is_anndata(data: AnnData | UPathStr) -> bool:
+     if isinstance(data, AnnData):
+         return True
+     if isinstance(data, (str, Path, UPath)):
+         data_path = UPath(data)
+         if ".h5ad" in data_path.suffixes:  # ".h5ad.gz" is a valid suffix
+             return True
+         elif data_path.suffix == ".zarr":
+             # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
+             # TODO: the suffix based check should likely be moved to identify_zarr_type
+             if ".anndata" in data_path.suffixes:
+                 return True
+             # check only for local, expensive for cloud
+             if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
+                 return identify_zarr_type(data_path) == "anndata"
+             else:
+                 logger.warning("We do not check if cloud zarr is AnnData or not")
+                 return False
+     return False
+
+
+ def data_is_mudata(data: MuData | UPathStr) -> bool:
+     if is_package_installed("mudata"):
+         from mudata import MuData
+
+         if isinstance(data, MuData):
+             return True
+     if isinstance(data, (str, Path)):
+         return UPath(data).suffix == ".h5mu"
+     return False
+
+
+ def data_is_spatialdata(data: SpatialData | UPathStr) -> bool:
+     if is_package_installed("spatialdata"):
+         from spatialdata import SpatialData
+
+         if isinstance(data, SpatialData):
+             return True
+     if isinstance(data, (str, Path)):
+         if UPath(data).suffix == ".zarr":
+             # TODO: inconsistent with anndata, where we run the storage
+             # check only for local, expensive for cloud
+             return identify_zarr_type(data, check=False) == "spatialdata"
+     return False
+
+
+ def _check_otype_artifact(
+     data: UPathStr | pd.DataFrame | AnnData | MuData | SpatialData,
+     otype: str | None = None,
+ ) -> str:
+     if otype is None:
+         if isinstance(data, pd.DataFrame):
+             logger.warning("data is a DataFrame, please use .from_df()")
+             otype = "DataFrame"
+             return otype
+
+         data_is_path = isinstance(data, (str, Path))
+         if data_is_anndata(data):
+             if not data_is_path:
+                 logger.warning("data is an AnnData, please use .from_anndata()")
+             otype = "AnnData"
+         elif data_is_mudata(data):
+             if not data_is_path:
+                 logger.warning("data is a MuData, please use .from_mudata()")
+             otype = "MuData"
+         elif data_is_spatialdata(data):
+             if not data_is_path:
+                 logger.warning("data is a SpatialData, please use .from_spatialdata()")
+             otype = "SpatialData"
+         elif not data_is_path:  # UPath is a subclass of Path
+             raise TypeError("data has to be a string, Path, or UPath")
+     return otype
+
+
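Taken together, the helpers above infer `otype` from suffixes (and, for local zarr stores, from the store contents); a few hedged examples of what the suffix checks alone return (paths hypothetical and need not exist):

    assert data_is_anndata("data/pbmc.h5ad")
    assert data_is_anndata("data/pbmc.anndata.zarr")  # recognized without opening the store
    assert data_is_mudata("data/pbmc.h5mu")
    assert not data_is_anndata("data/table.parquet")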
+ def _populate_subsequent_runs_(record: Union[Artifact, Collection], run: Run):
+     if record.run is None:
+         record.run = run
+     elif record.run != run:
+         record._subsequent_runs.add(run)
+
+
+ # also see current_run() in core._data
+ def get_run(run: Run | None) -> Run | None:
+     from lamindb import settings
+
+     from .._tracked import get_current_tracked_run
+     from ..core._context import context
+
+     if run is None:
+         run = get_current_tracked_run()
+         if run is None:
+             run = context.run
+         if run is None and not settings.creation.artifact_silence_missing_run_warning:
+             # here we check that this is not a read-only connection
+             # normally for our connection strings the read-only role name has _read in it
+             # not absolutely safe but the worst case is that the warning is not shown
+             instance = setup_settings.instance
+             if instance.dialect != "postgresql" or "_read" not in instance.db:
+                 logger.warning(WARNING_RUN_TRANSFORM)
+     # suppress run by passing False
+     elif not run:
+         run = None
+     return run
+
+
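`get_run` resolves a run in this order: an explicitly passed `run`, then the currently tracked run (from the `tracked()` decorator), then the global `ln.track()` context; passing `False` suppresses linking, per the `elif not run` branch. A minimal sketch (path hypothetical):

    import lamindb as ln

    ln.track()  # establishes context.run
    artifact = ln.Artifact("./my_file.parquet", key="examples/my_file.parquet").save()
    assert artifact.run is not None  # picked up from the context

    # suppress run linking explicitly
    unlinked = ln.Artifact("./my_file.parquet", key="examples/other.parquet", run=False)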
+ def save_staged_feature_sets(self: Artifact) -> None:
+     if hasattr(self, "_staged_feature_sets"):
+         from lamindb.models._feature_manager import get_schema_by_slot_
+
+         existing_staged_feature_sets = get_schema_by_slot_(self)
+         saved_staged_feature_sets = {}
+         for key, schema in self._staged_feature_sets.items():
+             if isinstance(schema, Schema) and schema._state.adding:
+                 schema.save()
+                 saved_staged_feature_sets[key] = schema
+             if key in existing_staged_feature_sets:
+                 # remove existing feature set on the same slot
+                 self.feature_sets.remove(existing_staged_feature_sets[key])
+         if len(saved_staged_feature_sets) > 0:
+             s = "s" if len(saved_staged_feature_sets) > 1 else ""
+             display_schema_keys = ",".join(
+                 f"'{key}'" for key in saved_staged_feature_sets.keys()
+             )
+             logger.save(
+                 f"saved {len(saved_staged_feature_sets)} feature set{s} for slot{s}:"
+                 f" {display_schema_keys}"
+             )
+
+
+ def save_schema_links(self: Artifact) -> None:
+     from lamindb.models.save import bulk_create
+
+     if hasattr(self, "_staged_feature_sets"):
+         links = []
+         for slot, schema in self._staged_feature_sets.items():
+             kwargs = {
+                 "artifact_id": self.id,
+                 "schema_id": schema.id,
+                 "slot": slot,
+             }
+             links.append(Artifact.feature_sets.through(**kwargs))
+         bulk_create(links, ignore_conflicts=True)
+
+
+ # can restore later if needed
+ # def format_provenance(self, fk_data, print_types):
+ #     type_str = lambda attr: (
+ #         f": {get_related_model(self.__class__, attr).__name__}" if print_types else ""
+ #     )
+
+ #     return "".join(
+ #         [
+ #             f" .{field_name}{type_str(field_name)} = {format_field_value(value.get('name'))}\n"
+ #             for field_name, value in fk_data.items()
+ #             if value.get("name")
+ #         ]
+ #     )
+
+ # can restore later if needed
+ # def format_input_of_runs(self, print_types):
+ #     if self.id is not None and self.input_of_runs.exists():
+ #         values = [format_field_value(i.started_at) for i in self.input_of_runs.all()]
+ #         type_str = ": Run" if print_types else ""  # type: ignore
+ #         return f" .input_of_runs{type_str} = {', '.join(values)}\n"
+ #     return ""
+
+
+ def _describe_postgres(self):  # for Artifact & Collection
+     from ._describe import describe_general
+     from ._feature_manager import describe_features
+
+     model_name = self.__class__.__name__
+     msg = f"{colors.green(model_name)}{record_repr(self, include_foreign_keys=False).lstrip(model_name)}\n"
+     if self._state.db is not None and self._state.db != "default":
+         msg += f" {colors.italic('Database instance')}\n"
+         msg += f" slug: {self._state.db}\n"
+
+     if model_name == "Artifact":
+         result = get_artifact_with_related(
+             self,
+             include_feature_link=True,
+             include_fk=True,
+             include_m2m=True,
+             include_schema=True,
+         )
+     else:
+         result = get_artifact_with_related(self, include_fk=True, include_m2m=True)
+     related_data = result.get("related_data", {})
+     # TODO: fk_data = related_data.get("fk", {})
+
+     tree = describe_general(self)
+     if model_name == "Artifact":
+         return describe_features(
+             self,
+             tree=tree,
+             related_data=related_data,
+             with_labels=True,
+             print_params=hasattr(self, "kind") and self.kind == "model",
+         )
+     else:
+         return tree
+
+
+ def _describe_sqlite(self, print_types: bool = False):  # for artifact & collection
+     from ._describe import describe_general
+     from ._feature_manager import describe_features
+     from .collection import Collection
+
+     model_name = self.__class__.__name__
+     msg = f"{colors.green(model_name)}{record_repr(self, include_foreign_keys=False).lstrip(model_name)}\n"
+     if self._state.db is not None and self._state.db != "default":
+         msg += f" {colors.italic('Database instance')}\n"
+         msg += f" slug: {self._state.db}\n"
+
+     fields = self._meta.fields
+     direct_fields = []
+     foreign_key_fields = []
+     for f in fields:
+         if f.is_relation:
+             foreign_key_fields.append(f.name)
+         else:
+             direct_fields.append(f.name)
+     if not self._state.adding:
+         # prefetch foreign key relationships
+         self = (
+             self.__class__.objects.using(self._state.db)
+             .select_related(*foreign_key_fields)
+             .get(id=self.id)
+         )
+         # prefetch m-2-m relationships
+         many_to_many_fields = []
+         if isinstance(self, (Collection, Artifact)):
+             many_to_many_fields.append("input_of_runs")
+         if isinstance(self, Artifact):
+             many_to_many_fields.append("feature_sets")
+         self = (
+             self.__class__.objects.using(self._state.db)
+             .prefetch_related(*many_to_many_fields)
+             .get(id=self.id)
+         )
+     tree = describe_general(self)
+     if model_name == "Artifact":
+         return describe_features(
+             self,
+             tree=tree,
+             with_labels=True,
+             print_params=hasattr(self, "kind") and self.kind == "model",
+         )
+     else:
+         return tree
+
+
+ def describe_artifact_collection(self):  # for artifact & collection
+     from ._describe import print_rich_tree
+
+     if not self._state.adding and connections[self._state.db].vendor == "postgresql":
+         tree = _describe_postgres(self)
+     else:
+         tree = _describe_sqlite(self)
+
+     print_rich_tree(tree)
+
+
+ def validate_feature(feature: Feature, records: list[Record]) -> None:
+     """Validate feature record, adjust feature.dtype based on labels records."""
+     if not isinstance(feature, Feature):
+         raise TypeError("feature has to be of type Feature")
+     if feature._state.adding:
+         registries = {record.__class__.__get_name_with_module__() for record in records}
+         registries_str = "|".join(registries)
+         msg = f"ln.Feature(name='{feature.name}', type='cat[{registries_str}]').save()"
+         raise ValidationError(f"Feature not validated. If it looks correct: {msg}")
+
+
+ def get_labels(
+     self,
+     feature: Feature,
+     mute: bool = False,
+     flat_names: bool = False,
+ ) -> QuerySet | dict[str, QuerySet] | list:
+     """{}"""  # noqa: D415
+     if not isinstance(feature, Feature):
+         raise TypeError("feature has to be of type Feature")
+     if feature.dtype is None or not feature.dtype.startswith("cat["):
+         raise ValueError("feature does not have linked labels")
+     registries_to_check = feature.dtype.replace("cat[", "").rstrip("]").split("|")
+     if len(registries_to_check) > 1 and not mute:
+         logger.warning("labels come from multiple registries!")
+     # return an empty query set if self.id is still None
+     if self.id is None:
+         return QuerySet(self.__class__)
+     qs_by_registry = {}
+     for registry in registries_to_check:
+         # currently need to distinguish between ULabel and non-ULabel, because
+         # we only have the feature information for Label
+         if registry == "ULabel":
+             links_to_labels = get_label_links(self, registry, feature)
+             label_ids = [link.ulabel_id for link in links_to_labels]
+             qs_by_registry[registry] = ULabel.objects.using(self._state.db).filter(
+                 id__in=label_ids
+             )
+         elif registry in self.features._accessor_by_registry:
+             qs_by_registry[registry] = getattr(
+                 self, self.features._accessor_by_registry[registry]
+             ).all()
+     if flat_names:
+         # returns a flat list of names
+         from .record import get_name_field
+
+         values = []
+         for v in qs_by_registry.values():
+             values += v.list(get_name_field(v))
+         return values
+     if len(registries_to_check) == 1 and registry in qs_by_registry:
+         return qs_by_registry[registry]
+     else:
+         return qs_by_registry
+
+
+ def add_labels(
+     self,
+     records: Record | list[Record] | QuerySet | Iterable,
+     feature: Feature | None = None,
+     *,
+     field: StrField | None = None,
+     feature_ref_is_name: bool | None = None,
+     label_ref_is_name: bool | None = None,
+     from_curator: bool = False,
+ ) -> None:
+     """{}"""  # noqa: D415
+     if self._state.adding:
+         raise ValueError("Please save the artifact/collection before adding a label!")
+
+     if isinstance(records, (QuerySet, QuerySet.__base__)):  # need to have both
+         records = records.list()
+     if isinstance(records, (str, Record)):
+         records = [records]
+     if not isinstance(records, list):  # avoids warning for pd Series
+         records = list(records)
+     # create records from values
+     if len(records) == 0:
+         return None
+     if isinstance(records[0], str):  # type: ignore
+         records_validated = []
+         # feature is needed if we want to create records from values
+         if feature is None:
+             raise ValueError(
+                 "Please pass a feature, e.g., via: label = ln.ULabel(name='my_label',"
+                 " feature=ln.Feature(name='my_feature'))"
+             )
+         if feature.dtype.startswith("cat["):
+             orm_dict = dict_module_name_to_model_name(Artifact)
+             for reg in feature.dtype.replace("cat[", "").rstrip("]").split("|"):
+                 registry = orm_dict.get(reg)
+                 records_validated += registry.from_values(records, field=field)
+
+         # feature doesn't have registries and therefore can't create records from values
+         # ask users to pass records
+         if len(records_validated) == 0:
+             raise ValueError(
+                 "Please pass a record (a `Record` object), not a string, e.g., via:"
+                 " label"
+                 f" = ln.ULabel(name='{records[0]}')"  # type: ignore
+             )
+         records = records_validated
+
+     for record in records:
+         if record._state.adding:
+             raise ValidationError(
+                 f"{record} not validated. If it looks correct: record.save()"
+             )
+
+     if feature is None:
+         d = dict_related_model_to_related_name(self.__class__)
+         # strategy: group records by registry to reduce number of transactions
+         records_by_related_name: dict = {}
+         for record in records:
+             related_name = d.get(record.__class__.__get_name_with_module__())
+             if related_name is None:
+                 raise ValueError(f"Can't add labels to {record.__class__} record!")
+             if related_name not in records_by_related_name:
+                 records_by_related_name[related_name] = []
+             records_by_related_name[related_name].append(record)
+         for related_name, records in records_by_related_name.items():
+             getattr(self, related_name).add(*records)
+     else:
+         validate_feature(feature, records)  # type:ignore
+         records_by_registry = defaultdict(list)
+         feature_sets = self.feature_sets.filter(itype="Feature").all()
+         internal_features = set()  # type: ignore
+         if len(feature_sets) > 0:
+             for schema in feature_sets:
+                 internal_features = internal_features.union(
+                     set(schema.members.values_list("name", flat=True))
+                 )  # type: ignore
+         for record in records:
+             records_by_registry[record.__class__.__get_name_with_module__()].append(
+                 record
+             )
+         for registry_name, records in records_by_registry.items():
+             if not from_curator and feature.name in internal_features:
+                 raise ValidationError(
+                     "Cannot manually annotate internal feature with label. Please use ln.Curator"
+                 )
+             if registry_name not in feature.dtype:
+                 if not feature.dtype.startswith("cat"):
+                     raise ValidationError(
+                         f"Feature {feature.name} needs dtype='cat' for label annotation, currently has dtype='{feature.dtype}'"
+                     )
+                 if feature.dtype == "cat":
+                     feature.dtype = f"cat[{registry_name}]"  # type: ignore
+                     feature.save()
+                 elif registry_name not in feature.dtype:
+                     new_dtype = feature.dtype.rstrip("]") + f"|{registry_name}]"
+                     raise ValidationError(
+                         f"Label type {registry_name} is not valid for Feature(name='{feature.name}', dtype='{feature.dtype}'), consider updating to dtype='{new_dtype}'"
+                     )
+
+             if registry_name not in self.features._accessor_by_registry:
+                 logger.warning(f"skipping {registry_name}")
+                 continue
+             if len(records) == 0:
+                 continue
+             features_labels = {
+                 registry_name: [(feature, label_record) for label_record in records]
+             }
+             add_label_feature_links(
+                 self.features,
+                 features_labels,
+                 feature_ref_is_name=feature_ref_is_name,
+                 label_ref_is_name=label_ref_is_name,
+             )
+
+
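A minimal usage sketch of feature-scoped labeling (record names hypothetical), consistent with the `labels` accessor documented on the class below:

    import lamindb as ln

    study = ln.Feature(name="study", dtype="cat").save()
    candidate_marker_study = ln.ULabel(name="Candidate marker study").save()
    artifact.labels.add(candidate_marker_study, feature=study)
    # equivalent annotation via the feature manager:
    artifact.features.add_values({"study": candidate_marker_study})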
944
+ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
945
+ # Note that this docstring has to be consistent with Curator.save_artifact()
946
+ """Datasets & models stored as files, folders, or arrays.
947
+
948
+ Artifacts manage data in local or remote storage.
949
+
950
+ Some artifacts are array-like, e.g., when stored as `.parquet`, `.h5ad`,
951
+ `.zarr`, or `.tiledb`.
952
+
953
+ Args:
954
+ data: `UPathStr` A path to a local or remote folder or file.
955
+ kind: `Literal["dataset", "model"] | None = None` Distinguish models from datasets from other files & folders.
956
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
957
+ description: `str | None = None` A description.
958
+ revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
959
+ run: `Run | None = None` The run that creates the artifact.
960
+
961
+ .. dropdown:: Typical storage formats & their API accessors
962
+
963
+ Arrays:
964
+
965
+ - Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
966
+ - Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
967
+ - Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
968
+
969
+ Non-arrays:
970
+
971
+ - Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
972
+ - Fastq: `.fastq` ⟷ /
973
+ - VCF: `.vcf` ⟷ /
974
+ - QC: `.html` ⟷ /
975
+
976
+ You'll find these values in the `suffix` & `accessor` fields.
977
+
978
+ LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).
979
+
980
+ See Also:
981
+ :class:`~lamindb.Storage`
982
+ Storage locations for artifacts.
983
+ :class:`~lamindb.Collection`
984
+ Collections of artifacts.
985
+ :meth:`~lamindb.Artifact.from_df`
986
+ Create an artifact from a `DataFrame`.
987
+ :meth:`~lamindb.Artifact.from_anndata`
988
+ Create an artifact from an `AnnData`.
989
+
990
+ Examples:
991
+
992
+ Create an artifact by passing `key`:
993
+
994
+ >>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
995
+ >>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
996
+
997
+ Calling `.save()` uploads the file to the default storage location of your lamindb instance.
998
+ (If it's a local instance, the "upload" is a mere copy operation.)
999
+
1000
+ If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:
1001
+
1002
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
1003
+
1004
+ You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`
1005
+
1006
+ >>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
1007
+ >>> artifact_v2.versions.df() # see all versions
1008
+
1009
+ .. dropdown:: Why does the API look this way?
1010
+
1011
+ It's inspired by APIs building on AWS S3.
1012
+
1013
+ Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
1014
+
1015
+ In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
1016
+
1017
+ # signature: S3.Bucket.upload_file(filepath, key)
1018
+ import boto3
1019
+ s3 = boto3.resource('s3')
1020
+ bucket = s3.Bucket('mybucket')
1021
+ bucket.upload_file('/tmp/hello.txt', 'hello.txt')
1022
+
1023
+ In `quilt3 <https://docs.quiltdata.com/api-reference/bucket>`__::
1024
+
1025
+ # signature: quilt3.Bucket.put_file(key, filepath)
1026
+ import quilt3
1027
+ bucket = quilt3.Bucket('mybucket')
1028
+ bucket.put_file('hello.txt', '/tmp/hello.txt')
1029
+
1030
+ Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:
1031
+
1032
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
1033
+ >>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()
1034
+
1035
+ Because you can then not use `key`-based versioning you have to pass `revises` to make a new artifact version:
1036
+
1037
+ >>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()
1038
+
1039
+ If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
1040
+ the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
1041
+ detects the duplication and will return the existing artifact.
1042
+
1043
+ """
1044
+
1045
+ class Meta(Record.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
1046
+ abstract = False
1047
+
1048
+ _len_full_uid: int = 20
1049
+ _len_stem_uid: int = 16
1050
+
1051
+ params: ParamManager = ParamManagerArtifact # type: ignore
1052
+ """Param manager.
1053
+
1054
+ Example::
1055
+
1056
+ artifact.params.add_values({
1057
+ "hidden_size": 32,
1058
+ "bottleneck_size": 16,
1059
+ "batch_size": 32,
1060
+ "preprocess_params": {
1061
+ "normalization_type": "cool",
1062
+ "subset_highlyvariable": True,
1063
+ },
1064
+ })
1065
+ """
1066
+
1067
+ features: FeatureManager = FeatureManager # type: ignore
1068
+ """Feature manager.
1069
+
1070
+ Features denote dataset dimensions, i.e., the variables that measure labels & numbers.
1071
+
1072
+ Annotate with features & values::
1073
+
1074
+ artifact.features.add_values({
1075
+ "species": organism, # here, organism is an Organism record
1076
+ "scientist": ['Barbara McClintock', 'Edgar Anderson'],
1077
+ "temperature": 27.6,
1078
+ "study": "Candidate marker study"
1079
+ })
1080
+
1081
+ Query for features & values::
1082
+
1083
+ ln.Artifact.features.filter(scientist="Barbara McClintock")
1084
+
1085
+ Features may or may not be part of the artifact content in storage. For
1086
+ instance, the :class:`~lamindb.Curator` flow validates the columns of a
1087
+ `DataFrame`-like artifact and annotates it with features corresponding to
1088
+ these columns. `artifact.features.add_values`, by contrast, does not
1089
+ validate the content of the artifact.
1090
+ """
1091
+
1092
+ @property
1093
+ def labels(self) -> LabelManager:
1094
+ """Label manager.
1095
+
1096
+ To annotate with labels, you typically use the registry-specific accessors,
1097
+ for instance :attr:`~lamindb.Artifact.ulabels`::
1098
+
1099
+ candidate_marker_study = ln.ULabel(name="Candidate marker study").save()
1100
+ artifact.ulabels.add(candidate_marker_study)
1101
+
1102
+ Similarly, you query based on these accessors::
1103
+
1104
+ ln.Artifact.filter(ulabels__name="Candidate marker study").all()
1105
+
1106
+ Unlike the registry-specific accessors, the `.labels` accessor provides
1107
+ a way of associating labels with features::
1108
+
1109
+ study = ln.Feature(name="study", dtype="cat").save()
1110
+ artifact.labels.add(candidate_marker_study, feature=study)
1111
+
1112
+ Note that the above is equivalent to::
1113
+
1114
+ artifact.features.add_values({"study": candidate_marker_study})
1115
+ """
1116
+ from ._label_manager import LabelManager
1117
+
1118
+ return LabelManager(self)
1119
+
1120
+ id: int = models.AutoField(primary_key=True)
1121
+ """Internal id, valid only in one DB instance."""
1122
+ uid: str = CharField(
1123
+ editable=False, unique=True, db_index=True, max_length=_len_full_uid
1124
+ )
1125
+ """A universal random id."""
1126
+ key: str | None = CharField(db_index=True, null=True)
1127
+ """A (virtual) relative file path within the artifact's storage location.
1128
+
1129
+ Setting a `key` is useful to automatically group artifacts into a version family.
1130
+
1131
+ LaminDB defaults to a virtual file path to make renaming of data in object storage easy.
1132
+
1133
+ If you register existing files in a storage location, the `key` equals the
1134
+ actual filepath on the underyling filesytem or object store.
1135
+ """
1136
+ description: str | None = CharField(db_index=True, null=True)
1137
+ """A description."""
1138
+ storage: Storage = ForeignKey(
1139
+ Storage, PROTECT, related_name="artifacts", editable=False
1140
+ )
1141
+ """Storage location, e.g. an S3 or GCP bucket or a local directory."""
1142
+ suffix: str = CharField(max_length=30, db_index=True, editable=False)
1143
+ # Initially, we thought about having this be nullable to indicate folders
1144
+ # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
1145
+ """Path suffix or empty string if no canonical suffix exists.
1146
+
1147
+ This is either a file suffix (`".csv"`, `".h5ad"`, etc.) or the empty string "".
1148
+ """
1149
+ kind: ArtifactKind | None = CharField(
1150
+ max_length=20,
1151
+ db_index=True,
1152
+ null=True,
1153
+ )
1154
+ """:class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
1155
+ otype: str | None = CharField(
1156
+ max_length=64, db_index=True, null=True, editable=False
1157
+ )
1158
+ """Default Python object type, e.g., DataFrame, AnnData."""
1159
+ size: int | None = BigIntegerField(
1160
+ null=True, db_index=True, default=None, editable=False
1161
+ )
1162
+ """Size in bytes.
1163
+
1164
+ Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
1165
+ """
1166
+ hash: str | None = CharField(
1167
+ max_length=HASH_LENGTH, db_index=True, null=True, unique=True, editable=False
1168
+ )
1169
+ """Hash or pseudo-hash of artifact content.
1170
+
1171
+ Useful to ascertain integrity and avoid duplication.
1172
+ """
1173
+ n_files: int | None = BigIntegerField(
1174
+ null=True, db_index=True, default=None, editable=False
1175
+ )
1176
+ """Number of files for folder-like artifacts, `None` for file-like artifacts.
1177
+
1178
+ Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
1179
+
1180
+ .. versionchanged:: 1.0
1181
+ Renamed from `n_objects` to `n_files`.
1182
+ """
1183
+ n_observations: int | None = BigIntegerField(
1184
+ null=True, db_index=True, default=None, editable=False
1185
+ )
1186
+ """Number of observations.
1187
+
1188
+ Typically, this denotes the first array dimension.
1189
+ """
1190
+ _hash_type: str | None = CharField(
1191
+ max_length=30, db_index=True, null=True, editable=False
1192
+ )
1193
+ """Type of hash."""
1194
+ ulabels: ULabel = models.ManyToManyField(
1195
+ ULabel, through="ArtifactULabel", related_name="artifacts"
1196
+ )
1197
+ """The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
1198
+ run: Run | None = ForeignKey(
1199
+ Run,
1200
+ PROTECT,
1201
+ related_name="output_artifacts",
1202
+ null=True,
1203
+ default=None,
1204
+ editable=False,
1205
+ )
1206
+ """Run that created the artifact."""
1207
+ input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
1208
+ """Runs that use this artifact as an input."""
1209
+ _subsequent_runs: Run = models.ManyToManyField(
1210
+ "Run",
1211
+ related_name="_recreated_artifacts",
1212
+ db_table="lamindb_artifact__previous_runs", # legacy name, change in lamindb v2
1213
+ )
1214
+ """Runs that re-created the record after initial creation."""
1215
+ collections: Collection
1216
+ """The collections that this artifact is part of."""
1217
+ schema: Schema | None = ForeignKey(
1218
+ Schema,
1219
+ PROTECT,
1220
+ null=True,
1221
+ default=None,
1222
+ related_name="validated_artifacts",
1223
+ )
1224
+ """The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
1225
+ feature_sets: Schema = models.ManyToManyField(
1226
+ Schema, related_name="artifacts", through="ArtifactSchema"
1227
+ )
1228
+ """The feature sets measured by the artifact."""
1229
+ _feature_values: FeatureValue = models.ManyToManyField(
1230
+ FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
1231
+ )
1232
+ """Non-categorical feature values for annotation."""
1233
+ _param_values: ParamValue = models.ManyToManyField(
1234
+ ParamValue, through="ArtifactParamValue", related_name="artifacts"
1235
+ )
1236
+ """Parameter values."""
1237
+ _key_is_virtual: bool = BooleanField()
1238
+ """Indicates whether `key` is virtual or part of an actual file path."""
1239
+ # be mindful that below, passing related_name="+" leads to errors
1240
+ _actions: Artifact = models.ManyToManyField(
1241
+ "self", symmetrical=False, related_name="_action_targets"
1242
+ )
1243
+ """Actions to attach for the UI."""
1244
+ created_by: User = ForeignKey(
1245
+ "lamindb.User",
1246
+ PROTECT,
1247
+ default=current_user_id,
1248
+ related_name="created_artifacts",
1249
+ editable=False,
1250
+ )
1251
+ """Creator of record."""
1252
+ _overwrite_versions: bool = BooleanField(default=None)
1253
+ """Indicates whether to store or overwrite versions.
1254
+
1255
+ It defaults to False for file-like artifacts and to True for folder-like artifacts.
1256
+ """
1257
+ projects: Project
1258
+ """Linked projects."""
1259
+ references: Reference
1260
+ """Linked references."""
1261
+
1262
+ @overload
1263
+ def __init__(
1264
+ self,
1265
+ # we're not choosing the name "path" for this arg because
1266
+ # it'd be confusing with `artifact.path`, which is not the same
1267
+ # so "data" conveys better that this is input data that's ingested
1268
+ # and will be moved to a target path at `artifact.path`
1269
+ # also internally, we sometimes pass "data objects" like a DataFrame
1270
+ # here; and we might refactor this but we might also keep that internal
1271
+ # usage
1272
+ data: UPathStr,
1273
+ kind: ArtifactKind | None = None,
1274
+ key: str | None = None,
1275
+ description: str | None = None,
1276
+ revises: Artifact | None = None,
1277
+ run: Run | None = None,
1278
+ ): ...
1279
+
1280
+ @overload
1281
+ def __init__(
1282
+ self,
1283
+ *db_args,
1284
+ ): ...
1285
+
1286
+ def __init__(
1287
+ self,
1288
+ *args,
1289
+ **kwargs,
1290
+ ):
1291
+ self.features = FeatureManager(self) # type: ignore
1292
+ self.params = ParamManager(self) # type: ignore
1293
+ # Below checks for the Django-internal call in from_db()
1294
+ # it'd be better if we could avoid this, but not being able to create a Artifact
1295
+ # from data with the default constructor renders the central class of the API
1296
+ # essentially useless
1297
+ # The danger below is not that a user might pass as many args (12 of it), but rather
1298
+ # that at some point the Django API might change; on the other hand, this
1299
+ # condition of for calling the constructor based on kwargs should always
1300
+ # stay robust
1301
+ if len(args) == len(self._meta.concrete_fields):
1302
+ super().__init__(*args, **kwargs)
1303
+ return None
1304
+ # now we proceed with the user-facing constructor
1305
+ if len(args) > 1:
1306
+ raise ValueError("Only one non-keyword arg allowed: data")
1307
+ data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
1308
+ kind: str = kwargs.pop("kind", None)
1309
+ key: str | None = kwargs.pop("key", None)
1310
+ run: Run | None = kwargs.pop("run", None)
1311
+ description: str | None = kwargs.pop("description", None)
1312
+ revises: Artifact | None = kwargs.pop("revises", None)
1313
+ version: str | None = kwargs.pop("version", None)
1314
+ if "visibility" in kwargs: # backward compat
1315
+ _branch_code = kwargs.pop("visibility")
1316
+ elif "_branch_code" in kwargs:
1317
+ _branch_code = kwargs.pop("_branch_code")
1318
+ else:
1319
+ _branch_code = 1
1320
+ format = kwargs.pop("format", None)
1321
+ _is_internal_call = kwargs.pop("_is_internal_call", False)
1322
+ skip_check_exists = kwargs.pop("skip_check_exists", False)
1323
+ if "default_storage" in kwargs:
1324
+ default_storage = kwargs.pop("default_storage")
1325
+ else:
1326
+ if setup_settings.instance.keep_artifacts_local:
1327
+ default_storage = setup_settings.instance.storage_local.record
1328
+ else:
1329
+ default_storage = setup_settings.instance.storage.record
1330
+ using_key = kwargs.pop("using_key", None)
1331
+ otype = kwargs.pop("otype") if "otype" in kwargs else None
1332
+ otype = _check_otype_artifact(data=data, otype=otype)
1333
+ if "type" in kwargs:
1334
+ logger.warning("`type` will be removed soon, please use `kind`")
1335
+ kind = kwargs.pop("type")
1336
+ if not len(kwargs) == 0:
1337
+ valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
1338
+ raise FieldValidationError(
1339
+ f"Only {valid_keywords} can be passed, you passed: {kwargs}"
1340
+ )
1341
+ if revises is not None and key is not None and revises.key != key:
1342
+ note = message_update_key_in_version_family(
1343
+ suid=revises.stem_uid,
1344
+ existing_key=revises.key,
1345
+ new_key=key,
1346
+ registry="Artifact",
1347
+ )
1348
+ raise ValueError(
1349
+ f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
1350
+ )
1351
+ if revises is not None:
1352
+ if not isinstance(revises, Artifact):
1353
+ raise TypeError("`revises` has to be of type `Artifact`")
1354
+ if description is None:
1355
+ description = revises.description
1356
+ if key is not None and AUTO_KEY_PREFIX in key:
1357
+ raise ValueError(
1358
+ f"Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
1359
+ )
1360
+ # below is for internal calls that require defining the storage location
1361
+ # ahead of constructing the Artifact
1362
+ if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
1363
+ if _is_internal_call:
1364
+ is_automanaged_path = True
1365
+ user_provided_key = key
1366
+ key = None
1367
+ else:
1368
+ raise ValueError(
1369
+ f"Do not pass path inside the `{AUTO_KEY_PREFIX}` directory."
1370
+ )
1371
+ else:
1372
+ is_automanaged_path = False
1373
+ provisional_uid, revises = create_uid(revises=revises, version=version)
1374
+ kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
1375
+ data=data,
1376
+ key=key,
1377
+ run=run,
1378
+ format=format,
1379
+ provisional_uid=provisional_uid,
1380
+ version=version,
1381
+ default_storage=default_storage,
1382
+ using_key=using_key,
1383
+ skip_check_exists=skip_check_exists,
1384
+ )
1385
+
1386
+ # an object with the same hash already exists
1387
+ if isinstance(kwargs_or_artifact, Artifact):
1388
+ from .record import init_self_from_db, update_attributes
1389
+
1390
+ init_self_from_db(self, kwargs_or_artifact)
1391
+ # adding "key" here is dangerous because key might be auto-populated
1392
+ attr_to_update = {"description": description}
1393
+ if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None:
1394
+ attr_to_update["key"] = key
1395
+ elif self.key != key and key is not None:
1396
+ logger.warning(
1397
+ f"key {self.key} on existing artifact differs from passed key {key}"
1398
+ )
1399
+ update_attributes(self, attr_to_update)
1400
+ return None
1401
+ else:
1402
+ kwargs = kwargs_or_artifact
1403
+
1404
+ if revises is None:
1405
+ revises = kwargs.pop("revises")
1406
+
1407
+ if data is not None:
1408
+ self._local_filepath = privates["local_filepath"]
1409
+ self._cloud_filepath = privates["cloud_filepath"]
1410
+ self._memory_rep = privates["memory_rep"]
1411
+ self._to_store = not privates["check_path_in_storage"]
1412
+
1413
+ if is_automanaged_path and _is_internal_call:
1414
+ kwargs["_key_is_virtual"] = True
1415
+ assert AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101
1416
+ uid = (
1417
+ kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
1418
+ )
1419
+ kwargs["key"] = user_provided_key
1420
+ if revises is not None:
1421
+ assert uid.startswith(revises.stem_uid) # noqa: S101
1422
+ if len(uid) == 16:
1423
+ if revises is None:
1424
+ uid += "0000"
1425
+ else:
1426
+ uid, revises = create_uid(revises=revises, version=version)
1427
+ kwargs["uid"] = uid
1428
+
1429
+ # only set key now so that we don't do a look-up on it in case revises is passed
1430
+ if revises is not None:
1431
+ kwargs["key"] = revises.key
1432
+
1433
+ kwargs["kind"] = kind
1434
+ kwargs["version"] = version
1435
+ kwargs["description"] = description
1436
+ kwargs["_branch_code"] = _branch_code
1437
+ kwargs["otype"] = otype
1438
+ kwargs["revises"] = revises
1439
+ # this check needs to come down here because key might be populated from an
1440
+ # existing file path during get_artifact_kwargs_from_data()
1441
+ if (
1442
+ kwargs["key"] is None
1443
+ and kwargs["description"] is None
1444
+ and kwargs["run"] is None
1445
+ ):
1446
+ raise ValueError("Pass one of key, run or description as a parameter")
1447
+
1448
+ super().__init__(**kwargs)
1449
+
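+ # A minimal sketch of the constructor flow above, assuming a configured
+ # default storage and an illustrative local file "./example.csv":
+ # >>> import lamindb as ln
+ # >>> artifact = ln.Artifact("./example.csv", key="example.csv", kind="dataset")
+ # >>> artifact.save()
+ # The deprecated `type=...` and `visibility=...` kwargs are remapped above to
+ # `kind` and `_branch_code` before field validation.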
1450
+ @property
1451
+ @deprecated("kind")
1452
+ def type(self) -> str:
1453
+ return self.kind
1454
+
1455
+ @property
1456
+ @deprecated("otype")
1457
+ def _accessor(self) -> str:
1458
+ return self.otype
1459
+
1460
+ @property
1461
+ def transform(self) -> Transform | None:
1462
+ """Transform whose run created the artifact."""
1463
+ return self.run.transform if self.run is not None else None
1464
+
1465
+ @property
1466
+ @deprecated("n_files")
1467
+ def n_objects(self) -> int:
1468
+ return self.n_files
1469
+
1470
+ # add the below because this is what people will have in their code
1471
+ # if they implement the recommended migration strategy
1472
+ # - FeatureSet -> Schema
1473
+ # - featureset -> schema
1474
+ # - feature_set -> schema
1475
+ # @property
1476
+ # def schemas(self) -> QuerySet[Schema]:
1477
+ # """Schemas linked to artifact via many-to-many relationship.
1478
+
1479
+ # Is now mediating the private `.feature_sets` relationship during
1480
+ # a transition period to better schema management.
1481
+
1482
+ # .. versionchanged: 1.0
1483
+ # Was previously called `.feature_sets`.
1484
+
1485
+ # """
1486
+ # return self.feature_sets
1487
+
1488
+ @property
1489
+ def path(self) -> Path:
1490
+ """Path.
1491
+
1492
+ File in cloud storage, here AWS S3:
1493
+
1494
+ >>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
1495
+ >>> artifact.path
1496
+ S3QueryPath('s3://my-bucket/my-file.csv')
1497
+
1498
+ File in local storage:
1499
+
1500
+ >>> ln.Artifact("./myfile.csv", key="myfile.csv").save()
1501
+ >>> artifact.path
1502
+ PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/myfile.csv')
1503
+ """
1504
+ from lamindb import settings
1505
+
1506
+ filepath, _ = filepath_from_artifact(self, using_key=settings._using_key)
1507
+ return filepath
1508
+
1509
+ @property
1510
+ def _cache_path(self) -> UPath:
1511
+ from lamindb import settings
1512
+
1513
+ filepath, cache_key = filepath_cache_key_from_artifact(
1514
+ self, using_key=settings._using_key
1515
+ )
1516
+ if isinstance(filepath, LocalPathClasses):
1517
+ return filepath
1518
+ return setup_settings.paths.cloud_to_local_no_update(
1519
+ filepath, cache_key=cache_key
1520
+ )
1521
+
1522
+ @classmethod
1523
+ def from_df(
1524
+ cls,
1525
+ df: pd.DataFrame,
1526
+ *,
1527
+ key: str | None = None,
1528
+ description: str | None = None,
1529
+ run: Run | None = None,
1530
+ revises: Artifact | None = None,
1531
+ **kwargs,
1532
+ ) -> Artifact:
1533
+ """Create from `DataFrame`, validate & link features.
1534
+
1535
+ Args:
1536
+ df: A `DataFrame` object.
1537
+ key: A relative path within default storage,
1538
+ e.g., `"myfolder/myfile.parquet"`.
1539
+ description: A description.
1540
+ revises: An old version of the artifact.
1541
+ run: The run that creates the artifact.
1542
+
1543
+ See Also:
1544
+ :meth:`~lamindb.Collection`
1545
+ Track collections.
1546
+ :class:`~lamindb.Feature`
1547
+ Track features.
1548
+
1549
+ Examples:
1550
+ >>> df = ln.core.datasets.df_iris_in_meter_batch1()
1551
+ >>> df.head()
1552
+ sepal_length sepal_width petal_length petal_width iris_organism_code
1553
+ 0 0.051 0.035 0.014 0.002 0
1554
+ 1 0.049 0.030 0.014 0.002 0
1555
+ 2 0.047 0.032 0.013 0.002 0
1556
+ 3 0.046 0.031 0.015 0.002 0
1557
+ 4 0.050 0.036 0.014 0.002 0
1558
+ >>> artifact = ln.Artifact.from_df(df, description="Iris flower collection batch1")
1559
+ >>> artifact.save()
1560
+ """
1561
+ artifact = Artifact( # type: ignore
1562
+ data=df,
1563
+ key=key,
1564
+ run=run,
1565
+ description=description,
1566
+ revises=revises,
1567
+ otype="DataFrame",
1568
+ kind="dataset",
1569
+ **kwargs,
1570
+ )
1571
+ artifact.n_observations = len(df)
1572
+ return artifact
1573
+
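+ # A short sketch of `from_df` above; the DataFrame and key are illustrative:
+ # >>> import pandas as pd
+ # >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
+ # >>> artifact = ln.Artifact.from_df(df, key="examples/ab.parquet")
+ # >>> artifact.n_observations  # set to len(df) above
+ # 2
+ # >>> artifact.save()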
1574
+ @classmethod
1575
+ def from_anndata(
1576
+ cls,
1577
+ adata: Union[AnnData, UPathStr],
1578
+ *,
1579
+ key: str | None = None,
1580
+ description: str | None = None,
1581
+ run: Run | None = None,
1582
+ revises: Artifact | None = None,
1583
+ **kwargs,
1584
+ ) -> Artifact:
1585
+ """Create from ``AnnData``, validate & link features.
1586
+
1587
+ Args:
1588
+ adata: An `AnnData` object or a path of AnnData-like.
1589
+ key: A relative path within default storage,
1590
+ e.g., `"myfolder/myfile.h5ad"`.
1591
+ description: A description.
1592
+ revises: An old version of the artifact.
1593
+ run: The run that creates the artifact.
1594
+
1595
+ See Also:
1596
+
1597
+ :meth:`~lamindb.Collection`
1598
+ Track collections.
1599
+ :class:`~lamindb.Feature`
1600
+ Track features.
1601
+
1602
+ Examples:
1603
+ >>> import bionty as bt
1604
+ >>> bt.settings.organism = "human"
1605
+ >>> adata = ln.core.datasets.anndata_with_obs()
1606
+ >>> artifact = ln.Artifact.from_anndata(adata, description="mini anndata with obs")
1607
+ >>> artifact.save()
1608
+ """
1609
+ if not data_is_anndata(adata):
1610
+ raise ValueError(
1611
+ "data has to be an AnnData object or a path to AnnData-like"
1612
+ )
1613
1614
+ artifact = Artifact( # type: ignore
1615
+ data=adata,
1616
+ key=key,
1617
+ run=run,
1618
+ description=description,
1619
+ revises=revises,
1620
+ otype="AnnData",
1621
+ kind="dataset",
1622
+ **kwargs,
1623
+ )
1624
+ # this is done instead of _anndata_n_observations(adata)
1625
+ # because we need a proper path through create_path for cloud paths
1626
+ # for additional upath options etc that create_path adds
1627
+ obj_for_obs: AnnData | UPath
1628
+ if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
1629
+ obj_for_obs = artifact._memory_rep
1630
+ else:
1631
+ # returns ._local_filepath for local files
1632
+ # and the proper path through create_path for cloud paths
1633
+ obj_for_obs = artifact.path
1634
+ artifact.n_observations = _anndata_n_observations(obj_for_obs)
1635
+ return artifact
1636
+
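+ # The n_observations logic above prefers the in-memory representation and
+ # otherwise falls back to the artifact's (possibly cloud) path; a sketch with
+ # an illustrative .h5ad path:
+ # >>> artifact = ln.Artifact.from_anndata("./example.h5ad", key="example.h5ad")
+ # >>> artifact.otype
+ # 'AnnData'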
1637
+ @classmethod
1638
+ def from_mudata(
1639
+ cls,
1640
+ mdata: Union[MuData, UPathStr],
1641
+ *,
1642
+ key: str | None = None,
1643
+ description: str | None = None,
1644
+ run: Run | None = None,
1645
+ revises: Artifact | None = None,
1646
+ **kwargs,
1647
+ ) -> Artifact:
1648
+ """Create from ``MuData``, validate & link features.
1649
+
1650
+ Args:
1651
+ mdata: A `MuData` object or a path to MuData-like.
1652
+ key: A relative path within default storage,
1653
+ e.g., `"myfolder/myfile.h5mu"`.
1654
+ description: A description.
1655
+ revises: An old version of the artifact.
1656
+ run: The run that creates the artifact.
1657
+
1658
+ See Also:
1659
+ :meth:`~lamindb.Collection`
1660
+ Track collections.
1661
+ :class:`~lamindb.Feature`
1662
+ Track features.
1663
+
1664
+ Examples:
1665
+ >>> import bionty as bt
1666
+ >>> bt.settings.organism = "human"
1667
+ >>> mdata = ln.core.datasets.mudata_papalexi21_subset()
1668
+ >>> artifact = ln.Artifact.from_mudata(mdata, description="a mudata object")
1669
+ >>> artifact.save()
1670
+ """
1671
+ if not data_is_mudata(mdata):
1672
+ raise ValueError("data has to be a MuData object or a path to MuData-like")
1673
+ artifact = Artifact( # type: ignore
1674
+ data=mdata,
1675
+ key=key,
1676
+ run=run,
1677
+ description=description,
1678
+ revises=revises,
1679
+ otype="MuData",
1680
+ kind="dataset",
1681
+ **kwargs,
1682
+ )
1683
+ if not isinstance(mdata, UPathStr):
1684
+ artifact.n_observations = mdata.n_obs
1685
+ return artifact
1686
+
1687
+ @classmethod
1688
+ def from_spatialdata(
1689
+ cls,
1690
+ sdata: Union[SpatialData, UPathStr],
1691
+ *,
1692
+ key: str | None = None,
1693
+ description: str | None = None,
1694
+ run: Run | None = None,
1695
+ revises: Artifact | None = None,
1696
+ **kwargs,
1697
+ ) -> Artifact:
1698
+ """Create from ``SpatialData``, validate & link features.
1699
+
1700
+ Args:
1701
+ sdata: A `SpatialData` object or a path to SpatialData-like.
1702
+ key: A relative path within default storage,
1703
+ e.g., `"myfolder/myfile.zarr"`.
1704
+ description: A description.
1705
+ revises: An old version of the artifact.
1706
+ run: The run that creates the artifact.
1707
+
1708
+ See Also:
1709
+ :meth:`~lamindb.Collection`
1710
+ Track collections.
1711
+ :class:`~lamindb.Feature`
1712
+ Track features.
1713
+
1714
+ Examples:
1715
+ >>> artifact = ln.Artifact.from_spatialdata(sdata, key="my_dataset.zarr")
1716
+ """
1717
+ if not data_is_spatialdata(sdata):
1718
+ raise ValueError(
1719
+ "data has to be a SpatialData object or a path to SpatialData-like"
1720
+ )
1721
+ artifact = Artifact( # type: ignore
1722
+ data=sdata,
1723
+ key=key,
1724
+ run=run,
1725
+ description=description,
1726
+ revises=revises,
1727
+ otype="SpatialData",
1728
+ kind="dataset",
1729
+ **kwargs,
1730
+ )
1731
+ # ill-defined https://scverse.zulipchat.com/#narrow/channel/315824-spatial/topic/How.20to.20calculate.20the.20number.20of.20observations.3F
1732
+ # artifact.n_observations = ...
1733
+ return artifact
1734
+
1735
+ @classmethod
1736
+ def from_tiledbsoma(
1737
+ cls,
1738
+ path: UPathStr,
1739
+ *,
1740
+ key: str | None = None,
1741
+ description: str | None = None,
1742
+ run: Run | None = None,
1743
+ revises: Artifact | None = None,
1744
+ **kwargs,
1745
+ ) -> Artifact:
1746
+ """Create from a tiledbsoma store.
1747
+
1748
+ Args:
1749
+ path: A tiledbsoma store with .tiledbsoma suffix.
1750
+ key: A relative path within default storage,
1751
+ e.g., `"myfolder/mystore.tiledbsoma"`.
1752
+ description: A description.
1753
+ revises: An old version of the artifact.
1754
+ run: The run that creates the artifact.
1755
+
1756
+ Examples:
1757
+ >>> artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store")
1758
+ >>> artifact.save()
1759
+ """
1760
+ if UPath(path).suffix != ".tiledbsoma":
1761
+ raise ValueError(
1762
+ "A tiledbsoma store should have .tiledbsoma suffix to be registered."
1763
+ )
1764
+ artifact = Artifact( # type: ignore
1765
+ data=path,
1766
+ key=key,
1767
+ run=run,
1768
+ description=description,
1769
+ revises=revises,
1770
+ otype="tiledbsoma",
1771
+ kind="dataset",
1772
+ **kwargs,
1773
+ )
1774
+ artifact.n_observations = _soma_n_observations(artifact.path)
1775
+ return artifact
1776
+
1777
+ @classmethod
1778
+ def from_dir(
1779
+ cls,
1780
+ path: UPathStr,
1781
+ *,
1782
+ key: str | None = None,
1783
+ run: Run | None = None,
1784
+ ) -> list[Artifact]:
1785
+ """Create a list of artifact objects from a directory.
1786
+
1787
+ Hint:
1788
+ If you have a high number of files (several hundred thousand) and don't want to
1789
+ track them individually, create a single :class:`~lamindb.Artifact` via
1790
+ ``Artifact(path)`` for them. See, e.g., :doc:`docs:rxrx`.
1791
+
1792
+ Args:
1793
+ path: Source path of folder.
1794
+ key: Key for storage destination. If `None` and
1795
+ directory is in a registered location, the inferred `key` will
1796
+ reflect the relative position. If `None` and directory is outside
1797
+ of a registered storage location, the inferred key defaults to `path.name`.
1798
+ run: A `Run` object.
1799
+
1800
+ Examples:
1801
+ >>> dir_path = ln.core.datasets.generate_cell_ranger_files("sample_001", ln.settings.storage)
1802
+ >>> artifacts = ln.Artifact.from_dir(dir_path)
1803
+ >>> ln.save(artifacts)
1804
+ """
1805
+ from lamindb import settings
1806
+
1807
+ folderpath: UPath = create_path(path) # returns Path for local
1808
+ default_storage = settings.storage.record
1809
+ using_key = settings._using_key
1810
+ storage, use_existing_storage = process_pathlike(
1811
+ folderpath, default_storage, using_key
1812
+ )
1813
+ folder_key_path: PurePath | Path
1814
+ if key is None:
1815
+ if not use_existing_storage:
1816
+ logger.warning(
1817
+ "folder is outside existing storage location, will copy files from"
1818
+ f" {path} to {storage.root}/{folderpath.name}"
1819
+ )
1820
+ folder_key_path = Path(folderpath.name)
1821
+ else:
1822
+ # maintain the hierarchy within an existing storage location
1823
+ folder_key_path = get_relative_path_to_directory(
1824
+ folderpath, UPath(storage.root)
1825
+ )
1826
+ else:
1827
+ folder_key_path = Path(key)
1828
+
1829
+ folder_key = folder_key_path.as_posix()
1830
+ # silence fine-grained logging
1831
+ verbosity = settings.verbosity
1832
+ verbosity_int = settings._verbosity_int
1833
+ if verbosity_int >= 1:
1834
+ settings.verbosity = "warning"
1835
+ artifacts_dict = {}
1836
+ for filepath in folderpath.rglob("*"):
1837
+ if filepath.is_file():
1838
+ relative_path = get_relative_path_to_directory(filepath, folderpath)
1839
+ artifact_key = folder_key + "/" + relative_path.as_posix()
1840
+ # if creating from rglob, we don't need to check for existence
1841
+ artifact = Artifact(
1842
+ filepath, run=run, key=artifact_key, skip_check_exists=True
1843
+ )
1844
+ artifacts_dict[artifact.uid] = artifact
1845
+ settings.verbosity = verbosity
1846
+
1847
+ # run sanity check on hashes
1848
+ hashes = [
1849
+ artifact.hash
1850
+ for artifact in artifacts_dict.values()
1851
+ if artifact.hash is not None
1852
+ ]
1853
+ uids = artifacts_dict.keys()
1854
+ n_unique_hashes = len(set(hashes))
1855
+ if n_unique_hashes == len(hashes):
1856
+ artifacts = list(artifacts_dict.values())
1857
+ else:
1858
+ # consider exact duplicates (same id, same hash)
1859
+ # below can't happen anymore because artifacts is a dict now
1860
+ # if len(set(uids)) == len(set(hashes)):
1861
+ # logger.warning("dropping duplicate records in list of artifact records")
1862
+ # artifacts = list(set(uids))
1863
+ # consider false duplicates (different id, same hash)
1864
+ if len(set(uids)) != n_unique_hashes:
1865
+ seen_hashes = set()
1866
+ non_unique_artifacts = {
1867
+ uid: artifact
1868
+ for uid, artifact in artifacts_dict.items()
1869
+ if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore
1870
+ }
1871
+ display_non_unique = "\n ".join(
1872
+ f"{artifact}" for artifact in non_unique_artifacts.values()
1873
+ )
1874
+ logger.warning(
1875
+ "there are multiple artifact uids with the same hashes, dropping"
1876
+ f" {len(non_unique_artifacts)} duplicates out of"
1877
+ f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
1878
+ )
1879
+ artifacts = [
1880
+ artifact
1881
+ for artifact in artifacts_dict.values()
1882
+ if artifact not in non_unique_artifacts.values()
1883
+ ]
1884
+ logger.success(
1885
+ f"created {len(artifacts)} artifacts from directory using storage"
1886
+ f" {storage.root} and key = {folder_key}/"
1887
+ )
1888
+ return artifacts
1889
+
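+ # The duplicate-hash filter above relies on a seen-set idiom: `set.add`
+ # returns None (falsy), so only artifacts whose hash was already seen pass
+ # the condition. A standalone sketch with illustrative uids and hashes:
+ # >>> seen = set()
+ # >>> hashes = {"uid1": "h1", "uid2": "h2", "uid3": "h1"}
+ # >>> {uid: h for uid, h in hashes.items() if h in seen or seen.add(h)}
+ # {'uid3': 'h1'}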
1890
+ def replace(
1891
+ self,
1892
+ data: Union[UPathStr, pd.DataFrame, AnnData, MuData],
1893
+ run: Run | None = None,
1894
+ format: str | None = None,
1895
+ ) -> None:
1896
+ """Replace artifact content.
1897
+
1898
+ Args:
1899
+ data: A file path or an in-memory object (`DataFrame`, `AnnData`, `MuData`).
1900
+ run: The run that performs the replacement; gets
1901
+ auto-linked if ``ln.track()`` was called.
1902
+
1903
+ Examples:
1904
+ Say we made a change to the content of an artifact, e.g., edited the image
1905
+ `paradisi05_laminopathic_nuclei.jpg`.
1906
+
1907
+ This is how we replace the old file in storage with the new file:
1908
+
1909
+ >>> artifact.replace("paradisi05_laminopathic_nuclei.jpg")
1910
+ >>> artifact.save()
1911
+
1912
+ Note that this changes neither the storage key nor the filename.
1913
+
1914
+ However, the suffix is updated if the new file's suffix differs.
1915
+ """
1916
+ from lamindb import settings
1917
+
1918
+ default_storage = settings.storage.record
1919
+ kwargs, privates = get_artifact_kwargs_from_data(
1920
+ provisional_uid=self.uid,
1921
+ data=data,
1922
+ key=self.key,
1923
+ run=run,
1924
+ format=format,
1925
+ default_storage=default_storage,
1926
+ version=None,
1927
+ is_replace=True,
1928
+ )
1929
+
1930
+ # this artifact already exists
1931
+ if privates is None:
1932
+ return kwargs
1933
+
1934
+ check_path_in_storage = privates["check_path_in_storage"]
1935
+ if check_path_in_storage:
1936
+ err_msg = (
1937
+ "Can only replace with a local path not in any Storage. "
1938
+ f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}."
1939
+ )
1940
+ raise ValueError(err_msg)
1941
+
1942
+ _overwrite_versions = kwargs["_overwrite_versions"]
1943
+ if self._overwrite_versions != _overwrite_versions:
1944
+ err_msg = "It is not allowed to replace "
1945
+ err_msg += "a folder" if self._overwrite_versions else "a file"
1946
+ err_msg += " with " + ("a folder." if _overwrite_versions else "a file.")
1947
+ raise ValueError(err_msg)
1948
+
1949
+ if self.key is not None and not self._key_is_virtual:
1950
+ key_path = PurePosixPath(self.key)
1951
+ new_filename = f"{key_path.stem}{kwargs['suffix']}"
1952
+ # the following will only be true if the suffix changes!
1953
+ if key_path.name != new_filename:
1954
+ self._clear_storagekey = self.key
1955
+ self.key = str(key_path.with_name(new_filename))
1956
+ # update old key with the new one so that checks in record pass
1957
+ self._old_key = self.key
1958
+ logger.warning(
1959
+ f"replacing the file will replace key '{key_path}' with '{self.key}'"
1960
+ f" and delete '{key_path}' upon `save()`"
1961
+ )
1962
+ else:
1963
+ old_storage = auto_storage_key_from_artifact(self)
1964
+ is_dir = self.n_files is not None
1965
+ new_storage = auto_storage_key_from_artifact_uid(
1966
+ self.uid, kwargs["suffix"], is_dir
1967
+ )
1968
+ if old_storage != new_storage:
1969
+ self._clear_storagekey = old_storage
1970
+ if self.key is not None:
1971
+ new_key_path = PurePosixPath(self.key).with_suffix(kwargs["suffix"])
1972
+ self.key = str(new_key_path)
1973
+ # update old key with the new one so that checks in record pass
1974
+ self._old_key = self.key
1975
+
1976
+ self.suffix = kwargs["suffix"]
1977
+ self.size = kwargs["size"]
1978
+ self.hash = kwargs["hash"]
1979
+ self._hash_type = kwargs["_hash_type"]
1980
+ self.run_id = kwargs["run_id"]
1981
+ self.run = kwargs["run"]
1982
+ self.n_files = kwargs["n_files"]
1983
+
1984
+ self._local_filepath = privates["local_filepath"]
1985
+ self._cloud_filepath = privates["cloud_filepath"]
1986
+ self._memory_rep = privates["memory_rep"]
1987
+ # no need to upload if new file is already in storage
1988
+ self._to_store = not check_path_in_storage
1989
+
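+ # A sketch of the suffix-update behavior above; key and filenames are
+ # illustrative:
+ # >>> artifact = ln.Artifact.get(key="images/nuclei.jpg")
+ # >>> artifact.replace("./nuclei.png")  # suffix changes from .jpg to .png
+ # >>> artifact.save()  # key becomes "images/nuclei.png", old file is cleared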
1990
+ def open(
1991
+ self, mode: str = "r", is_run_input: bool | None = None, **kwargs
1992
+ ) -> Union[
1993
+ AnnDataAccessor,
1994
+ BackedAccessor,
1995
+ SOMACollection,
1996
+ SOMAExperiment,
1997
+ SOMAMeasurement,
1998
+ PyArrowDataset,
1999
+ ]:
2000
+ """Return a cloud-backed data object.
2001
+
2002
+ Works for `AnnData` (`.h5ad` and `.zarr`), generic `hdf5` and `zarr`,
2003
+ `tiledbsoma` objects (`.tiledbsoma`), `pyarrow` compatible formats.
2004
+
2005
+ Args:
2006
+ mode: can only be `"w"` (write mode) for `tiledbsoma` stores,
2007
+ otherwise should be always `"r"` (read-only mode).
2008
+
2009
+ Notes:
2010
+ For more info, see tutorial: :doc:`/arrays`.
2011
+
2012
+ Examples:
2013
+
2014
+ Read AnnData in backed mode from cloud:
2015
+
2016
+ >>> artifact = ln.Artifact.get(key="lndb-storage/pbmc68k.h5ad")
2017
+ >>> artifact.open()
2018
+ AnnDataAccessor object with n_obs × n_vars = 70 × 765
2019
+ constructed for the AnnData object pbmc68k.h5ad
2020
+ ...
2021
+ """
2022
+ if self._overwrite_versions and not self.is_latest:
2023
+ raise ValueError(INCONSISTENT_STATE_MSG)
2024
+ # all hdf5 suffixes including gzipped
2025
+ h5_suffixes = [".h5", ".hdf5", ".h5ad"]
2026
+ h5_suffixes += [s + ".gz" for s in h5_suffixes]
2027
+ # ignore empty suffix for now
2028
+ suffixes = (
2029
+ (
2030
+ "",
2031
+ ".zarr",
2032
+ ".anndata.zarr",
2033
+ ".tiledbsoma",
2034
+ )
2035
+ + tuple(h5_suffixes)
2036
+ + PYARROW_SUFFIXES
2037
+ + tuple(
2038
+ s + ".gz" for s in PYARROW_SUFFIXES
2039
+ ) # this doesn't work for externally gzipped files, REMOVE LATER
2040
+ )
2041
+ if self.suffix not in suffixes:
2042
+ raise ValueError(
2043
+ "Artifact should have a zarr, h5, tiledbsoma object"
2044
+ " or a compatible `pyarrow.dataset.dataset` directory"
2045
+ " as the underlying data, please use one of the following suffixes"
2046
+ f" for the object name: {', '.join(suffixes[1:])}."
2047
+ f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
2048
+ " (no mixing allowed)."
2049
+ )
2050
+ if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
2051
+ raise ValueError(
2052
+ "Only a tiledbsoma store can be openened with `mode!='r'`."
2053
+ )
2054
+
2055
+ from lamindb import settings
2056
+ from lamindb.core.storage._backed_access import (
2057
+ _track_writes_factory,
2058
+ backed_access,
2059
+ )
2060
+
2061
+ using_key = settings._using_key
2062
+ filepath, cache_key = filepath_cache_key_from_artifact(
2063
+ self, using_key=using_key
2064
+ )
2065
+ is_tiledbsoma_w = (
2066
+ filepath.name == "soma" or self.suffix == ".tiledbsoma"
2067
+ ) and mode == "w"
2068
+ # consider the case where an object is already locally cached
2069
+ localpath = setup_settings.paths.cloud_to_local_no_update(
2070
+ filepath, cache_key=cache_key
2071
+ )
2072
+ if is_tiledbsoma_w:
2073
+ open_cache = False
2074
+ else:
2075
+ open_cache = not isinstance(
2076
+ filepath, LocalPathClasses
2077
+ ) and not filepath.synchronize(localpath, just_check=True)
2078
+ if open_cache:
2079
+ try:
2080
+ access = backed_access(localpath, mode, using_key, **kwargs)
2081
+ except Exception as e:
2082
+ if isinstance(filepath, LocalPathClasses):
2083
+ raise e
2084
+ logger.warning(
2085
+ f"The cache might be corrupted: {e}. Trying to open directly."
2086
+ )
2087
+ access = backed_access(filepath, mode, using_key, **kwargs)
2088
+ # happens only if backed_access has been successful
2089
+ # delete the corrupted cache
2090
+ if localpath.is_dir():
2091
+ shutil.rmtree(localpath)
2092
+ else:
2093
+ localpath.unlink(missing_ok=True)
2094
+ else:
2095
+ access = backed_access(filepath, mode, using_key, **kwargs)
2096
+ if is_tiledbsoma_w:
2097
+
2098
+ def finalize():
2099
+ nonlocal self, filepath, localpath
2100
+ if not isinstance(filepath, LocalPathClasses):
2101
+ _, hash, _, _ = get_stat_dir_cloud(filepath)
2102
+ else:
2103
+ # this can be very slow
2104
+ _, hash, _, _ = hash_dir(filepath)
2105
+ if self.hash != hash:
2106
+ from .record import init_self_from_db
2107
+
2108
+ new_version = Artifact(
2109
+ filepath, revises=self, _is_internal_call=True
2110
+ ).save()
2111
+ init_self_from_db(self, new_version)
2112
+
2113
+ if localpath != filepath and localpath.exists():
2114
+ shutil.rmtree(localpath)
2115
+
2116
+ access = _track_writes_factory(access, finalize)
2117
+ # only call if open is successful
2118
+ _track_run_input(self, is_run_input)
2119
+ return access
2120
+
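+ # Sketch of the tiledbsoma write-mode flow above: the store is re-hashed when
+ # the write is finalized, and a new artifact version is registered if the
+ # content changed. The key is illustrative; this assumes the wrapper returned
+ # by `_track_writes_factory` invokes `finalize()` on close:
+ # >>> artifact = ln.Artifact.get(key="stores/experiment.tiledbsoma")
+ # >>> store = artifact.open(mode="w")
+ # >>> # ... write to `store` ...
+ # >>> store.close()  # finalize(): re-hash, possibly register a new version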
2121
+ def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
2122
+ """Cache and load into memory.
2123
+
2124
+ See all :mod:`~lamindb.core.loaders`.
2125
+
2126
+ Examples:
2127
+
2128
+ Load a `DataFrame`-like artifact:
2129
+
2130
+ >>> artifact.load().head()
2131
+ sepal_length sepal_width petal_length petal_width iris_organism_code
2132
+ 0 0.051 0.035 0.014 0.002 0
2133
+ 1 0.049 0.030 0.014 0.002 0
2134
+ 2 0.047 0.032 0.013 0.002 0
2135
+ 3 0.046 0.031 0.015 0.002 0
2136
+ 4 0.050 0.036 0.014 0.002 0
2137
+
2138
+ Load an `AnnData`-like artifact:
2139
+
2140
+ >>> artifact.load()
2141
+ AnnData object with n_obs × n_vars = 70 × 765
2142
+
2143
+ Fall back to :meth:`~lamindb.Artifact.cache` if no in-memory representation is configured:
2144
+
2145
+ >>> artifact.load()
2146
+ PosixPath('/home/runner/work/lamindb/lamindb/docs/guide/mydata/.lamindb/jb7BY5UJoQVGMUOKiLcn.jpg')
2147
+ """
2148
+ from lamindb import settings
2149
+
2150
+ if self._overwrite_versions and not self.is_latest:
2151
+ raise ValueError(INCONSISTENT_STATE_MSG)
2152
+
2153
+ if hasattr(self, "_memory_rep") and self._memory_rep is not None:
2154
+ access_memory = self._memory_rep
2155
+ else:
2156
+ filepath, cache_key = filepath_cache_key_from_artifact(
2157
+ self, using_key=settings._using_key
2158
+ )
2159
+ cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
2160
+ try:
2161
+ # cache_path is local so doesn't trigger any sync in load_to_memory
2162
+ access_memory = load_to_memory(cache_path, **kwargs)
2163
+ except Exception as e:
2164
+ # raise the exception if it comes from not having a correct loader
2165
+ # or if the original path is local
2166
+ if isinstance(e, NotImplementedError) or isinstance(
2167
+ filepath, LocalPathClasses
2168
+ ):
2169
+ raise e
2170
+ logger.warning(
2171
+ f"The cache might be corrupted: {e}. Retrying to synchronize."
2172
+ )
2173
+ # delete the existing cache
2174
+ if cache_path.is_dir():
2175
+ shutil.rmtree(cache_path)
2176
+ else:
2177
+ cache_path.unlink(missing_ok=True)
2178
+ # download again and try to load into memory
2179
+ cache_path = _synchronize_cleanup_on_error(
2180
+ filepath, cache_key=cache_key
2181
+ )
2182
+ access_memory = load_to_memory(cache_path, **kwargs)
2183
+ # only call if load is successful
2184
+ _track_run_input(self, is_run_input)
2185
+ return access_memory
2186
+
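+ # The corrupted-cache handling above follows this generic retry pattern
+ # (a simplified sketch, ignoring the directory and local-path cases):
+ # >>> def load_with_retry(download, load):
+ # ...     path = download()
+ # ...     try:
+ # ...         return load(path)
+ # ...     except NotImplementedError:
+ # ...         raise  # no loader registered; retrying won't help
+ # ...     except Exception:
+ # ...         path.unlink(missing_ok=True)  # drop the possibly corrupt cache
+ # ...         return load(download())  # re-download once, then load again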
2187
+ def cache(self, is_run_input: bool | None = None) -> Path:
2188
+ """Download cloud artifact to local cache.
2189
+
2190
+ Follows syncing logic: the artifact is only re-downloaded if the local cache is outdated.
2191
+
2192
+ Returns a path to a locally cached on-disk object (say a `.jpg` file).
2193
+
2194
+ Examples:
2195
+
2196
+ Sync file from cloud and return the local path of the cache:
2197
+
2198
+ >>> artifact.cache()
2199
+ PosixPath('/home/runner/work/Caches/lamindb/lamindb-ci/lndb-storage/pbmc68k.h5ad')
2200
+ """
2201
+ from lamindb import settings
2202
+
2203
+ if self._overwrite_versions and not self.is_latest:
2204
+ raise ValueError(INCONSISTENT_STATE_MSG)
2205
+
2206
+ filepath, cache_key = filepath_cache_key_from_artifact(
2207
+ self, using_key=settings._using_key
2208
+ )
2209
+ cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
2210
+ # only call if sync is successful
2211
+ _track_run_input(self, is_run_input)
2212
+ return cache_path
2213
+
2214
+ def delete(
2215
+ self,
2216
+ permanent: bool | None = None,
2217
+ storage: bool | None = None,
2218
+ using_key: str | None = None,
2219
+ ) -> None:
2220
+ """Trash or permanently delete.
2221
+
2222
+ A first call to `.delete()` puts an artifact into the trash (sets `_branch_code` to `-1`).
2223
+ A second call permanently deletes the artifact.
2224
+ For a folder artifact with multiple versions, deleting a non-latest version
2225
+ does not delete the underlying storage unless `storage=True` is passed.
2226
+ Deleting the latest version deletes all versions of a folder artifact.
2227
+
2228
+ FAQ: :doc:`docs:faq/storage`
2229
+
2230
+ Args:
2231
+ permanent: Permanently delete the artifact (skip trash).
2232
+ storage: Indicate whether you want to delete the artifact in storage.
2233
+
2234
+ Examples:
2235
+
2236
+ For an `Artifact` object `artifact`, call:
2237
+
2238
+ >>> artifact = ln.Artifact.filter(key="some.csv").one()
2239
+ >>> artifact.delete() # delete a single file artifact
2240
+
2241
+ >>> artifact = ln.Artifact.filter(key="some.tiledbsoma". is_latest=False).first()
2242
+ >>> artiact.delete() # delete an old version, the data will not be deleted
2243
+
2244
+ >>> artifact = ln.Artifact.filter(key="some.tiledbsoma". is_latest=True).one()
2245
+ >>> artiact.delete() # delete all versions, the data will be deleted or prompted for deletion.
2246
+ """
2247
+ # this first check means an invalid delete fails fast rather than cascading through
2248
+ # database and storage permission errors
2249
+ if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
2250
+ isettings = setup_settings.instance
2251
+ if self.storage.instance_uid != isettings.uid and (
2252
+ storage or storage is None
2253
+ ):
2254
+ raise IntegrityError(
2255
+ "Cannot simply delete artifacts outside of this instance's managed storage locations."
2256
+ "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
2257
+ f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
2258
+ f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
2259
+ )
2260
+ # by default, we only move artifacts into the trash (_branch_code = -1)
2261
+ trash__branch_code = -1
2262
+ if self._branch_code > trash__branch_code and not permanent:
2263
+ if storage is not None:
2264
+ logger.warning("moving artifact to trash, storage arg is ignored")
2265
+ # move to trash
2266
+ self._branch_code = trash__branch_code
2267
+ self.save()
2268
+ logger.important(
2269
+ f"moved artifact to trash (_branch_code = {trash__branch_code})"
2270
+ )
2271
+ return
2272
+
2273
+ # if the artifact is already in the trash
2274
+ # permanent delete skips the trash
2275
+ if permanent is None:
2276
+ # ask for confirmation of permanent delete
2277
+ response = input(
2278
+ "Artifact record is already in trash! Are you sure you want to permanently"
2279
+ " delete it? (y/n) You can't undo this action."
2280
+ )
2281
+ delete_record = response == "y"
2282
+ else:
2283
+ assert permanent # noqa: S101
2284
+ delete_record = True
2285
+
2286
+ if delete_record:
2287
+ # need to grab file path before deletion
2288
+ try:
2289
+ path, _ = filepath_from_artifact(self, using_key)
2290
+ except OSError:
2291
+ # we can still delete the record
2292
+ logger.warning("Could not get path")
2293
+ storage = False
2294
+ # only delete in storage if DB delete is successful
2295
+ # DB delete might error because of a foreign key constraint violated etc.
2296
+ if self._overwrite_versions and self.is_latest:
2297
+ # includes self
2298
+ for version in self.versions.all():
2299
+ _delete_skip_storage(version)
2300
+ else:
2301
+ self._delete_skip_storage()
2302
+ # by default do not delete storage if deleting only a previous version
2303
+ # and the underlying store is mutable
2304
+ if self._overwrite_versions and not self.is_latest:
2305
+ delete_in_storage = False
2306
+ if storage:
2307
+ logger.warning(
2308
+ "Storage argument is ignored; can't delete storage on an previous version"
2309
+ )
2310
+ elif self.key is None or self._key_is_virtual:
2311
+ # do not ask for confirmation also if storage is None
2312
+ delete_in_storage = storage is None or storage
2313
+ else:
2314
+ # for artifacts with non-virtual semantic storage keys (key is not None)
2315
+ # ask for extra-confirmation
2316
+ if storage is None:
2317
+ response = input(
2318
+ f"Are you sure to want to delete {path}? (y/n) You can't undo"
2319
+ " this action."
2320
+ )
2321
+ delete_in_storage = response == "y"
2322
+ else:
2323
+ delete_in_storage = storage
2324
+ if not delete_in_storage:
2325
+ logger.important(f"a file/folder remains here: {path}")
2326
+ # we don't yet have logic to bring back the deleted metadata record
2327
+ # in case storage deletion fails - this is important for ACID down the road
2328
+ if delete_in_storage:
2329
+ delete_msg = delete_storage(path, raise_file_not_found_error=False)
2330
+ if delete_msg != "did-not-delete":
2331
+ logger.success(f"deleted {colors.yellow(f'{path}')}")
2332
+
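+ # Trash semantics above in a sketch; the key is illustrative:
+ # >>> artifact = ln.Artifact.get(key="some.csv")
+ # >>> artifact.delete()                # 1st call: moves to trash (_branch_code = -1)
+ # >>> artifact.delete(permanent=True)  # 2nd call: removes record and storage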
2333
+ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
2334
+ """Save to database & storage.
2335
+
2336
+ Args:
2337
+ upload: Trigger upload to cloud storage in instances with hybrid storage mode.
2338
+
2339
+ Examples:
2340
+ >>> artifact = ln.Artifact("./myfile.csv", description="myfile")
2341
+ >>> artifact.save()
2342
+ """
2343
+ state_was_adding = self._state.adding
2344
+ print_progress = kwargs.pop("print_progress", True)
2345
+ store_kwargs = kwargs.pop(
2346
+ "store_kwargs", {}
2347
+ ) # kwargs for .upload_from in the end
2348
+ access_token = kwargs.pop("access_token", None)
2349
+ local_path = None
2350
+ if upload and setup_settings.instance.keep_artifacts_local:
2351
+ # switch local storage location to cloud
2352
+ local_path = self.path
2353
+ self.storage_id = setup_settings.instance.storage.id
2354
+ self._local_filepath = local_path
2355
+ # switch to virtual storage key upon upload
2356
+ # the local filepath is already cached at that point
2357
+ self._key_is_virtual = True
2358
+ # ensure that the artifact is uploaded
2359
+ self._to_store = True
2360
+
2361
+ self._save_skip_storage(**kwargs)
2362
+
2363
+ from .save import check_and_attempt_clearing, check_and_attempt_upload
2364
+
2365
+ using_key = None
2366
+ if "using" in kwargs:
2367
+ using_key = kwargs["using"]
2368
+ exception_upload = check_and_attempt_upload(
2369
+ self,
2370
+ using_key,
2371
+ access_token=access_token,
2372
+ print_progress=print_progress,
2373
+ **store_kwargs,
2374
+ )
2375
+ if exception_upload is not None:
2376
+ # we do not want to raise file not found on cleanup if upload of a file failed
2377
+ # often it is ACID in the filesystem itself
2378
+ # for example, s3 won't have the failed file, so just skip the delete in this case
2379
+ raise_file_not_found_error = False
2380
+ self._delete_skip_storage()
2381
+ else:
2382
+ # this is the case when it is cleaned on .replace
2383
+ raise_file_not_found_error = True
2384
+ # this is triggered by an exception in check_and_attempt_upload or by replace.
2385
+ exception_clear = check_and_attempt_clearing(
2386
+ self,
2387
+ raise_file_not_found_error=raise_file_not_found_error,
2388
+ using_key=using_key,
2389
+ )
2390
+ if exception_upload is not None:
2391
+ raise RuntimeError(exception_upload)
2392
+ if exception_clear is not None:
2393
+ raise RuntimeError(exception_clear)
2394
+ # this is only for keep_artifacts_local
2395
+ if local_path is not None and not state_was_adding:
2396
+ # only move the local artifact to cache if it was not newly created
2397
+ local_path_cache = ln_setup.settings.cache_dir / local_path.name
2398
+ # don't use Path.rename here because of cross-device link error
2399
+ # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
2400
+ shutil.move(
2401
+ local_path, # type: ignore
2402
+ local_path_cache,
2403
+ )
2404
+ logger.important(f"moved local artifact to cache: {local_path_cache}")
2405
+ return self
2406
+
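+ # Sketch of the `keep_artifacts_local` flow above: in hybrid storage mode,
+ # `upload=True` switches the record to the cloud storage location, uploads,
+ # and moves the pre-existing local file into the cache. Key is illustrative:
+ # >>> artifact = ln.Artifact("./results.parquet", key="results.parquet").save()
+ # >>> artifact.save(upload=True)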
2407
+ def restore(self) -> None:
2408
+ """Restore from trash.
2409
+
2410
+ Examples:
2411
+ >>> artifact.restore()
2412
+ """
2413
+ self._branch_code = 1
2414
+ self.save()
2415
+
2416
+ def describe(self) -> None:
2417
+ """Describe relations of record.
2418
+
2419
+ Examples:
2420
+ >>> artifact.describe()
2421
+ """
2422
+ return describe_artifact_collection(self)
2423
+
2424
+ def _populate_subsequent_runs(self, run: Run) -> None:
2425
+ _populate_subsequent_runs_(self, run)
2426
+
2427
+
2428
+ # can't really just call .cache in .load because of double tracking
2429
+ def _synchronize_cleanup_on_error(
2430
+ filepath: UPath, cache_key: str | None = None
2431
+ ) -> UPath:
2432
+ try:
2433
+ cache_path = setup_settings.paths.cloud_to_local(
2434
+ filepath, cache_key=cache_key, print_progress=True
2435
+ )
2436
+ except Exception as e:
2437
+ if not isinstance(filepath, LocalPathClasses):
2438
+ cache_path = setup_settings.paths.cloud_to_local_no_update(
2439
+ filepath, cache_key=cache_key
2440
+ )
2441
+ if cache_path.is_dir():
2442
+ shutil.rmtree(cache_path)
2443
+ else:
2444
+ cache_path.unlink(missing_ok=True)
2445
+ raise e
2446
+ return cache_path
2447
+
2448
+
2449
+ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
2450
+ super(Artifact, artifact).delete(*args, **kwargs)
2451
+
2452
+
2453
+ def _save_skip_storage(artifact, **kwargs) -> None:
2454
+ save_staged_feature_sets(artifact)
2455
+ super(Artifact, artifact).save(**kwargs)
2456
+ save_schema_links(artifact)
2457
+
2458
+
2459
+ class ArtifactFeatureValue(BasicRecord, LinkORM, TracksRun):
2460
+ id: int = models.BigAutoField(primary_key=True)
2461
+ artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
2462
+ # we follow the lower() case convention rather than snake case for link models
2463
+ featurevalue = ForeignKey(FeatureValue, PROTECT, related_name="+")
2464
+
2465
+ class Meta:
2466
+ unique_together = ("artifact", "featurevalue")
2467
+
2468
+
2469
+ class ArtifactParamValue(BasicRecord, LinkORM, TracksRun):
2470
+ id: int = models.BigAutoField(primary_key=True)
2471
+ artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="+")
2472
+ # we follow the lower() case convention rather than snake case for link models
2473
+ paramvalue: ParamValue = ForeignKey(ParamValue, PROTECT, related_name="+")
2474
+
2475
+ class Meta:
2476
+ unique_together = ("artifact", "paramvalue")
2477
+
2478
+
2479
+ def _track_run_input(
2480
+ data: Artifact
2481
+ | Iterable[Artifact], # can also be Collection | Iterable[Collection]
2482
+ is_run_input: bool | Run | None = None,
2483
+ run: Run | None = None,
2484
+ ):
2485
+ from lamindb import settings
2486
+
2487
+ from .._tracked import get_current_tracked_run
2488
+ from ..core._context import context
2489
+ from .collection import Collection
2490
+
2491
+ if isinstance(is_run_input, Run):
2492
+ run = is_run_input
2493
+ is_run_input = True
2494
+ elif run is None:
2495
+ run = get_current_tracked_run()
2496
+ if run is None:
2497
+ run = context.run
2498
+ # consider that data is an iterable of Data
2499
+ data_iter: Iterable[Artifact] | Iterable[Collection] = (
2500
+ [data] if isinstance(data, (Artifact, Collection)) else data
2501
+ )
2502
+ track_run_input = False
2503
+ input_data = []
2504
+ if run is not None:
2505
+ # avoid cycles: data can't be both input and output
2506
+ def is_valid_input(data: Artifact | Collection):
2507
+ is_valid = False
2508
+ if data._state.db == "default":
2509
+ # things are OK if the record is on the default db
2510
+ is_valid = True
2511
+ elif data._state.db is None:
2512
+ # if a record is not yet saved, it can't be an input
2513
+ # we silently ignore because what likely happens is that
2514
+ # the user works with an object that's about to be saved
2515
+ # in the current Python session
2516
+ is_valid = False
2517
+ else:
2518
+ # record is on another db
2519
+ # we have to save the record into the current db with
2520
+ # the run being attached to a transfer transform
2521
+ logger.important(
2522
+ f"completing transfer to track {data.__class__.__name__}('{data.uid[:8]}') as input"
2523
+ )
2524
+ data.save()
2525
+ is_valid = True
2526
+ return (
2527
+ data.run_id != run.id
2528
+ and not data._state.adding # this seems duplicated with data._state.db is None
2529
+ and is_valid
2530
+ )
2531
+
2532
+ input_data = [data for data in data_iter if is_valid_input(data)]
2533
+ input_data_ids = [data.id for data in input_data]
2534
+ if input_data:
2535
+ data_class_name = input_data[0].__class__.__name__.lower()
2536
+ # let us first look at the case in which the user does not
2537
+ # provide a boolean value for `is_run_input`
2538
+ # hence, we need to determine whether we actually want to
2539
+ # track a run or not
2540
+ if is_run_input is None:
2541
+ # we don't have a run record
2542
+ if run is None:
2543
+ if settings.track_run_inputs:
2544
+ # here we check that this is not a read-only connection
2545
+ # normally for our connection strings the read-only role name has _read in it
2546
+ # not absolutely safe but the worst case is that the warning is not shown
2547
+ instance = setup_settings.instance
2548
+ if instance.dialect != "postgresql" or "_read" not in instance.db:
2549
+ logger.warning(WARNING_NO_INPUT)
2550
+ # assume we have a run record
2551
+ else:
2552
+ # assume there is non-cyclic candidate input data
2553
+ if input_data:
2554
+ if settings.track_run_inputs:
2555
+ transform_note = ""
2556
+ if len(input_data) == 1:
2557
+ if input_data[0].transform is not None:
2558
+ transform_note = (
2559
+ ", adding parent transform"
2560
+ f" {input_data[0].transform.id}"
2561
+ )
2562
+ logger.info(
2563
+ f"adding {data_class_name} ids {input_data_ids} as inputs for run"
2564
+ f" {run.id}{transform_note}"
2565
+ )
2566
+ track_run_input = True
2567
+ else:
2568
+ logger.hint(
2569
+ "track these data as a run input by passing `is_run_input=True`"
2570
+ )
2571
+ else:
2572
+ track_run_input = is_run_input
2573
+ if track_run_input:
2574
+ if run is None:
2575
+ raise ValueError("No run context set. Call `ln.track()`.")
2576
+ # avoid adding the same run twice
2577
+ run.save()
2578
+ if data_class_name == "artifact":
2579
+ LinkORM = run.input_artifacts.through
2580
+ links = [
2581
+ LinkORM(run_id=run.id, artifact_id=data_id)
2582
+ for data_id in input_data_ids
2583
+ ]
2584
+ else:
2585
+ LinkORM = run.input_collections.through
2586
+ links = [
2587
+ LinkORM(run_id=run.id, collection_id=data_id)
2588
+ for data_id in input_data_ids
2589
+ ]
2590
+ LinkORM.objects.bulk_create(links, ignore_conflicts=True)
2591
+ # generalize below for more than one data batch
2592
+ if len(input_data) == 1:
2593
+ if input_data[0].transform is not None:
2594
+ run.transform.predecessors.add(input_data[0].transform)
2595
+
2596
+
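+ # The helper above accepts a `Run` in place of the boolean flag; a sketch of
+ # the common call paths (the artifact variable is illustrative):
+ # >>> ln.track()                         # set a run context
+ # >>> df = artifact.load()               # tracked as input of the current run
+ # >>> artifact.load(is_run_input=False)  # opt out of input tracking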
2597
+ # privates currently dealt with separately
2598
+ # mypy: ignore-errors
2599
+ Artifact._delete_skip_storage = _delete_skip_storage
2600
+ Artifact._save_skip_storage = _save_skip_storage
2601
+ Artifact.view_lineage = view_lineage