lamindb 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. lamindb/__init__.py +33 -26
  2. lamindb/_finish.py +9 -1
  3. lamindb/_tracked.py +26 -3
  4. lamindb/_view.py +2 -3
  5. lamindb/base/__init__.py +1 -1
  6. lamindb/base/ids.py +1 -10
  7. lamindb/base/users.py +1 -4
  8. lamindb/core/__init__.py +7 -65
  9. lamindb/core/_compat.py +60 -0
  10. lamindb/core/_context.py +50 -22
  11. lamindb/core/_mapped_collection.py +4 -2
  12. lamindb/core/_settings.py +6 -6
  13. lamindb/core/_sync_git.py +1 -1
  14. lamindb/core/_track_environment.py +2 -1
  15. lamindb/core/datasets/_small.py +3 -3
  16. lamindb/core/loaders.py +43 -20
  17. lamindb/core/storage/_anndata_accessor.py +8 -3
  18. lamindb/core/storage/_backed_access.py +14 -7
  19. lamindb/core/storage/_pyarrow_dataset.py +24 -9
  20. lamindb/core/storage/_tiledbsoma.py +8 -6
  21. lamindb/core/storage/_zarr.py +104 -25
  22. lamindb/core/storage/objects.py +63 -28
  23. lamindb/core/storage/paths.py +16 -13
  24. lamindb/core/types.py +10 -0
  25. lamindb/curators/__init__.py +176 -149
  26. lamindb/errors.py +1 -1
  27. lamindb/integrations/_vitessce.py +4 -4
  28. lamindb/migrations/0089_subsequent_runs.py +159 -0
  29. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  30. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  31. lamindb/models/__init__.py +79 -0
  32. lamindb/{core → models}/_describe.py +3 -3
  33. lamindb/{core → models}/_django.py +8 -5
  34. lamindb/{core → models}/_feature_manager.py +103 -87
  35. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  36. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  37. lamindb/{core → models}/_label_manager.py +10 -17
  38. lamindb/{core/relations.py → models/_relations.py} +8 -1
  39. lamindb/models/artifact.py +2602 -0
  40. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  41. lamindb/models/collection.py +683 -0
  42. lamindb/models/core.py +135 -0
  43. lamindb/models/feature.py +643 -0
  44. lamindb/models/flextable.py +163 -0
  45. lamindb/{_parents.py → models/has_parents.py} +55 -49
  46. lamindb/models/project.py +384 -0
  47. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  48. lamindb/{_query_set.py → models/query_set.py} +64 -32
  49. lamindb/models/record.py +1762 -0
  50. lamindb/models/run.py +563 -0
  51. lamindb/{_save.py → models/save.py} +18 -8
  52. lamindb/models/schema.py +732 -0
  53. lamindb/models/transform.py +360 -0
  54. lamindb/models/ulabel.py +249 -0
  55. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
  56. lamindb-1.2.0.dist-info/RECORD +95 -0
  57. lamindb/_artifact.py +0 -1361
  58. lamindb/_collection.py +0 -440
  59. lamindb/_feature.py +0 -316
  60. lamindb/_is_versioned.py +0 -40
  61. lamindb/_record.py +0 -1065
  62. lamindb/_run.py +0 -60
  63. lamindb/_schema.py +0 -347
  64. lamindb/_storage.py +0 -15
  65. lamindb/_transform.py +0 -170
  66. lamindb/_ulabel.py +0 -56
  67. lamindb/_utils.py +0 -9
  68. lamindb/base/validation.py +0 -63
  69. lamindb/core/_data.py +0 -491
  70. lamindb/core/fields.py +0 -12
  71. lamindb/models.py +0 -4435
  72. lamindb-1.1.0.dist-info/RECORD +0 -95
  73. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
  74. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
lamindb/_artifact.py DELETED
@@ -1,1361 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import shutil
5
- from pathlib import Path, PurePath, PurePosixPath
6
- from typing import TYPE_CHECKING, Any
7
-
8
- import fsspec
9
- import lamindb_setup as ln_setup
10
- import pandas as pd
11
- from anndata import AnnData
12
- from django.db.models import Q
13
- from lamin_utils import colors, logger
14
- from lamindb_setup import settings as setup_settings
15
- from lamindb_setup._init_instance import register_storage_in_instance
16
- from lamindb_setup.core._docs import doc_args
17
- from lamindb_setup.core._settings_storage import init_storage
18
- from lamindb_setup.core.hashing import hash_dir, hash_file
19
- from lamindb_setup.core.upath import (
20
- create_path,
21
- extract_suffix_from_path,
22
- get_stat_dir_cloud,
23
- get_stat_file_cloud,
24
- )
25
-
26
- from lamindb._record import _get_record_kwargs
27
- from lamindb.errors import FieldValidationError
28
- from lamindb.models import Artifact, FeatureManager, ParamManager, Run, Storage
29
-
30
- from ._parents import view_lineage
31
- from ._utils import attach_func_to_class_method
32
- from .core._data import (
33
- _track_run_input,
34
- describe,
35
- get_run,
36
- save_schema_links,
37
- save_staged_feature_sets,
38
- )
39
- from .core._settings import settings
40
- from .core.loaders import load_to_memory
41
- from .core.storage import (
42
- LocalPathClasses,
43
- UPath,
44
- delete_storage,
45
- infer_suffix,
46
- write_to_disk,
47
- )
48
- from .core.storage._anndata_accessor import _anndata_n_observations
49
- from .core.storage._pyarrow_dataset import PYARROW_SUFFIXES
50
- from .core.storage._tiledbsoma import _soma_n_observations
51
- from .core.storage.objects import _mudata_is_installed
52
- from .core.storage.paths import (
53
- AUTO_KEY_PREFIX,
54
- auto_storage_key_from_artifact,
55
- auto_storage_key_from_artifact_uid,
56
- check_path_is_child_of_root,
57
- filepath_cache_key_from_artifact,
58
- filepath_from_artifact,
59
- )
60
- from .core.versioning import (
61
- create_uid,
62
- message_update_key_in_version_family,
63
- )
64
- from .errors import IntegrityError, InvalidArgument
65
-
66
- try:
67
- from .core.storage._zarr import zarr_is_adata
68
- except ImportError:
69
-
70
- def zarr_is_adata(storepath): # type: ignore
71
- raise ImportError("Please install zarr: pip install zarr<=2.18.4")
72
-
73
-
74
- if TYPE_CHECKING:
75
- from lamindb_setup.core.types import UPathStr
76
- from mudata import MuData
77
- from pyarrow.dataset import Dataset as PyArrowDataset
78
- from tiledbsoma import Collection as SOMACollection
79
- from tiledbsoma import Experiment as SOMAExperiment
80
- from tiledbsoma import Measurement as SOMAMeasurement
81
-
82
- from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
83
-
84
-
85
- def process_pathlike(
86
- filepath: UPath,
87
- default_storage: Storage,
88
- using_key: str | None,
89
- skip_existence_check: bool = False,
90
- ) -> tuple[Storage, bool]:
91
- """Determines the appropriate storage for a given path and whether to use an existing storage key."""
92
- if not skip_existence_check:
93
- try: # check if file exists
94
- if not filepath.exists():
95
- raise FileNotFoundError(filepath)
96
- except PermissionError:
97
- pass
98
- if check_path_is_child_of_root(filepath, default_storage.root):
99
- use_existing_storage_key = True
100
- return default_storage, use_existing_storage_key
101
- else:
102
- # check whether the path is part of one of the existing
103
- # already-registered storage locations
104
- result = False
105
- # within the hub, we don't want to perform check_path_in_existing_storage
106
- if using_key is None:
107
- result = check_path_in_existing_storage(filepath, using_key)
108
- if isinstance(result, Storage):
109
- use_existing_storage_key = True
110
- return result, use_existing_storage_key
111
- else:
112
- # if the path is in the cloud, we have a good candidate
113
- # for the storage root: the bucket
114
- if not isinstance(filepath, LocalPathClasses):
115
- # for a cloud path, new_root is always the bucket name
116
- if filepath.protocol == "hf":
117
- hf_path = filepath.fs.resolve_path(filepath.as_posix())
118
- hf_path.path_in_repo = ""
119
- new_root = "hf://" + hf_path.unresolve()
120
- else:
121
- if filepath.protocol == "s3":
122
- # check that endpoint_url didn't propagate here
123
- # as a part of the path string
124
- assert "?" not in filepath.path # noqa: S101
125
- new_root = list(filepath.parents)[-1]
126
- # do not register remote storage locations on hub if the current instance
127
- # is not managed on the hub
128
- storage_settings, _ = init_storage(
129
- new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
130
- )
131
- storage_record = register_storage_in_instance(storage_settings)
132
- use_existing_storage_key = True
133
- return storage_record, use_existing_storage_key
134
- # if the filepath is local
135
- else:
136
- use_existing_storage_key = False
137
- # if the default storage is local we'll throw an error if the user
138
- # doesn't provide a key
139
- if default_storage.type == "local":
140
- return default_storage, use_existing_storage_key
141
- # if the default storage is in the cloud (the file is going to
142
- # be uploaded upon saving it), we treat the filepath as a cache
143
- else:
144
- return default_storage, use_existing_storage_key
145
-
146
-
147
- def process_data(
148
- provisional_uid: str,
149
- data: UPathStr | pd.DataFrame | AnnData,
150
- format: str | None,
151
- key: str | None,
152
- default_storage: Storage,
153
- using_key: str | None,
154
- skip_existence_check: bool = False,
155
- ) -> tuple[Any, Path | UPath, str, Storage, bool]:
156
- """Serialize a data object that's provided as file or in memory."""
157
- # if not overwritten, data gets stored in default storage
158
- if _mudata_is_installed():
159
- from mudata import MuData
160
-
161
- data_types = (pd.DataFrame, AnnData, MuData)
162
- else:
163
- data_types = (pd.DataFrame, AnnData) # type:ignore
164
-
165
- if isinstance(data, (str, Path, UPath)): # UPathStr, spelled out
166
- access_token = (
167
- default_storage._access_token
168
- if hasattr(default_storage, "_access_token")
169
- else None
170
- )
171
- path = create_path(data, access_token=access_token).resolve()
172
- storage, use_existing_storage_key = process_pathlike(
173
- path,
174
- default_storage=default_storage,
175
- using_key=using_key,
176
- skip_existence_check=skip_existence_check,
177
- )
178
- suffix = extract_suffix_from_path(path)
179
- memory_rep = None
180
- elif isinstance(data, data_types):
181
- storage = default_storage
182
- memory_rep = data
183
- if key is not None:
184
- key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
185
- # use suffix as the (adata) format if the format is not provided
186
- if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
187
- format = key_suffix[1:]
188
- else:
189
- key_suffix = None
190
- suffix = infer_suffix(data, format)
191
- if key_suffix is not None and key_suffix != suffix:
192
- raise InvalidArgument(
193
- f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
194
- f" be '{suffix}'."
195
- )
196
- cache_name = f"{provisional_uid}{suffix}"
197
- path = settings.cache_dir / cache_name
198
- # Alex: I don't understand the line below
199
- if path.suffixes == []:
200
- path = path.with_suffix(suffix)
201
- write_to_disk(data, path)
202
- use_existing_storage_key = False
203
- else:
204
- raise NotImplementedError(
205
- f"Do not know how to create a artifact object from {data}, pass a path instead!"
206
- )
207
- return memory_rep, path, suffix, storage, use_existing_storage_key
208
-
209
-
210
- def get_stat_or_artifact(
211
- path: UPath,
212
- key: str | None = None,
213
- check_hash: bool = True,
214
- is_replace: bool = False,
215
- instance: str | None = None,
216
- ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
217
- """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
218
- n_files = None
219
- if settings.creation.artifact_skip_size_hash:
220
- return None, None, None, n_files, None
221
- stat = path.stat() # one network request
222
- if not isinstance(path, LocalPathClasses):
223
- size, hash, hash_type = None, None, None
224
- if stat is not None:
225
- # convert UPathStatResult to fsspec info dict
226
- stat = stat.as_info()
227
- if (store_type := stat["type"]) == "file":
228
- size, hash, hash_type = get_stat_file_cloud(stat)
229
- elif store_type == "directory":
230
- size, hash, hash_type, n_files = get_stat_dir_cloud(path)
231
- if hash is None:
232
- logger.warning(f"did not add hash for {path}")
233
- return size, hash, hash_type, n_files, None
234
- else:
235
- if path.is_dir():
236
- size, hash, hash_type, n_files = hash_dir(path)
237
- else:
238
- hash, hash_type = hash_file(path)
239
- size = stat.st_size
240
- if not check_hash:
241
- return size, hash, hash_type, n_files, None
242
- previous_artifact_version = None
243
- if key is None or is_replace:
244
- result = Artifact.objects.using(instance).filter(hash=hash).all()
245
- artifact_with_same_hash_exists = len(result) > 0
246
- else:
247
- storage_id = settings.storage.id
248
- result = (
249
- Artifact.objects.using(instance)
250
- .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
251
- .order_by("-created_at")
252
- .all()
253
- )
254
- artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
255
- if not artifact_with_same_hash_exists and len(result) > 0:
256
- logger.important(
257
- f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
258
- )
259
- previous_artifact_version = result[0]
260
- if artifact_with_same_hash_exists:
261
- message = "found artifact with same hash"
262
- if result[0]._branch_code == -1:
263
- result[0].restore()
264
- message = "restored artifact with same hash from trash"
265
- logger.important(
266
- f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
267
- )
268
- return result[0]
269
- else:
270
- return size, hash, hash_type, n_files, previous_artifact_version
271
-
272
-
273
- def check_path_in_existing_storage(
274
- path: Path | UPath, using_key: str | None = None
275
- ) -> Storage | bool:
276
- for storage in Storage.objects.using(using_key).filter().all():
277
- # if path is part of storage, return it
278
- if check_path_is_child_of_root(path, root=storage.root):
279
- return storage
280
- return False
281
-
282
-
283
- def get_relative_path_to_directory(
284
- path: PurePath | Path | UPath, directory: PurePath | Path | UPath
285
- ) -> PurePath | Path:
286
- if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
287
- # UPath.relative_to() is not behaving as it should (2023-04-07)
288
- # need to lstrip otherwise inconsistent behavior across trailing slashes
289
- # see test_artifact.py: test_get_relative_path_to_directory
290
- relpath = PurePath(
291
- path.as_posix().replace(directory.as_posix(), "").lstrip("/")
292
- )
293
- elif isinstance(directory, Path):
294
- relpath = path.resolve().relative_to(directory.resolve()) # type: ignore
295
- elif isinstance(directory, PurePath):
296
- relpath = path.relative_to(directory)
297
- else:
298
- raise TypeError("Directory not of type Path or UPath")
299
- return relpath
300
-
301
-
302
- def get_artifact_kwargs_from_data(
303
- *,
304
- data: Path | UPath | str | pd.DataFrame | AnnData | MuData,
305
- key: str | None,
306
- run: Run | None,
307
- format: str | None,
308
- provisional_uid: str,
309
- version: str | None,
310
- default_storage: Storage,
311
- using_key: str | None = None,
312
- is_replace: bool = False,
313
- skip_check_exists: bool = False,
314
- ):
315
- run = get_run(run)
316
- memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
317
- provisional_uid,
318
- data,
319
- format,
320
- key,
321
- default_storage,
322
- using_key,
323
- skip_check_exists,
324
- )
325
- stat_or_artifact = get_stat_or_artifact(
326
- path=path,
327
- key=key,
328
- instance=using_key,
329
- is_replace=is_replace,
330
- )
331
- if isinstance(stat_or_artifact, Artifact):
332
- artifact = stat_or_artifact
333
- # update the run of the existing artifact
334
- if run is not None:
335
- # save the information that this artifact was previously produced by
336
- # another run
337
- # note: same logic exists for _output_collections_with_later_updates
338
- if artifact.run is not None and artifact.run != run:
339
- artifact.run._output_artifacts_with_later_updates.add(artifact)
340
- # update the run of the artifact with the latest run
341
- stat_or_artifact.run = run
342
- return artifact, None
343
- else:
344
- size, hash, hash_type, n_files, revises = stat_or_artifact
345
-
346
- if revises is not None: # update provisional_uid
347
- provisional_uid, revises = create_uid(revises=revises, version=version)
348
- if settings.cache_dir in path.parents:
349
- path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
350
-
351
- check_path_in_storage = False
352
- if use_existing_storage_key:
353
- inferred_key = get_relative_path_to_directory(
354
- path=path, directory=UPath(storage.root)
355
- ).as_posix()
356
- if key is None:
357
- key = inferred_key
358
- else:
359
- if not key == inferred_key:
360
- raise InvalidArgument(
361
- f"The path '{data}' is already in registered storage"
362
- f" '{storage.root}' with key '{inferred_key}'\nYou passed"
363
- f" conflicting key '{key}': please move the file before"
364
- " registering it."
365
- )
366
- check_path_in_storage = True
367
- else:
368
- storage = default_storage
369
-
370
- log_storage_hint(
371
- check_path_in_storage=check_path_in_storage,
372
- storage=storage,
373
- key=key,
374
- uid=provisional_uid,
375
- suffix=suffix,
376
- is_dir=n_files is not None,
377
- )
378
-
379
- # do we use a virtual or an actual storage key?
380
- key_is_virtual = settings.creation._artifact_use_virtual_keys
381
-
382
- # if the file is already in storage, independent of the default
383
- # we use an actual storage key
384
- if check_path_in_storage:
385
- key_is_virtual = False
386
-
387
- kwargs = {
388
- "uid": provisional_uid,
389
- "suffix": suffix,
390
- "hash": hash,
391
- "_hash_type": hash_type,
392
- "key": key,
393
- "size": size,
394
- "storage_id": storage.id,
395
- # passing both the id and the object
396
- # to make them both available immediately
397
- # after object creation
398
- "n_files": n_files,
399
- "_overwrite_versions": n_files is not None, # True for folder, False for file
400
- "n_observations": None, # to implement
401
- "run_id": run.id if run is not None else None,
402
- "run": run,
403
- "_key_is_virtual": key_is_virtual,
404
- "revises": revises,
405
- }
406
- if not isinstance(path, LocalPathClasses):
407
- local_filepath = None
408
- cloud_filepath = path
409
- else:
410
- local_filepath = path
411
- cloud_filepath = None
412
- privates = {
413
- "local_filepath": local_filepath,
414
- "cloud_filepath": cloud_filepath,
415
- "memory_rep": memory_rep,
416
- "check_path_in_storage": check_path_in_storage,
417
- }
418
- return kwargs, privates
419
-
420
-
421
- def log_storage_hint(
422
- *,
423
- check_path_in_storage: bool,
424
- storage: Storage | None,
425
- key: str | None,
426
- uid: str,
427
- suffix: str,
428
- is_dir: bool,
429
- ) -> None:
430
- hint = ""
431
- if check_path_in_storage:
432
- display_root = storage.root # type: ignore
433
- # check whether path is local
434
- if fsspec.utils.get_protocol(storage.root) == "file": # type: ignore
435
- # if it's a local path, check whether it's in the current working directory
436
- root_path = Path(storage.root) # type: ignore
437
- if check_path_is_child_of_root(root_path, Path.cwd()):
438
- # only display the relative path, not the fully resolved path
439
- display_root = root_path.relative_to(Path.cwd()) # type: ignore
440
- hint += f"path in storage '{display_root}'" # type: ignore
441
- else:
442
- hint += "path content will be copied to default storage upon `save()`"
443
- if key is None:
444
- storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
445
- hint += f" with key `None` ('{storage_key}')"
446
- else:
447
- hint += f" with key '{key}'"
448
- logger.hint(hint)
449
-
450
-
451
- def data_is_anndata(data: AnnData | UPathStr) -> bool:
452
- if isinstance(data, AnnData):
453
- return True
454
- if isinstance(data, (str, Path, UPath)):
455
- data_path = UPath(data)
456
- if data_path.suffix == ".h5ad":
457
- return True
458
- elif data_path.suffix == ".zarr":
459
- # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
460
- if ".anndata" in data_path.suffixes:
461
- return True
462
- # check only for local, expensive for cloud
463
- if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
464
- return zarr_is_adata(data_path)
465
- else:
466
- logger.warning("We do not check if cloud zarr is AnnData or not")
467
- return False
468
- return False
469
-
470
-
471
- def data_is_mudata(data: MuData | UPathStr) -> bool:
472
- if _mudata_is_installed():
473
- from mudata import MuData
474
-
475
- if isinstance(data, MuData):
476
- return True
477
- if isinstance(data, (str, Path)):
478
- return UPath(data).suffix == ".h5mu"
479
- return False
480
-
481
-
482
- def _check_otype_artifact(data: Any, otype: str | None = None):
483
- if otype is None:
484
- if isinstance(data, pd.DataFrame):
485
- logger.warning("data is a DataFrame, please use .from_df()")
486
- otype = "DataFrame"
487
- return otype
488
-
489
- data_is_path = isinstance(data, (str, Path))
490
- if data_is_anndata(data):
491
- if not data_is_path:
492
- logger.warning("data is an AnnData, please use .from_anndata()")
493
- otype = "AnnData"
494
- elif data_is_mudata(data):
495
- if not data_is_path:
496
- logger.warning("data is a MuData, please use .from_mudata()")
497
- otype = "MuData"
498
- elif not data_is_path: # UPath is a subclass of Path
499
- raise TypeError("data has to be a string, Path, UPath")
500
- return otype
501
-
502
-
503
- def __init__(artifact: Artifact, *args, **kwargs):
504
- artifact.features = FeatureManager(artifact) # type: ignore
505
- artifact.params = ParamManager(artifact) # type: ignore
506
- # Below checks for the Django-internal call in from_db()
507
- # it'd be better if we could avoid this, but not being able to create a Artifact
508
- # from data with the default constructor renders the central class of the API
509
- # essentially useless
510
- # The danger below is not that a user might pass as many args (12 of it), but rather
511
- # that at some point the Django API might change; on the other hand, this
512
- # condition of for calling the constructor based on kwargs should always
513
- # stay robust
514
- if len(args) == len(artifact._meta.concrete_fields):
515
- super(Artifact, artifact).__init__(*args, **kwargs)
516
- return None
517
- # now we proceed with the user-facing constructor
518
- if len(args) > 1:
519
- raise ValueError("Only one non-keyword arg allowed: data")
520
-
521
- data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
522
- kind: str = kwargs.pop("kind") if "kind" in kwargs else None
523
- key: str | None = kwargs.pop("key") if "key" in kwargs else None
524
- run: Run | None = kwargs.pop("run") if "run" in kwargs else None
525
- description: str | None = (
526
- kwargs.pop("description") if "description" in kwargs else None
527
- )
528
- revises: Artifact | None = kwargs.pop("revises") if "revises" in kwargs else None
529
- version: str | None = kwargs.pop("version") if "version" in kwargs else None
530
- if "visibility" in kwargs:
531
- _branch_code = kwargs.pop("visibility")
532
- elif "_branch_code" in kwargs:
533
- _branch_code = kwargs.pop("_branch_code")
534
- else:
535
- _branch_code = 1
536
- format = kwargs.pop("format") if "format" in kwargs else None
537
- _is_internal_call = kwargs.pop("_is_internal_call", False)
538
- skip_check_exists = (
539
- kwargs.pop("skip_check_exists") if "skip_check_exists" in kwargs else False
540
- )
541
- if "default_storage" in kwargs:
542
- default_storage = kwargs.pop("default_storage")
543
- else:
544
- if setup_settings.instance.keep_artifacts_local:
545
- default_storage = setup_settings.instance.storage_local.record
546
- else:
547
- default_storage = setup_settings.instance.storage.record
548
- using_key = (
549
- kwargs.pop("using_key") if "using_key" in kwargs else settings._using_key
550
- )
551
- otype = kwargs.pop("otype") if "otype" in kwargs else None
552
- otype = _check_otype_artifact(data=data, otype=otype)
553
- if "type" in kwargs:
554
- logger.warning("`type` will be removed soon, please use `kind`")
555
- kind = kwargs.pop("type")
556
- if not len(kwargs) == 0:
557
- valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
558
- raise FieldValidationError(
559
- f"Only {valid_keywords} can be passed, you passed: {kwargs}"
560
- )
561
- if revises is not None and key is not None and revises.key != key:
562
- note = message_update_key_in_version_family(
563
- suid=revises.stem_uid,
564
- existing_key=revises.key,
565
- new_key=key,
566
- registry="Artifact",
567
- )
568
- raise ValueError(
569
- f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
570
- )
571
- if revises is not None:
572
- if not isinstance(revises, Artifact):
573
- raise TypeError("`revises` has to be of type `Artifact`")
574
- if description is None:
575
- description = revises.description
576
- if key is not None and AUTO_KEY_PREFIX in key:
577
- raise ValueError(
578
- f"Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
579
- )
580
- # below is for internal calls that require defining the storage location
581
- # ahead of constructing the Artifact
582
- if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
583
- if _is_internal_call:
584
- is_automanaged_path = True
585
- user_provided_key = key
586
- key = None
587
- else:
588
- raise ValueError(
589
- f"Do not pass path inside the `{AUTO_KEY_PREFIX}` directory."
590
- )
591
- else:
592
- is_automanaged_path = False
593
- provisional_uid, revises = create_uid(revises=revises, version=version)
594
- kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
595
- data=data,
596
- key=key,
597
- run=run,
598
- format=format,
599
- provisional_uid=provisional_uid,
600
- version=version,
601
- default_storage=default_storage,
602
- using_key=using_key,
603
- skip_check_exists=skip_check_exists,
604
- )
605
-
606
- # an object with the same hash already exists
607
- if isinstance(kwargs_or_artifact, Artifact):
608
- from ._record import init_self_from_db, update_attributes
609
-
610
- init_self_from_db(artifact, kwargs_or_artifact)
611
- # adding "key" here is dangerous because key might be auto-populated
612
- attr_to_update = {"description": description}
613
- if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None:
614
- attr_to_update["key"] = key
615
- elif artifact.key != key and key is not None:
616
- logger.warning(
617
- f"key {artifact.key} on existing artifact differs from passed key {key}"
618
- )
619
- update_attributes(artifact, attr_to_update)
620
- return None
621
- else:
622
- kwargs = kwargs_or_artifact
623
-
624
- if revises is None:
625
- revises = kwargs_or_artifact.pop("revises")
626
-
627
- if data is not None:
628
- artifact._local_filepath = privates["local_filepath"]
629
- artifact._cloud_filepath = privates["cloud_filepath"]
630
- artifact._memory_rep = privates["memory_rep"]
631
- artifact._to_store = not privates["check_path_in_storage"]
632
-
633
- if is_automanaged_path and _is_internal_call:
634
- kwargs["_key_is_virtual"] = True
635
- assert AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101
636
- uid = kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
637
- kwargs["key"] = user_provided_key
638
- if revises is not None:
639
- assert uid.startswith(revises.stem_uid) # noqa: S101
640
- if len(uid) == 16:
641
- if revises is None:
642
- uid += "0000"
643
- else:
644
- uid, revises = create_uid(revises=revises, version=version)
645
- kwargs["uid"] = uid
646
-
647
- # only set key now so that we don't do a look-up on it in case revises is passed
648
- if revises is not None:
649
- kwargs["key"] = revises.key
650
-
651
- kwargs["kind"] = kind
652
- kwargs["version"] = version
653
- kwargs["description"] = description
654
- kwargs["_branch_code"] = _branch_code
655
- kwargs["otype"] = otype
656
- kwargs["revises"] = revises
657
- # this check needs to come down here because key might be populated from an
658
- # existing file path during get_artifact_kwargs_from_data()
659
- if (
660
- kwargs["key"] is None
661
- and kwargs["description"] is None
662
- and kwargs["run"] is None
663
- ):
664
- raise ValueError("Pass one of key, run or description as a parameter")
665
-
666
- super(Artifact, artifact).__init__(**kwargs)
667
-
668
-
669
- @classmethod # type: ignore
670
- @doc_args(Artifact.from_df.__doc__)
671
- def from_df(
672
- cls,
673
- df: pd.DataFrame,
674
- *,
675
- key: str | None = None,
676
- description: str | None = None,
677
- run: Run | None = None,
678
- revises: Artifact | None = None,
679
- **kwargs,
680
- ) -> Artifact:
681
- """{}""" # noqa: D415
682
- artifact = Artifact( # type: ignore
683
- data=df,
684
- key=key,
685
- run=run,
686
- description=description,
687
- revises=revises,
688
- otype="DataFrame",
689
- kind="dataset",
690
- **kwargs,
691
- )
692
- return artifact
693
-
694
-
695
- @classmethod # type: ignore
696
- @doc_args(Artifact.from_anndata.__doc__)
697
- def from_anndata(
698
- cls,
699
- adata: AnnData | UPathStr,
700
- *,
701
- key: str | None = None,
702
- description: str | None = None,
703
- run: Run | None = None,
704
- revises: Artifact | None = None,
705
- **kwargs,
706
- ) -> Artifact:
707
- """{}""" # noqa: D415
708
- if not data_is_anndata(adata):
709
- raise ValueError("data has to be an AnnData object or a path to AnnData-like")
710
- _anndata_n_observations(adata)
711
- artifact = Artifact( # type: ignore
712
- data=adata,
713
- key=key,
714
- run=run,
715
- description=description,
716
- revises=revises,
717
- otype="AnnData",
718
- kind="dataset",
719
- **kwargs,
720
- )
721
- # this is done instead of _anndata_n_observations(adata)
722
- # because we need a proper path through create_path for cloud paths
723
- # for additional upath options etc that create_path adds
724
- obj_for_obs: AnnData | UPath
725
- if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
726
- obj_for_obs = artifact._memory_rep
727
- else:
728
- # returns ._local_filepath for local files
729
- # and the proper path through create_path for cloud paths
730
- obj_for_obs = artifact.path
731
- artifact.n_observations = _anndata_n_observations(obj_for_obs)
732
- return artifact
733
-
734
-
735
- @classmethod # type: ignore
736
- @doc_args(Artifact.from_mudata.__doc__)
737
- def from_mudata(
738
- cls,
739
- mdata: MuData,
740
- *,
741
- key: str | None = None,
742
- description: str | None = None,
743
- run: Run | None = None,
744
- revises: Artifact | None = None,
745
- **kwargs,
746
- ) -> Artifact:
747
- """{}""" # noqa: D415
748
- artifact = Artifact( # type: ignore
749
- data=mdata,
750
- key=key,
751
- run=run,
752
- description=description,
753
- revises=revises,
754
- otype="MuData",
755
- kind="dataset",
756
- **kwargs,
757
- )
758
- artifact.n_observations = mdata.n_obs
759
- return artifact
760
-
761
-
762
- @classmethod # type: ignore
763
- @doc_args(Artifact.from_tiledbsoma.__doc__)
764
- def from_tiledbsoma(
765
- cls,
766
- path: UPathStr,
767
- *,
768
- key: str | None = None,
769
- description: str | None = None,
770
- run: Run | None = None,
771
- revises: Artifact | None = None,
772
- **kwargs,
773
- ) -> Artifact:
774
- """{}""" # noqa: D415
775
- if UPath(path).suffix != ".tiledbsoma":
776
- raise ValueError(
777
- "A tiledbsoma store should have .tiledbsoma suffix to be registered."
778
- )
779
- artifact = Artifact( # type: ignore
780
- data=path,
781
- key=key,
782
- run=run,
783
- description=description,
784
- revises=revises,
785
- otype="tiledbsoma",
786
- kind="dataset",
787
- **kwargs,
788
- )
789
- artifact.n_observations = _soma_n_observations(artifact.path)
790
- return artifact
791
-
792
-
793
- @classmethod # type: ignore
794
- @doc_args(Artifact.from_dir.__doc__)
795
- def from_dir(
796
- cls,
797
- path: UPathStr,
798
- *,
799
- key: str | None = None,
800
- run: Run | None = None,
801
- ) -> list[Artifact]:
802
- """{}""" # noqa: D415
803
- folderpath: UPath = create_path(path) # returns Path for local
804
- default_storage = settings.storage.record
805
- using_key = settings._using_key
806
- storage, use_existing_storage = process_pathlike(
807
- folderpath, default_storage, using_key
808
- )
809
- folder_key_path: PurePath | Path
810
- if key is None:
811
- if not use_existing_storage:
812
- logger.warning(
813
- "folder is outside existing storage location, will copy files from"
814
- f" {path} to {storage.root}/{folderpath.name}"
815
- )
816
- folder_key_path = Path(folderpath.name)
817
- else:
818
- # maintain the hierachy within an existing storage location
819
- folder_key_path = get_relative_path_to_directory(
820
- folderpath, UPath(storage.root)
821
- )
822
- else:
823
- folder_key_path = Path(key)
824
-
825
- folder_key = folder_key_path.as_posix()
826
- # silence fine-grained logging
827
- verbosity = settings.verbosity
828
- verbosity_int = settings._verbosity_int
829
- if verbosity_int >= 1:
830
- settings.verbosity = "warning"
831
- artifacts_dict = {}
832
- for filepath in folderpath.rglob("*"):
833
- if filepath.is_file():
834
- relative_path = get_relative_path_to_directory(filepath, folderpath)
835
- artifact_key = folder_key + "/" + relative_path.as_posix()
836
- # if creating from rglob, we don't need to check for existence
837
- artifact = Artifact(
838
- filepath, run=run, key=artifact_key, skip_check_exists=True
839
- )
840
- artifacts_dict[artifact.uid] = artifact
841
- settings.verbosity = verbosity
842
-
843
- # run sanity check on hashes
844
- hashes = [
845
- artifact.hash
846
- for artifact in artifacts_dict.values()
847
- if artifact.hash is not None
848
- ]
849
- uids = artifacts_dict.keys()
850
- n_unique_hashes = len(set(hashes))
851
- if n_unique_hashes == len(hashes):
852
- artifacts = list(artifacts_dict.values())
853
- else:
854
- # consider exact duplicates (same id, same hash)
855
- # below can't happen anymore because artifacts is a dict now
856
- # if len(set(uids)) == len(set(hashes)):
857
- # logger.warning("dropping duplicate records in list of artifact records")
858
- # artifacts = list(set(uids))
859
- # consider false duplicates (different id, same hash)
860
- if not len(set(uids)) == n_unique_hashes:
861
- seen_hashes = set()
862
- non_unique_artifacts = {
863
- hash: artifact
864
- for hash, artifact in artifacts_dict.items()
865
- if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore
866
- }
867
- display_non_unique = "\n ".join(
868
- f"{artifact}" for artifact in non_unique_artifacts
869
- )
870
- logger.warning(
871
- "there are multiple artifact uids with the same hashes, dropping"
872
- f" {len(non_unique_artifacts)} duplicates out of"
873
- f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
874
- )
875
- artifacts = [
876
- artifact
877
- for artifact in artifacts_dict.values()
878
- if artifact not in non_unique_artifacts.values()
879
- ]
880
- logger.success(
881
- f"created {len(artifacts)} artifacts from directory using storage"
882
- f" {storage.root} and key = {folder_key}/"
883
- )
884
- return artifacts
885
-
886
-
887
- # docstring handled through attach_func_to_class_method
888
- def replace(
889
- self,
890
- data: UPathStr | pd.DataFrame | AnnData | MuData,
891
- run: Run | None = None,
892
- format: str | None = None,
893
- ) -> None:
894
- default_storage = settings.storage.record
895
- kwargs, privates = get_artifact_kwargs_from_data(
896
- provisional_uid=self.uid,
897
- data=data,
898
- key=self.key,
899
- run=run,
900
- format=format,
901
- default_storage=default_storage,
902
- version=None,
903
- is_replace=True,
904
- )
905
-
906
- # this artifact already exists
907
- if privates is None:
908
- return kwargs
909
-
910
- check_path_in_storage = privates["check_path_in_storage"]
911
- if check_path_in_storage:
912
- err_msg = (
913
- "Can only replace with a local path not in any Storage. "
914
- f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}."
915
- )
916
- raise ValueError(err_msg)
917
-
918
- _overwrite_versions = kwargs["_overwrite_versions"]
919
- if self._overwrite_versions != _overwrite_versions:
920
- err_msg = "It is not allowed to replace "
921
- err_msg += "a folder" if self._overwrite_versions else "a file"
922
- err_msg += " with " + ("a folder." if _overwrite_versions else "a file.")
923
- raise ValueError(err_msg)
924
-
925
- if self.key is not None and not self._key_is_virtual:
926
- key_path = PurePosixPath(self.key)
927
- new_filename = f"{key_path.stem}{kwargs['suffix']}"
928
- # the following will only be true if the suffix changes!
929
- if key_path.name != new_filename:
930
- self._clear_storagekey = self.key
931
- self.key = str(key_path.with_name(new_filename))
932
- # update old key with the new one so that checks in record pass
933
- self._old_key = self.key
934
- logger.warning(
935
- f"replacing the file will replace key '{key_path}' with '{self.key}'"
936
- f" and delete '{key_path}' upon `save()`"
937
- )
938
- else:
939
- old_storage = auto_storage_key_from_artifact(self)
940
- is_dir = self.n_files is not None
941
- new_storage = auto_storage_key_from_artifact_uid(
942
- self.uid, kwargs["suffix"], is_dir
943
- )
944
- if old_storage != new_storage:
945
- self._clear_storagekey = old_storage
946
- if self.key is not None:
947
- new_key_path = PurePosixPath(self.key).with_suffix(kwargs["suffix"])
948
- self.key = str(new_key_path)
949
- # update old key with the new one so that checks in record pass
950
- self._old_key = self.key
951
-
952
- self.suffix = kwargs["suffix"]
953
- self.size = kwargs["size"]
954
- self.hash = kwargs["hash"]
955
- self._hash_type = kwargs["_hash_type"]
956
- self.run_id = kwargs["run_id"]
957
- self.run = kwargs["run"]
958
- self.n_files = kwargs["n_files"]
959
-
960
- self._local_filepath = privates["local_filepath"]
961
- self._cloud_filepath = privates["cloud_filepath"]
962
- self._memory_rep = privates["memory_rep"]
963
- # no need to upload if new file is already in storage
964
- self._to_store = not check_path_in_storage
965
-
966
-
967
- inconsistent_state_msg = (
968
- "Trying to read a folder artifact from an outdated version, "
969
- "this can result in an incosistent state.\n"
970
- "Read from the latest version: artifact.versions.filter(is_latest=True).one()"
971
- )
972
-
973
-
974
- # docstring handled through attach_func_to_class_method
975
- def open(
976
- self, mode: str = "r", is_run_input: bool | None = None
977
- ) -> (
978
- AnnDataAccessor
979
- | BackedAccessor
980
- | SOMACollection
981
- | SOMAExperiment
982
- | SOMAMeasurement
983
- | PyArrowDataset
984
- ):
985
- if self._overwrite_versions and not self.is_latest:
986
- raise ValueError(inconsistent_state_msg)
987
- # ignore empty suffix for now
988
- suffixes = (
989
- "",
990
- ".h5",
991
- ".hdf5",
992
- ".h5ad",
993
- ".zarr",
994
- ".anndata.zarr",
995
- ".tiledbsoma",
996
- ) + PYARROW_SUFFIXES
997
- if self.suffix not in suffixes:
998
- raise ValueError(
999
- "Artifact should have a zarr, h5, tiledbsoma object"
1000
- " or a compatible `pyarrow.dataset.dataset` directory"
1001
- " as the underlying data, please use one of the following suffixes"
1002
- f" for the object name: {', '.join(suffixes[1:])}."
1003
- f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
1004
- " (no mixing allowed)."
1005
- )
1006
- if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
1007
- raise ValueError("Only a tiledbsoma store can be openened with `mode!='r'`.")
1008
-
1009
- from lamindb.core.storage._backed_access import _track_writes_factory, backed_access
1010
-
1011
- using_key = settings._using_key
1012
- filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
1013
- is_tiledbsoma_w = (
1014
- filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
1015
- ) and mode == "w"
1016
- # consider the case where an object is already locally cached
1017
- localpath = setup_settings.paths.cloud_to_local_no_update(
1018
- filepath, cache_key=cache_key
1019
- )
1020
- if is_tiledbsoma_w:
1021
- open_cache = False
1022
- else:
1023
- open_cache = not isinstance(
1024
- filepath, LocalPathClasses
1025
- ) and not filepath.synchronize(localpath, just_check=True)
1026
- if open_cache:
1027
- try:
1028
- access = backed_access(localpath, mode, using_key)
1029
- except Exception as e:
1030
- if isinstance(filepath, LocalPathClasses):
1031
- raise e
1032
- logger.warning(
1033
- f"The cache might be corrupted: {e}. Trying to open directly."
1034
- )
1035
- access = backed_access(filepath, mode, using_key)
1036
- # happens only if backed_access has been successful
1037
- # delete the corrupted cache
1038
- if localpath.is_dir():
1039
- shutil.rmtree(localpath)
1040
- else:
1041
- localpath.unlink(missing_ok=True)
1042
- else:
1043
- access = backed_access(filepath, mode, using_key)
1044
- if is_tiledbsoma_w:
1045
-
1046
- def finalize():
1047
- nonlocal self, filepath, localpath
1048
- if not isinstance(filepath, LocalPathClasses):
1049
- _, hash, _, _ = get_stat_dir_cloud(filepath)
1050
- else:
1051
- # this can be very slow
1052
- _, hash, _, _ = hash_dir(filepath)
1053
- if self.hash != hash:
1054
- from ._record import init_self_from_db
1055
-
1056
- new_version = Artifact(
1057
- filepath, revises=self, _is_internal_call=True
1058
- ).save()
1059
- init_self_from_db(self, new_version)
1060
-
1061
- if localpath != filepath and localpath.exists():
1062
- shutil.rmtree(localpath)
1063
-
1064
- access = _track_writes_factory(access, finalize)
1065
- # only call if open is successfull
1066
- _track_run_input(self, is_run_input)
1067
- return access
1068
-
1069
-
1070
- # can't really just call .cache in .load because of double tracking
1071
- def _synchronize_cleanup_on_error(
1072
- filepath: UPath, cache_key: str | None = None
1073
- ) -> UPath:
1074
- try:
1075
- cache_path = setup_settings.paths.cloud_to_local(
1076
- filepath, cache_key=cache_key, print_progress=True
1077
- )
1078
- except Exception as e:
1079
- if not isinstance(filepath, LocalPathClasses):
1080
- cache_path = setup_settings.paths.cloud_to_local_no_update(
1081
- filepath, cache_key=cache_key
1082
- )
1083
- if cache_path.is_dir():
1084
- shutil.rmtree(cache_path)
1085
- else:
1086
- cache_path.unlink(missing_ok=True)
1087
- raise e
1088
- return cache_path
1089
-
1090
-
1091
- # docstring handled through attach_func_to_class_method
1092
- def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
1093
- if self._overwrite_versions and not self.is_latest:
1094
- raise ValueError(inconsistent_state_msg)
1095
-
1096
- if hasattr(self, "_memory_rep") and self._memory_rep is not None:
1097
- access_memory = self._memory_rep
1098
- else:
1099
- filepath, cache_key = filepath_cache_key_from_artifact(
1100
- self, using_key=settings._using_key
1101
- )
1102
- cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
1103
- try:
1104
- # cache_path is local so doesn't trigger any sync in load_to_memory
1105
- access_memory = load_to_memory(cache_path, **kwargs)
1106
- except Exception as e:
1107
- # just raise the exception if the original path is local
1108
- if isinstance(filepath, LocalPathClasses):
1109
- raise e
1110
- logger.warning(
1111
- f"The cache might be corrupted: {e}. Retrying to synchronize."
1112
- )
1113
- # delete the existing cache
1114
- if cache_path.is_dir():
1115
- shutil.rmtree(cache_path)
1116
- else:
1117
- cache_path.unlink(missing_ok=True)
1118
- # download again and try to load into memory
1119
- cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
1120
- access_memory = load_to_memory(cache_path, **kwargs)
1121
- # only call if load is successfull
1122
- _track_run_input(self, is_run_input)
1123
- return access_memory
1124
-
1125
-
1126
- # docstring handled through attach_func_to_class_method
1127
- def cache(self, is_run_input: bool | None = None) -> Path:
1128
- if self._overwrite_versions and not self.is_latest:
1129
- raise ValueError(inconsistent_state_msg)
1130
-
1131
- filepath, cache_key = filepath_cache_key_from_artifact(
1132
- self, using_key=settings._using_key
1133
- )
1134
- cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
1135
- # only call if sync is successfull
1136
- _track_run_input(self, is_run_input)
1137
- return cache_path
1138
-
1139
-
1140
- # docstring handled through attach_func_to_class_method
1141
- def delete(
1142
- self,
1143
- permanent: bool | None = None,
1144
- storage: bool | None = None,
1145
- using_key: str | None = None,
1146
- ) -> None:
1147
- # this first check means an invalid delete fails fast rather than cascading through
1148
- # database and storage permission errors
1149
- if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
1150
- isettings = setup_settings.instance
1151
- if self.storage.instance_uid != isettings.uid and (storage or storage is None):
1152
- raise IntegrityError(
1153
- "Cannot simply delete artifacts outside of this instance's managed storage locations."
1154
- "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
1155
- f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
1156
- f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
1157
- )
1158
- # by default, we only move artifacts into the trash (_branch_code = -1)
1159
- trash__branch_code = -1
1160
- if self._branch_code > trash__branch_code and not permanent:
1161
- if storage is not None:
1162
- logger.warning("moving artifact to trash, storage arg is ignored")
1163
- # move to trash
1164
- self._branch_code = trash__branch_code
1165
- self.save()
1166
- logger.important(
1167
- f"moved artifact to trash (_branch_code = {trash__branch_code})"
1168
- )
1169
- return
1170
-
1171
- # if the artifact is already in the trash
1172
- # permanent delete skips the trash
1173
- if permanent is None:
1174
- # ask for confirmation of permanent delete
1175
- response = input(
1176
- "Artifact record is already in trash! Are you sure you want to permanently"
1177
- " delete it? (y/n) You can't undo this action."
1178
- )
1179
- delete_record = response == "y"
1180
- else:
1181
- assert permanent # noqa: S101
1182
- delete_record = True
1183
-
1184
- if delete_record:
1185
- # need to grab file path before deletion
1186
- try:
1187
- path, _ = filepath_from_artifact(self, using_key)
1188
- except OSError:
1189
- # we can still delete the record
1190
- logger.warning("Could not get path")
1191
- storage = False
1192
- # only delete in storage if DB delete is successful
1193
- # DB delete might error because of a foreign key constraint violated etc.
1194
- if self._overwrite_versions and self.is_latest:
1195
- # includes self
1196
- for version in self.versions.all():
1197
- _delete_skip_storage(version)
1198
- else:
1199
- self._delete_skip_storage()
1200
- # by default do not delete storage if deleting only a previous version
1201
- # and the underlying store is mutable
1202
- if self._overwrite_versions and not self.is_latest:
1203
- delete_in_storage = False
1204
- if storage:
1205
- logger.warning(
1206
- "Storage argument is ignored; can't delete storage on an previous version"
1207
- )
1208
- elif self.key is None or self._key_is_virtual:
1209
- # do not ask for confirmation also if storage is None
1210
- delete_in_storage = storage is None or storage
1211
- else:
1212
- # for artifacts with non-virtual semantic storage keys (key is not None)
1213
- # ask for extra-confirmation
1214
- if storage is None:
1215
- response = input(
1216
- f"Are you sure to want to delete {path}? (y/n) You can't undo"
1217
- " this action."
1218
- )
1219
- delete_in_storage = response == "y"
1220
- else:
1221
- delete_in_storage = storage
1222
- if not delete_in_storage:
1223
- logger.important(f"a file/folder remains here: {path}")
1224
- # we don't yet have logic to bring back the deleted metadata record
1225
- # in case storage deletion fails - this is important for ACID down the road
1226
- if delete_in_storage:
1227
- delete_msg = delete_storage(path, raise_file_not_found_error=False)
1228
- if delete_msg != "did-not-delete":
1229
- logger.success(f"deleted {colors.yellow(f'{path}')}")
1230
-
1231
-
1232
- def _delete_skip_storage(artifact, *args, **kwargs) -> None:
1233
- super(Artifact, artifact).delete(*args, **kwargs)
1234
-
1235
-
1236
- # docstring handled through attach_func_to_class_method
1237
- def save(self, upload: bool | None = None, **kwargs) -> Artifact:
1238
- state_was_adding = self._state.adding
1239
- print_progress = kwargs.pop("print_progress", True)
1240
- access_token = kwargs.pop("access_token", None)
1241
- local_path = None
1242
- if upload and setup_settings.instance.keep_artifacts_local:
1243
- # switch local storage location to cloud
1244
- local_path = self.path
1245
- self.storage_id = setup_settings.instance.storage.id
1246
- self._local_filepath = local_path
1247
- # switch to virtual storage key upon upload
1248
- # the local filepath is already cached at that point
1249
- self._key_is_virtual = True
1250
- # ensure that the artifact is uploaded
1251
- self._to_store = True
1252
-
1253
- self._save_skip_storage(**kwargs)
1254
-
1255
- from lamindb._save import check_and_attempt_clearing, check_and_attempt_upload
1256
-
1257
- using_key = None
1258
- if "using" in kwargs:
1259
- using_key = kwargs["using"]
1260
- exception_upload = check_and_attempt_upload(
1261
- self, using_key, access_token=access_token, print_progress=print_progress
1262
- )
1263
- if exception_upload is not None:
1264
- # we do not want to raise file not found on cleanup if upload of a file failed
1265
- # often it is ACID in the filesystem itself
1266
- # for example, s3 won't have the failed file, so just skip the delete in this case
1267
- raise_file_not_found_error = False
1268
- self._delete_skip_storage()
1269
- else:
1270
- # this is the case when it is cleaned on .replace
1271
- raise_file_not_found_error = True
1272
- # this is triggered by an exception in check_and_attempt_upload or by replace.
1273
- exception_clear = check_and_attempt_clearing(
1274
- self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key
1275
- )
1276
- if exception_upload is not None:
1277
- raise RuntimeError(exception_upload)
1278
- if exception_clear is not None:
1279
- raise RuntimeError(exception_clear)
1280
- # this is only for keep_artifacts_local
1281
- if local_path is not None and not state_was_adding:
1282
- # only move the local artifact to cache if it was not newly created
1283
- local_path_cache = ln_setup.settings.cache_dir / local_path.name
1284
- # don't use Path.rename here because of cross-device link error
1285
- # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
1286
- shutil.move(
1287
- local_path, # type: ignore
1288
- local_path_cache,
1289
- )
1290
- logger.important(f"moved local artifact to cache: {local_path_cache}")
1291
- return self
1292
-
1293
-
1294
- def _save_skip_storage(file, **kwargs) -> None:
1295
- save_staged_feature_sets(file)
1296
- super(Artifact, file).save(**kwargs)
1297
- save_schema_links(file)
1298
-
1299
-
1300
- @property # type: ignore
1301
- @doc_args(Artifact.path.__doc__)
1302
- def path(self) -> Path | UPath:
1303
- """{}""" # noqa: D415
1304
- # return only the path, without StorageSettings
1305
- filepath, _ = filepath_from_artifact(self, using_key=settings._using_key)
1306
- return filepath
1307
-
1308
-
1309
- # get cache path without triggering sync
1310
- @property # type: ignore
1311
- def _cache_path(self) -> UPath:
1312
- filepath, cache_key = filepath_cache_key_from_artifact(
1313
- self, using_key=settings._using_key
1314
- )
1315
- if isinstance(filepath, LocalPathClasses):
1316
- return filepath
1317
- return setup_settings.paths.cloud_to_local_no_update(filepath, cache_key=cache_key)
1318
-
1319
-
1320
- # docstring handled through attach_func_to_class_method
1321
- def restore(self) -> None:
1322
- self._branch_code = 1
1323
- self.save()
1324
-
1325
-
1326
- METHOD_NAMES = [
1327
- "__init__",
1328
- "from_anndata",
1329
- "from_df",
1330
- "from_mudata",
1331
- "from_tiledbsoma",
1332
- "open",
1333
- "cache",
1334
- "load",
1335
- "delete",
1336
- "save",
1337
- "replace",
1338
- "from_dir",
1339
- "restore",
1340
- ]
1341
-
1342
- if ln_setup._TESTING:
1343
- from inspect import signature
1344
-
1345
- SIGS = {
1346
- name: signature(getattr(Artifact, name))
1347
- for name in METHOD_NAMES
1348
- if name != "__init__"
1349
- }
1350
-
1351
- for name in METHOD_NAMES:
1352
- attach_func_to_class_method(name, Artifact, globals())
1353
-
1354
- # privates currently dealt with separately
1355
- # mypy: ignore-errors
1356
- Artifact._delete_skip_storage = _delete_skip_storage
1357
- Artifact._save_skip_storage = _save_skip_storage
1358
- Artifact._cache_path = _cache_path
1359
- Artifact.path = path
1360
- Artifact.describe = describe
1361
- Artifact.view_lineage = view_lineage