lamindb 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (66)
  1. lamindb/__init__.py +30 -25
  2. lamindb/_tracked.py +1 -1
  3. lamindb/_view.py +2 -3
  4. lamindb/base/__init__.py +1 -1
  5. lamindb/base/ids.py +1 -10
  6. lamindb/core/__init__.py +7 -65
  7. lamindb/core/_compat.py +60 -0
  8. lamindb/core/_context.py +43 -20
  9. lamindb/core/_settings.py +6 -6
  10. lamindb/core/_sync_git.py +1 -1
  11. lamindb/core/loaders.py +30 -19
  12. lamindb/core/storage/_backed_access.py +4 -2
  13. lamindb/core/storage/_tiledbsoma.py +8 -6
  14. lamindb/core/storage/_zarr.py +104 -25
  15. lamindb/core/storage/objects.py +63 -28
  16. lamindb/core/storage/paths.py +4 -1
  17. lamindb/core/types.py +10 -0
  18. lamindb/curators/__init__.py +100 -85
  19. lamindb/errors.py +1 -1
  20. lamindb/integrations/_vitessce.py +4 -4
  21. lamindb/migrations/0089_subsequent_runs.py +159 -0
  22. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  23. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  24. lamindb/models/__init__.py +79 -0
  25. lamindb/{core → models}/_describe.py +3 -3
  26. lamindb/{core → models}/_django.py +8 -5
  27. lamindb/{core → models}/_feature_manager.py +103 -87
  28. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  29. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  30. lamindb/{core → models}/_label_manager.py +10 -17
  31. lamindb/{core/relations.py → models/_relations.py} +8 -1
  32. lamindb/models/artifact.py +2602 -0
  33. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  34. lamindb/models/collection.py +683 -0
  35. lamindb/models/core.py +135 -0
  36. lamindb/models/feature.py +643 -0
  37. lamindb/models/flextable.py +163 -0
  38. lamindb/{_parents.py → models/has_parents.py} +55 -49
  39. lamindb/models/project.py +384 -0
  40. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  41. lamindb/{_query_set.py → models/query_set.py} +40 -26
  42. lamindb/models/record.py +1762 -0
  43. lamindb/models/run.py +563 -0
  44. lamindb/{_save.py → models/save.py} +9 -7
  45. lamindb/models/schema.py +732 -0
  46. lamindb/models/transform.py +360 -0
  47. lamindb/models/ulabel.py +249 -0
  48. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
  49. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/RECORD +51 -51
  50. lamindb/_artifact.py +0 -1379
  51. lamindb/_collection.py +0 -440
  52. lamindb/_feature.py +0 -316
  53. lamindb/_is_versioned.py +0 -40
  54. lamindb/_record.py +0 -1064
  55. lamindb/_run.py +0 -60
  56. lamindb/_schema.py +0 -347
  57. lamindb/_storage.py +0 -15
  58. lamindb/_transform.py +0 -170
  59. lamindb/_ulabel.py +0 -56
  60. lamindb/_utils.py +0 -9
  61. lamindb/base/validation.py +0 -63
  62. lamindb/core/_data.py +0 -491
  63. lamindb/core/fields.py +0 -12
  64. lamindb/models.py +0 -4475
  65. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
  66. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
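The listing above amounts to a restructuring of lamindb's internals for 1.2.0: the private top-level modules (lamindb/_artifact.py, lamindb/_record.py, lamindb/_query_set.py, ...) and the monolithic lamindb/models.py are removed, and their contents move into the new lamindb/models/ package. A minimal sketch of what that implies for imports, assuming the public top-level API stays re-exported as before (the class names below are illustrative and derived only from the renamed module paths in this listing):

    import lamindb as ln

    # Public entry points remain importable from the package root in both versions.
    Artifact = ln.Artifact

    # Code that reached into private modules has to follow the moves listed above, e.g.:
    #   1.1.1: from lamindb._query_set import QuerySet
    #   1.2.0: from lamindb.models.query_set import QuerySet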
lamindb/_artifact.py DELETED
@@ -1,1379 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import shutil
5
- from pathlib import Path, PurePath, PurePosixPath
6
- from typing import TYPE_CHECKING, Any
7
-
8
- import fsspec
9
- import lamindb_setup as ln_setup
10
- import pandas as pd
11
- from anndata import AnnData
12
- from django.db.models import Q
13
- from lamin_utils import colors, logger
14
- from lamindb_setup import settings as setup_settings
15
- from lamindb_setup._init_instance import register_storage_in_instance
16
- from lamindb_setup.core._docs import doc_args
17
- from lamindb_setup.core._settings_storage import init_storage
18
- from lamindb_setup.core.hashing import hash_dir, hash_file
19
- from lamindb_setup.core.upath import (
20
- create_path,
21
- extract_suffix_from_path,
22
- get_stat_dir_cloud,
23
- get_stat_file_cloud,
24
- )
25
-
26
- from lamindb._record import _get_record_kwargs
27
- from lamindb.errors import FieldValidationError
28
- from lamindb.models import Artifact, FeatureManager, ParamManager, Run, Storage
29
-
30
- from ._parents import view_lineage
31
- from ._utils import attach_func_to_class_method
32
- from .core._data import (
33
- _track_run_input,
34
- describe,
35
- get_run,
36
- save_schema_links,
37
- save_staged_feature_sets,
38
- )
39
- from .core._settings import settings
40
- from .core.loaders import load_to_memory
41
- from .core.storage import (
42
- LocalPathClasses,
43
- UPath,
44
- delete_storage,
45
- infer_suffix,
46
- write_to_disk,
47
- )
48
- from .core.storage._anndata_accessor import _anndata_n_observations
49
- from .core.storage._pyarrow_dataset import PYARROW_SUFFIXES
50
- from .core.storage._tiledbsoma import _soma_n_observations
51
- from .core.storage.objects import _mudata_is_installed
52
- from .core.storage.paths import (
53
- AUTO_KEY_PREFIX,
54
- auto_storage_key_from_artifact,
55
- auto_storage_key_from_artifact_uid,
56
- check_path_is_child_of_root,
57
- filepath_cache_key_from_artifact,
58
- filepath_from_artifact,
59
- )
60
- from .core.versioning import (
61
- create_uid,
62
- message_update_key_in_version_family,
63
- )
64
- from .errors import IntegrityError, InvalidArgument
65
-
66
- try:
67
- from .core.storage._zarr import zarr_is_adata
68
- except ImportError:
69
-
70
- def zarr_is_adata(storepath): # type: ignore
71
- raise ImportError("Please install zarr: pip install zarr<=2.18.4")
72
-
73
-
74
- if TYPE_CHECKING:
75
- from lamindb_setup.core.types import UPathStr
76
- from mudata import MuData
77
- from pyarrow.dataset import Dataset as PyArrowDataset
78
- from tiledbsoma import Collection as SOMACollection
79
- from tiledbsoma import Experiment as SOMAExperiment
80
- from tiledbsoma import Measurement as SOMAMeasurement
81
-
82
- from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
83
-
84
-
85
- def process_pathlike(
86
- filepath: UPath,
87
- default_storage: Storage,
88
- using_key: str | None,
89
- skip_existence_check: bool = False,
90
- ) -> tuple[Storage, bool]:
91
- """Determines the appropriate storage for a given path and whether to use an existing storage key."""
92
- if not skip_existence_check:
93
- try: # check if file exists
94
- if not filepath.exists():
95
- raise FileNotFoundError(filepath)
96
- except PermissionError:
97
- pass
98
- if check_path_is_child_of_root(filepath, default_storage.root):
99
- use_existing_storage_key = True
100
- return default_storage, use_existing_storage_key
101
- else:
102
- # check whether the path is part of one of the existing
103
- # already-registered storage locations
104
- result = False
105
- # within the hub, we don't want to perform check_path_in_existing_storage
106
- if using_key is None:
107
- result = check_path_in_existing_storage(filepath, using_key)
108
- if isinstance(result, Storage):
109
- use_existing_storage_key = True
110
- return result, use_existing_storage_key
111
- else:
112
- # if the path is in the cloud, we have a good candidate
113
- # for the storage root: the bucket
114
- if not isinstance(filepath, LocalPathClasses):
115
- # for a cloud path, new_root is always the bucket name
116
- if filepath.protocol == "hf":
117
- hf_path = filepath.fs.resolve_path(filepath.as_posix())
118
- hf_path.path_in_repo = ""
119
- new_root = "hf://" + hf_path.unresolve()
120
- else:
121
- if filepath.protocol == "s3":
122
- # check that endpoint_url didn't propagate here
123
- # as a part of the path string
124
- assert "?" not in filepath.path # noqa: S101
125
- new_root = list(filepath.parents)[-1]
126
- # do not register remote storage locations on hub if the current instance
127
- # is not managed on the hub
128
- storage_settings, _ = init_storage(
129
- new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
130
- )
131
- storage_record = register_storage_in_instance(storage_settings)
132
- use_existing_storage_key = True
133
- return storage_record, use_existing_storage_key
134
- # if the filepath is local
135
- else:
136
- use_existing_storage_key = False
137
- # if the default storage is local we'll throw an error if the user
138
- # doesn't provide a key
139
- if default_storage.type == "local":
140
- return default_storage, use_existing_storage_key
141
- # if the default storage is in the cloud (the file is going to
142
- # be uploaded upon saving it), we treat the filepath as a cache
143
- else:
144
- return default_storage, use_existing_storage_key
145
-
146
-
147
- def process_data(
148
- provisional_uid: str,
149
- data: UPathStr | pd.DataFrame | AnnData,
150
- format: str | None,
151
- key: str | None,
152
- default_storage: Storage,
153
- using_key: str | None,
154
- skip_existence_check: bool = False,
155
- is_replace: bool = False,
156
- ) -> tuple[Any, Path | UPath, str, Storage, bool]:
157
- """Serialize a data object that's provided as file or in memory."""
158
- # if not overwritten, data gets stored in default storage
159
- if _mudata_is_installed():
160
- from mudata import MuData
161
-
162
- data_types = (pd.DataFrame, AnnData, MuData)
163
- else:
164
- data_types = (pd.DataFrame, AnnData) # type:ignore
165
- if key is not None:
166
- key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
167
- # use suffix as the (adata) format if the format is not provided
168
- if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
169
- format = key_suffix[1:]
170
- else:
171
- key_suffix = None
172
- if isinstance(data, (str, Path, UPath)): # UPathStr, spelled out
173
- access_token = (
174
- default_storage._access_token
175
- if hasattr(default_storage, "_access_token")
176
- else None
177
- )
178
- path = create_path(data, access_token=access_token)
179
- # we don't resolve http links because they can resolve into a different domain
180
- # for example into a temporary url
181
- if path.protocol not in {"http", "https"}:
182
- path = path.resolve()
183
- storage, use_existing_storage_key = process_pathlike(
184
- path,
185
- default_storage=default_storage,
186
- using_key=using_key,
187
- skip_existence_check=skip_existence_check,
188
- )
189
- suffix = extract_suffix_from_path(path)
190
- memory_rep = None
191
- elif isinstance(data, data_types):
192
- storage = default_storage
193
- memory_rep = data
194
- suffix = infer_suffix(data, format)
195
- else:
196
- raise NotImplementedError(
197
- f"Do not know how to create a artifact object from {data}, pass a path instead!"
198
- )
199
- if key_suffix is not None and key_suffix != suffix and not is_replace:
200
- # consciously omitting a trailing period
201
- if isinstance(data, (str, Path, UPath)):
202
- message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
203
- else:
204
- message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
205
- raise InvalidArgument(message)
206
- # in case we have an in-memory representation, we need to write it to disk
207
- if isinstance(data, data_types):
208
- path = settings.cache_dir / f"{provisional_uid}{suffix}"
209
- write_to_disk(data, path)
210
- use_existing_storage_key = False
211
- return memory_rep, path, suffix, storage, use_existing_storage_key
212
-
213
-
214
- def get_stat_or_artifact(
215
- path: UPath,
216
- key: str | None = None,
217
- check_hash: bool = True,
218
- is_replace: bool = False,
219
- instance: str | None = None,
220
- ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
221
- """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
222
- n_files = None
223
- if settings.creation.artifact_skip_size_hash:
224
- return None, None, None, n_files, None
225
- stat = path.stat() # one network request
226
- if not isinstance(path, LocalPathClasses):
227
- size, hash, hash_type = None, None, None
228
- if stat is not None:
229
- # convert UPathStatResult to fsspec info dict
230
- stat = stat.as_info()
231
- if (store_type := stat["type"]) == "file":
232
- size, hash, hash_type = get_stat_file_cloud(stat)
233
- elif store_type == "directory":
234
- size, hash, hash_type, n_files = get_stat_dir_cloud(path)
235
- if hash is None:
236
- logger.warning(f"did not add hash for {path}")
237
- return size, hash, hash_type, n_files, None
238
- else:
239
- if path.is_dir():
240
- size, hash, hash_type, n_files = hash_dir(path)
241
- else:
242
- hash, hash_type = hash_file(path)
243
- size = stat.st_size
244
- if not check_hash:
245
- return size, hash, hash_type, n_files, None
246
- previous_artifact_version = None
247
- if key is None or is_replace:
248
- result = Artifact.objects.using(instance).filter(hash=hash).all()
249
- artifact_with_same_hash_exists = len(result) > 0
250
- else:
251
- storage_id = settings.storage.id
252
- result = (
253
- Artifact.objects.using(instance)
254
- .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
255
- .order_by("-created_at")
256
- .all()
257
- )
258
- artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
259
- if not artifact_with_same_hash_exists and len(result) > 0:
260
- logger.important(
261
- f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
262
- )
263
- previous_artifact_version = result[0]
264
- if artifact_with_same_hash_exists:
265
- message = "found artifact with same hash"
266
- if result[0]._branch_code == -1:
267
- result[0].restore()
268
- message = "restored artifact with same hash from trash"
269
- logger.important(
270
- f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
271
- )
272
- return result[0]
273
- else:
274
- return size, hash, hash_type, n_files, previous_artifact_version
275
-
276
-
277
- def check_path_in_existing_storage(
278
- path: Path | UPath, using_key: str | None = None
279
- ) -> Storage | bool:
280
- for storage in Storage.objects.using(using_key).filter().all():
281
- # if path is part of storage, return it
282
- if check_path_is_child_of_root(path, root=storage.root):
283
- return storage
284
- return False
285
-
286
-
287
- def get_relative_path_to_directory(
288
- path: PurePath | Path | UPath, directory: PurePath | Path | UPath
289
- ) -> PurePath | Path:
290
- if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
291
- # UPath.relative_to() is not behaving as it should (2023-04-07)
292
- # need to lstrip otherwise inconsistent behavior across trailing slashes
293
- # see test_artifact.py: test_get_relative_path_to_directory
294
- relpath = PurePath(
295
- path.as_posix().replace(directory.as_posix(), "").lstrip("/")
296
- )
297
- elif isinstance(directory, Path):
298
- relpath = path.resolve().relative_to(directory.resolve()) # type: ignore
299
- elif isinstance(directory, PurePath):
300
- relpath = path.relative_to(directory)
301
- else:
302
- raise TypeError("Directory not of type Path or UPath")
303
- return relpath
304
-
305
-
306
- def get_artifact_kwargs_from_data(
307
- *,
308
- data: Path | UPath | str | pd.DataFrame | AnnData | MuData,
309
- key: str | None,
310
- run: Run | None,
311
- format: str | None,
312
- provisional_uid: str,
313
- version: str | None,
314
- default_storage: Storage,
315
- using_key: str | None = None,
316
- is_replace: bool = False,
317
- skip_check_exists: bool = False,
318
- ):
319
- run = get_run(run)
320
- memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
321
- provisional_uid,
322
- data,
323
- format,
324
- key,
325
- default_storage,
326
- using_key,
327
- skip_check_exists,
328
- is_replace=is_replace,
329
- )
330
- stat_or_artifact = get_stat_or_artifact(
331
- path=path,
332
- key=key,
333
- instance=using_key,
334
- is_replace=is_replace,
335
- )
336
- if isinstance(stat_or_artifact, Artifact):
337
- artifact = stat_or_artifact
338
- # update the run of the existing artifact
339
- if run is not None:
340
- # save the information that this artifact was previously produced by
341
- # another run
342
- # note: same logic exists for _output_collections_with_later_updates
343
- if artifact.run is not None and artifact.run != run:
344
- artifact.run._output_artifacts_with_later_updates.add(artifact)
345
- # update the run of the artifact with the latest run
346
- stat_or_artifact.run = run
347
- return artifact, None
348
- else:
349
- size, hash, hash_type, n_files, revises = stat_or_artifact
350
-
351
- if revises is not None: # update provisional_uid
352
- provisional_uid, revises = create_uid(revises=revises, version=version)
353
- if settings.cache_dir in path.parents:
354
- path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
355
-
356
- check_path_in_storage = False
357
- if use_existing_storage_key:
358
- inferred_key = get_relative_path_to_directory(
359
- path=path, directory=UPath(storage.root)
360
- ).as_posix()
361
- if key is None:
362
- key = inferred_key
363
- else:
364
- if not key == inferred_key:
365
- raise InvalidArgument(
366
- f"The path '{data}' is already in registered storage"
367
- f" '{storage.root}' with key '{inferred_key}'\nYou passed"
368
- f" conflicting key '{key}': please move the file before"
369
- " registering it."
370
- )
371
- check_path_in_storage = True
372
- else:
373
- storage = default_storage
374
-
375
- log_storage_hint(
376
- check_path_in_storage=check_path_in_storage,
377
- storage=storage,
378
- key=key,
379
- uid=provisional_uid,
380
- suffix=suffix,
381
- is_dir=n_files is not None,
382
- )
383
-
384
- # do we use a virtual or an actual storage key?
385
- key_is_virtual = settings.creation._artifact_use_virtual_keys
386
-
387
- # if the file is already in storage, independent of the default
388
- # we use an actual storage key
389
- if check_path_in_storage:
390
- key_is_virtual = False
391
-
392
- kwargs = {
393
- "uid": provisional_uid,
394
- "suffix": suffix,
395
- "hash": hash,
396
- "_hash_type": hash_type,
397
- "key": key,
398
- "size": size,
399
- "storage_id": storage.id,
400
- # passing both the id and the object
401
- # to make them both available immediately
402
- # after object creation
403
- "n_files": n_files,
404
- "_overwrite_versions": n_files is not None, # True for folder, False for file
405
- "n_observations": None, # to implement
406
- "run_id": run.id if run is not None else None,
407
- "run": run,
408
- "_key_is_virtual": key_is_virtual,
409
- "revises": revises,
410
- }
411
- if not isinstance(path, LocalPathClasses):
412
- local_filepath = None
413
- cloud_filepath = path
414
- else:
415
- local_filepath = path
416
- cloud_filepath = None
417
- privates = {
418
- "local_filepath": local_filepath,
419
- "cloud_filepath": cloud_filepath,
420
- "memory_rep": memory_rep,
421
- "check_path_in_storage": check_path_in_storage,
422
- }
423
- return kwargs, privates
424
-
425
-
426
- def log_storage_hint(
427
- *,
428
- check_path_in_storage: bool,
429
- storage: Storage | None,
430
- key: str | None,
431
- uid: str,
432
- suffix: str,
433
- is_dir: bool,
434
- ) -> None:
435
- hint = ""
436
- if check_path_in_storage:
437
- display_root = storage.root # type: ignore
438
- # check whether path is local
439
- if fsspec.utils.get_protocol(storage.root) == "file": # type: ignore
440
- # if it's a local path, check whether it's in the current working directory
441
- root_path = Path(storage.root) # type: ignore
442
- if check_path_is_child_of_root(root_path, Path.cwd()):
443
- # only display the relative path, not the fully resolved path
444
- display_root = root_path.relative_to(Path.cwd()) # type: ignore
445
- hint += f"path in storage '{display_root}'" # type: ignore
446
- else:
447
- hint += "path content will be copied to default storage upon `save()`"
448
- if key is None:
449
- storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
450
- hint += f" with key `None` ('{storage_key}')"
451
- else:
452
- hint += f" with key '{key}'"
453
- logger.hint(hint)
454
-
455
-
456
- def data_is_anndata(data: AnnData | UPathStr) -> bool:
457
- if isinstance(data, AnnData):
458
- return True
459
- if isinstance(data, (str, Path, UPath)):
460
- data_path = UPath(data)
461
- if ".h5ad" in data_path.suffixes: # ".h5ad.gz" is a valid suffix
462
- return True
463
- elif data_path.suffix == ".zarr":
464
- # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
465
- if ".anndata" in data_path.suffixes:
466
- return True
467
- # check only for local, expensive for cloud
468
- if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
469
- return zarr_is_adata(data_path)
470
- else:
471
- logger.warning("We do not check if cloud zarr is AnnData or not")
472
- return False
473
- return False
474
-
475
-
476
- def data_is_mudata(data: MuData | UPathStr) -> bool:
477
- if _mudata_is_installed():
478
- from mudata import MuData
479
-
480
- if isinstance(data, MuData):
481
- return True
482
- if isinstance(data, (str, Path)):
483
- return UPath(data).suffix == ".h5mu"
484
- return False
485
-
486
-
487
- def _check_otype_artifact(data: Any, otype: str | None = None):
488
- if otype is None:
489
- if isinstance(data, pd.DataFrame):
490
- logger.warning("data is a DataFrame, please use .from_df()")
491
- otype = "DataFrame"
492
- return otype
493
-
494
- data_is_path = isinstance(data, (str, Path))
495
- if data_is_anndata(data):
496
- if not data_is_path:
497
- logger.warning("data is an AnnData, please use .from_anndata()")
498
- otype = "AnnData"
499
- elif data_is_mudata(data):
500
- if not data_is_path:
501
- logger.warning("data is a MuData, please use .from_mudata()")
502
- otype = "MuData"
503
- elif not data_is_path: # UPath is a subclass of Path
504
- raise TypeError("data has to be a string, Path, UPath")
505
- return otype
506
-
507
-
508
- def __init__(artifact: Artifact, *args, **kwargs):
509
- artifact.features = FeatureManager(artifact) # type: ignore
510
- artifact.params = ParamManager(artifact) # type: ignore
511
- # Below checks for the Django-internal call in from_db()
512
- # it'd be better if we could avoid this, but not being able to create a Artifact
513
- # from data with the default constructor renders the central class of the API
514
- # essentially useless
515
- # The danger below is not that a user might pass as many args (12 of it), but rather
516
- # that at some point the Django API might change; on the other hand, this
517
- # condition of for calling the constructor based on kwargs should always
518
- # stay robust
519
- if len(args) == len(artifact._meta.concrete_fields):
520
- super(Artifact, artifact).__init__(*args, **kwargs)
521
- return None
522
- # now we proceed with the user-facing constructor
523
- if len(args) > 1:
524
- raise ValueError("Only one non-keyword arg allowed: data")
525
-
526
- data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
527
- kind: str = kwargs.pop("kind") if "kind" in kwargs else None
528
- key: str | None = kwargs.pop("key") if "key" in kwargs else None
529
- run: Run | None = kwargs.pop("run") if "run" in kwargs else None
530
- description: str | None = (
531
- kwargs.pop("description") if "description" in kwargs else None
532
- )
533
- revises: Artifact | None = kwargs.pop("revises") if "revises" in kwargs else None
534
- version: str | None = kwargs.pop("version") if "version" in kwargs else None
535
- if "visibility" in kwargs:
536
- _branch_code = kwargs.pop("visibility")
537
- elif "_branch_code" in kwargs:
538
- _branch_code = kwargs.pop("_branch_code")
539
- else:
540
- _branch_code = 1
541
- format = kwargs.pop("format") if "format" in kwargs else None
542
- _is_internal_call = kwargs.pop("_is_internal_call", False)
543
- skip_check_exists = (
544
- kwargs.pop("skip_check_exists") if "skip_check_exists" in kwargs else False
545
- )
546
- if "default_storage" in kwargs:
547
- default_storage = kwargs.pop("default_storage")
548
- else:
549
- if setup_settings.instance.keep_artifacts_local:
550
- default_storage = setup_settings.instance.storage_local.record
551
- else:
552
- default_storage = setup_settings.instance.storage.record
553
- using_key = (
554
- kwargs.pop("using_key") if "using_key" in kwargs else settings._using_key
555
- )
556
- otype = kwargs.pop("otype") if "otype" in kwargs else None
557
- otype = _check_otype_artifact(data=data, otype=otype)
558
- if "type" in kwargs:
559
- logger.warning("`type` will be removed soon, please use `kind`")
560
- kind = kwargs.pop("type")
561
- if not len(kwargs) == 0:
562
- valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
563
- raise FieldValidationError(
564
- f"Only {valid_keywords} can be passed, you passed: {kwargs}"
565
- )
566
- if revises is not None and key is not None and revises.key != key:
567
- note = message_update_key_in_version_family(
568
- suid=revises.stem_uid,
569
- existing_key=revises.key,
570
- new_key=key,
571
- registry="Artifact",
572
- )
573
- raise ValueError(
574
- f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
575
- )
576
- if revises is not None:
577
- if not isinstance(revises, Artifact):
578
- raise TypeError("`revises` has to be of type `Artifact`")
579
- if description is None:
580
- description = revises.description
581
- if key is not None and AUTO_KEY_PREFIX in key:
582
- raise ValueError(
583
- f"Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
584
- )
585
- # below is for internal calls that require defining the storage location
586
- # ahead of constructing the Artifact
587
- if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
588
- if _is_internal_call:
589
- is_automanaged_path = True
590
- user_provided_key = key
591
- key = None
592
- else:
593
- raise ValueError(
594
- f"Do not pass path inside the `{AUTO_KEY_PREFIX}` directory."
595
- )
596
- else:
597
- is_automanaged_path = False
598
- provisional_uid, revises = create_uid(revises=revises, version=version)
599
- kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
600
- data=data,
601
- key=key,
602
- run=run,
603
- format=format,
604
- provisional_uid=provisional_uid,
605
- version=version,
606
- default_storage=default_storage,
607
- using_key=using_key,
608
- skip_check_exists=skip_check_exists,
609
- )
610
-
611
- # an object with the same hash already exists
612
- if isinstance(kwargs_or_artifact, Artifact):
613
- from ._record import init_self_from_db, update_attributes
614
-
615
- init_self_from_db(artifact, kwargs_or_artifact)
616
- # adding "key" here is dangerous because key might be auto-populated
617
- attr_to_update = {"description": description}
618
- if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None:
619
- attr_to_update["key"] = key
620
- elif artifact.key != key and key is not None:
621
- logger.warning(
622
- f"key {artifact.key} on existing artifact differs from passed key {key}"
623
- )
624
- update_attributes(artifact, attr_to_update)
625
- return None
626
- else:
627
- kwargs = kwargs_or_artifact
628
-
629
- if revises is None:
630
- revises = kwargs_or_artifact.pop("revises")
631
-
632
- if data is not None:
633
- artifact._local_filepath = privates["local_filepath"]
634
- artifact._cloud_filepath = privates["cloud_filepath"]
635
- artifact._memory_rep = privates["memory_rep"]
636
- artifact._to_store = not privates["check_path_in_storage"]
637
-
638
- if is_automanaged_path and _is_internal_call:
639
- kwargs["_key_is_virtual"] = True
640
- assert AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101
641
- uid = kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
642
- kwargs["key"] = user_provided_key
643
- if revises is not None:
644
- assert uid.startswith(revises.stem_uid) # noqa: S101
645
- if len(uid) == 16:
646
- if revises is None:
647
- uid += "0000"
648
- else:
649
- uid, revises = create_uid(revises=revises, version=version)
650
- kwargs["uid"] = uid
651
-
652
- # only set key now so that we don't do a look-up on it in case revises is passed
653
- if revises is not None:
654
- kwargs["key"] = revises.key
655
-
656
- kwargs["kind"] = kind
657
- kwargs["version"] = version
658
- kwargs["description"] = description
659
- kwargs["_branch_code"] = _branch_code
660
- kwargs["otype"] = otype
661
- kwargs["revises"] = revises
662
- # this check needs to come down here because key might be populated from an
663
- # existing file path during get_artifact_kwargs_from_data()
664
- if (
665
- kwargs["key"] is None
666
- and kwargs["description"] is None
667
- and kwargs["run"] is None
668
- ):
669
- raise ValueError("Pass one of key, run or description as a parameter")
670
-
671
- super(Artifact, artifact).__init__(**kwargs)
672
-
673
-
674
- @classmethod # type: ignore
675
- @doc_args(Artifact.from_df.__doc__)
676
- def from_df(
677
- cls,
678
- df: pd.DataFrame,
679
- *,
680
- key: str | None = None,
681
- description: str | None = None,
682
- run: Run | None = None,
683
- revises: Artifact | None = None,
684
- **kwargs,
685
- ) -> Artifact:
686
- """{}""" # noqa: D415
687
- artifact = Artifact( # type: ignore
688
- data=df,
689
- key=key,
690
- run=run,
691
- description=description,
692
- revises=revises,
693
- otype="DataFrame",
694
- kind="dataset",
695
- **kwargs,
696
- )
697
- artifact.n_observations = len(df)
698
- return artifact
699
-
700
-
701
- @classmethod # type: ignore
702
- @doc_args(Artifact.from_anndata.__doc__)
703
- def from_anndata(
704
- cls,
705
- adata: AnnData | UPathStr,
706
- *,
707
- key: str | None = None,
708
- description: str | None = None,
709
- run: Run | None = None,
710
- revises: Artifact | None = None,
711
- **kwargs,
712
- ) -> Artifact:
713
- """{}""" # noqa: D415
714
- if not data_is_anndata(adata):
715
- raise ValueError("data has to be an AnnData object or a path to AnnData-like")
716
- _anndata_n_observations(adata)
717
- artifact = Artifact( # type: ignore
718
- data=adata,
719
- key=key,
720
- run=run,
721
- description=description,
722
- revises=revises,
723
- otype="AnnData",
724
- kind="dataset",
725
- **kwargs,
726
- )
727
- # this is done instead of _anndata_n_observations(adata)
728
- # because we need a proper path through create_path for cloud paths
729
- # for additional upath options etc that create_path adds
730
- obj_for_obs: AnnData | UPath
731
- if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
732
- obj_for_obs = artifact._memory_rep
733
- else:
734
- # returns ._local_filepath for local files
735
- # and the proper path through create_path for cloud paths
736
- obj_for_obs = artifact.path
737
- artifact.n_observations = _anndata_n_observations(obj_for_obs)
738
- return artifact
739
-
740
-
741
- @classmethod # type: ignore
742
- @doc_args(Artifact.from_mudata.__doc__)
743
- def from_mudata(
744
- cls,
745
- mdata: MuData,
746
- *,
747
- key: str | None = None,
748
- description: str | None = None,
749
- run: Run | None = None,
750
- revises: Artifact | None = None,
751
- **kwargs,
752
- ) -> Artifact:
753
- """{}""" # noqa: D415
754
- artifact = Artifact( # type: ignore
755
- data=mdata,
756
- key=key,
757
- run=run,
758
- description=description,
759
- revises=revises,
760
- otype="MuData",
761
- kind="dataset",
762
- **kwargs,
763
- )
764
- artifact.n_observations = mdata.n_obs
765
- return artifact
766
-
767
-
768
- @classmethod # type: ignore
769
- @doc_args(Artifact.from_tiledbsoma.__doc__)
770
- def from_tiledbsoma(
771
- cls,
772
- path: UPathStr,
773
- *,
774
- key: str | None = None,
775
- description: str | None = None,
776
- run: Run | None = None,
777
- revises: Artifact | None = None,
778
- **kwargs,
779
- ) -> Artifact:
780
- """{}""" # noqa: D415
781
- if UPath(path).suffix != ".tiledbsoma":
782
- raise ValueError(
783
- "A tiledbsoma store should have .tiledbsoma suffix to be registered."
784
- )
785
- artifact = Artifact( # type: ignore
786
- data=path,
787
- key=key,
788
- run=run,
789
- description=description,
790
- revises=revises,
791
- otype="tiledbsoma",
792
- kind="dataset",
793
- **kwargs,
794
- )
795
- artifact.n_observations = _soma_n_observations(artifact.path)
796
- return artifact
797
-
798
-
799
- @classmethod # type: ignore
800
- @doc_args(Artifact.from_dir.__doc__)
801
- def from_dir(
802
- cls,
803
- path: UPathStr,
804
- *,
805
- key: str | None = None,
806
- run: Run | None = None,
807
- ) -> list[Artifact]:
808
- """{}""" # noqa: D415
809
- folderpath: UPath = create_path(path) # returns Path for local
810
- default_storage = settings.storage.record
811
- using_key = settings._using_key
812
- storage, use_existing_storage = process_pathlike(
813
- folderpath, default_storage, using_key
814
- )
815
- folder_key_path: PurePath | Path
816
- if key is None:
817
- if not use_existing_storage:
818
- logger.warning(
819
- "folder is outside existing storage location, will copy files from"
820
- f" {path} to {storage.root}/{folderpath.name}"
821
- )
822
- folder_key_path = Path(folderpath.name)
823
- else:
824
- # maintain the hierarchy within an existing storage location
825
- folder_key_path = get_relative_path_to_directory(
826
- folderpath, UPath(storage.root)
827
- )
828
- else:
829
- folder_key_path = Path(key)
830
-
831
- folder_key = folder_key_path.as_posix()
832
- # silence fine-grained logging
833
- verbosity = settings.verbosity
834
- verbosity_int = settings._verbosity_int
835
- if verbosity_int >= 1:
836
- settings.verbosity = "warning"
837
- artifacts_dict = {}
838
- for filepath in folderpath.rglob("*"):
839
- if filepath.is_file():
840
- relative_path = get_relative_path_to_directory(filepath, folderpath)
841
- artifact_key = folder_key + "/" + relative_path.as_posix()
842
- # if creating from rglob, we don't need to check for existence
843
- artifact = Artifact(
844
- filepath, run=run, key=artifact_key, skip_check_exists=True
845
- )
846
- artifacts_dict[artifact.uid] = artifact
847
- settings.verbosity = verbosity
848
-
849
- # run sanity check on hashes
850
- hashes = [
851
- artifact.hash
852
- for artifact in artifacts_dict.values()
853
- if artifact.hash is not None
854
- ]
855
- uids = artifacts_dict.keys()
856
- n_unique_hashes = len(set(hashes))
857
- if n_unique_hashes == len(hashes):
858
- artifacts = list(artifacts_dict.values())
859
- else:
860
- # consider exact duplicates (same id, same hash)
861
- # below can't happen anymore because artifacts is a dict now
862
- # if len(set(uids)) == len(set(hashes)):
863
- # logger.warning("dropping duplicate records in list of artifact records")
864
- # artifacts = list(set(uids))
865
- # consider false duplicates (different id, same hash)
866
- if not len(set(uids)) == n_unique_hashes:
867
- seen_hashes = set()
868
- non_unique_artifacts = {
869
- hash: artifact
870
- for hash, artifact in artifacts_dict.items()
871
- if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore
872
- }
873
- display_non_unique = "\n ".join(
874
- f"{artifact}" for artifact in non_unique_artifacts
875
- )
876
- logger.warning(
877
- "there are multiple artifact uids with the same hashes, dropping"
878
- f" {len(non_unique_artifacts)} duplicates out of"
879
- f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
880
- )
881
- artifacts = [
882
- artifact
883
- for artifact in artifacts_dict.values()
884
- if artifact not in non_unique_artifacts.values()
885
- ]
886
- logger.success(
887
- f"created {len(artifacts)} artifacts from directory using storage"
888
- f" {storage.root} and key = {folder_key}/"
889
- )
890
- return artifacts
891
-
892
-
893
- # docstring handled through attach_func_to_class_method
894
- def replace(
895
- self,
896
- data: UPathStr | pd.DataFrame | AnnData | MuData,
897
- run: Run | None = None,
898
- format: str | None = None,
899
- ) -> None:
900
- default_storage = settings.storage.record
901
- kwargs, privates = get_artifact_kwargs_from_data(
902
- provisional_uid=self.uid,
903
- data=data,
904
- key=self.key,
905
- run=run,
906
- format=format,
907
- default_storage=default_storage,
908
- version=None,
909
- is_replace=True,
910
- )
911
-
912
- # this artifact already exists
913
- if privates is None:
914
- return kwargs
915
-
916
- check_path_in_storage = privates["check_path_in_storage"]
917
- if check_path_in_storage:
918
- err_msg = (
919
- "Can only replace with a local path not in any Storage. "
920
- f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}."
921
- )
922
- raise ValueError(err_msg)
923
-
924
- _overwrite_versions = kwargs["_overwrite_versions"]
925
- if self._overwrite_versions != _overwrite_versions:
926
- err_msg = "It is not allowed to replace "
927
- err_msg += "a folder" if self._overwrite_versions else "a file"
928
- err_msg += " with " + ("a folder." if _overwrite_versions else "a file.")
929
- raise ValueError(err_msg)
930
-
931
- if self.key is not None and not self._key_is_virtual:
932
- key_path = PurePosixPath(self.key)
933
- new_filename = f"{key_path.stem}{kwargs['suffix']}"
934
- # the following will only be true if the suffix changes!
935
- if key_path.name != new_filename:
936
- self._clear_storagekey = self.key
937
- self.key = str(key_path.with_name(new_filename))
938
- # update old key with the new one so that checks in record pass
939
- self._old_key = self.key
940
- logger.warning(
941
- f"replacing the file will replace key '{key_path}' with '{self.key}'"
942
- f" and delete '{key_path}' upon `save()`"
943
- )
944
- else:
945
- old_storage = auto_storage_key_from_artifact(self)
946
- is_dir = self.n_files is not None
947
- new_storage = auto_storage_key_from_artifact_uid(
948
- self.uid, kwargs["suffix"], is_dir
949
- )
950
- if old_storage != new_storage:
951
- self._clear_storagekey = old_storage
952
- if self.key is not None:
953
- new_key_path = PurePosixPath(self.key).with_suffix(kwargs["suffix"])
954
- self.key = str(new_key_path)
955
- # update old key with the new one so that checks in record pass
956
- self._old_key = self.key
957
-
958
- self.suffix = kwargs["suffix"]
959
- self.size = kwargs["size"]
960
- self.hash = kwargs["hash"]
961
- self._hash_type = kwargs["_hash_type"]
962
- self.run_id = kwargs["run_id"]
963
- self.run = kwargs["run"]
964
- self.n_files = kwargs["n_files"]
965
-
966
- self._local_filepath = privates["local_filepath"]
967
- self._cloud_filepath = privates["cloud_filepath"]
968
- self._memory_rep = privates["memory_rep"]
969
- # no need to upload if new file is already in storage
970
- self._to_store = not check_path_in_storage
971
-
972
-
973
- inconsistent_state_msg = (
974
- "Trying to read a folder artifact from an outdated version, "
975
- "this can result in an incosistent state.\n"
976
- "Read from the latest version: artifact.versions.filter(is_latest=True).one()"
977
- )
978
-
979
-
980
- # docstring handled through attach_func_to_class_method
981
- def open(
982
- self, mode: str = "r", is_run_input: bool | None = None, **kwargs
983
- ) -> (
984
- AnnDataAccessor
985
- | BackedAccessor
986
- | SOMACollection
987
- | SOMAExperiment
988
- | SOMAMeasurement
989
- | PyArrowDataset
990
- ):
991
- if self._overwrite_versions and not self.is_latest:
992
- raise ValueError(inconsistent_state_msg)
993
- # all hdf5 suffixes including gzipped
994
- h5_suffixes = [".h5", ".hdf5", ".h5ad"]
995
- h5_suffixes += [s + ".gz" for s in h5_suffixes]
996
- # ignore empty suffix for now
997
- suffixes = (
998
- (
999
- "",
1000
- ".zarr",
1001
- ".anndata.zarr",
1002
- ".tiledbsoma",
1003
- )
1004
- + tuple(h5_suffixes)
1005
- + PYARROW_SUFFIXES
1006
- + tuple(
1007
- s + ".gz" for s in PYARROW_SUFFIXES
1008
- ) # this doesn't work for externally gzipped files, REMOVE LATER
1009
- )
1010
- if self.suffix not in suffixes:
1011
- raise ValueError(
1012
- "Artifact should have a zarr, h5, tiledbsoma object"
1013
- " or a compatible `pyarrow.dataset.dataset` directory"
1014
- " as the underlying data, please use one of the following suffixes"
1015
- f" for the object name: {', '.join(suffixes[1:])}."
1016
- f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
1017
- " (no mixing allowed)."
1018
- )
1019
- if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
1020
- raise ValueError("Only a tiledbsoma store can be openened with `mode!='r'`.")
1021
-
1022
- from lamindb.core.storage._backed_access import _track_writes_factory, backed_access
1023
-
1024
- using_key = settings._using_key
1025
- filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
1026
- is_tiledbsoma_w = (
1027
- filepath.name == "soma" or self.suffix == ".tiledbsoma"
1028
- ) and mode == "w"
1029
- # consider the case where an object is already locally cached
1030
- localpath = setup_settings.paths.cloud_to_local_no_update(
1031
- filepath, cache_key=cache_key
1032
- )
1033
- if is_tiledbsoma_w:
1034
- open_cache = False
1035
- else:
1036
- open_cache = not isinstance(
1037
- filepath, LocalPathClasses
1038
- ) and not filepath.synchronize(localpath, just_check=True)
1039
- if open_cache:
1040
- try:
1041
- access = backed_access(localpath, mode, using_key, **kwargs)
1042
- except Exception as e:
1043
- if isinstance(filepath, LocalPathClasses):
1044
- raise e
1045
- logger.warning(
1046
- f"The cache might be corrupted: {e}. Trying to open directly."
1047
- )
1048
- access = backed_access(filepath, mode, using_key, **kwargs)
1049
- # happens only if backed_access has been successful
1050
- # delete the corrupted cache
1051
- if localpath.is_dir():
1052
- shutil.rmtree(localpath)
1053
- else:
1054
- localpath.unlink(missing_ok=True)
1055
- else:
1056
- access = backed_access(filepath, mode, using_key, **kwargs)
1057
- if is_tiledbsoma_w:
1058
-
1059
- def finalize():
1060
- nonlocal self, filepath, localpath
1061
- if not isinstance(filepath, LocalPathClasses):
1062
- _, hash, _, _ = get_stat_dir_cloud(filepath)
1063
- else:
1064
- # this can be very slow
1065
- _, hash, _, _ = hash_dir(filepath)
1066
- if self.hash != hash:
1067
- from ._record import init_self_from_db
1068
-
1069
- new_version = Artifact(
1070
- filepath, revises=self, _is_internal_call=True
1071
- ).save()
1072
- init_self_from_db(self, new_version)
1073
-
1074
- if localpath != filepath and localpath.exists():
1075
- shutil.rmtree(localpath)
1076
-
1077
- access = _track_writes_factory(access, finalize)
1078
- # only call if open is successful
1079
- _track_run_input(self, is_run_input)
1080
- return access
1081
-
1082
-
1083
- # can't really just call .cache in .load because of double tracking
1084
- def _synchronize_cleanup_on_error(
1085
- filepath: UPath, cache_key: str | None = None
1086
- ) -> UPath:
1087
- try:
1088
- cache_path = setup_settings.paths.cloud_to_local(
1089
- filepath, cache_key=cache_key, print_progress=True
1090
- )
1091
- except Exception as e:
1092
- if not isinstance(filepath, LocalPathClasses):
1093
- cache_path = setup_settings.paths.cloud_to_local_no_update(
1094
- filepath, cache_key=cache_key
1095
- )
1096
- if cache_path.is_dir():
1097
- shutil.rmtree(cache_path)
1098
- else:
1099
- cache_path.unlink(missing_ok=True)
1100
- raise e
1101
- return cache_path
1102
-
1103
-
1104
- # docstring handled through attach_func_to_class_method
1105
- def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
1106
- if self._overwrite_versions and not self.is_latest:
1107
- raise ValueError(inconsistent_state_msg)
1108
-
1109
- if hasattr(self, "_memory_rep") and self._memory_rep is not None:
1110
- access_memory = self._memory_rep
1111
- else:
1112
- filepath, cache_key = filepath_cache_key_from_artifact(
1113
- self, using_key=settings._using_key
1114
- )
1115
- cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
1116
- try:
1117
- # cache_path is local so doesn't trigger any sync in load_to_memory
1118
- access_memory = load_to_memory(cache_path, **kwargs)
1119
- except Exception as e:
1120
- # just raise the exception if the original path is local
1121
- if isinstance(filepath, LocalPathClasses):
1122
- raise e
1123
- logger.warning(
1124
- f"The cache might be corrupted: {e}. Retrying to synchronize."
1125
- )
1126
- # delete the existing cache
1127
- if cache_path.is_dir():
1128
- shutil.rmtree(cache_path)
1129
- else:
1130
- cache_path.unlink(missing_ok=True)
1131
- # download again and try to load into memory
1132
- cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
1133
- access_memory = load_to_memory(cache_path, **kwargs)
1134
- # only call if load is successful
1135
- _track_run_input(self, is_run_input)
1136
- return access_memory
1137
-
1138
-
1139
- # docstring handled through attach_func_to_class_method
1140
- def cache(self, is_run_input: bool | None = None) -> Path:
1141
- if self._overwrite_versions and not self.is_latest:
1142
- raise ValueError(inconsistent_state_msg)
1143
-
1144
- filepath, cache_key = filepath_cache_key_from_artifact(
1145
- self, using_key=settings._using_key
1146
- )
1147
- cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
1148
- # only call if sync is successful
1149
- _track_run_input(self, is_run_input)
1150
- return cache_path
1151
-
1152
-
1153
- # docstring handled through attach_func_to_class_method
1154
- def delete(
1155
- self,
1156
- permanent: bool | None = None,
1157
- storage: bool | None = None,
1158
- using_key: str | None = None,
1159
- ) -> None:
1160
- # this first check means an invalid delete fails fast rather than cascading through
1161
- # database and storage permission errors
1162
- if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
1163
- isettings = setup_settings.instance
1164
- if self.storage.instance_uid != isettings.uid and (storage or storage is None):
1165
- raise IntegrityError(
1166
- "Cannot simply delete artifacts outside of this instance's managed storage locations."
1167
- "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
1168
- f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
1169
- f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
1170
- )
1171
- # by default, we only move artifacts into the trash (_branch_code = -1)
1172
- trash__branch_code = -1
1173
- if self._branch_code > trash__branch_code and not permanent:
1174
- if storage is not None:
1175
- logger.warning("moving artifact to trash, storage arg is ignored")
1176
- # move to trash
1177
- self._branch_code = trash__branch_code
1178
- self.save()
1179
- logger.important(
1180
- f"moved artifact to trash (_branch_code = {trash__branch_code})"
1181
- )
1182
- return
1183
-
1184
- # if the artifact is already in the trash
1185
- # permanent delete skips the trash
1186
- if permanent is None:
1187
- # ask for confirmation of permanent delete
1188
- response = input(
1189
- "Artifact record is already in trash! Are you sure you want to permanently"
1190
- " delete it? (y/n) You can't undo this action."
1191
- )
1192
- delete_record = response == "y"
1193
- else:
1194
- assert permanent # noqa: S101
1195
- delete_record = True
1196
-
1197
- if delete_record:
1198
- # need to grab file path before deletion
1199
- try:
1200
- path, _ = filepath_from_artifact(self, using_key)
1201
- except OSError:
1202
- # we can still delete the record
1203
- logger.warning("Could not get path")
1204
- storage = False
1205
- # only delete in storage if DB delete is successful
1206
- # DB delete might error because of a foreign key constraint violated etc.
1207
- if self._overwrite_versions and self.is_latest:
1208
- # includes self
1209
- for version in self.versions.all():
1210
- _delete_skip_storage(version)
1211
- else:
1212
- self._delete_skip_storage()
1213
- # by default do not delete storage if deleting only a previous version
1214
- # and the underlying store is mutable
1215
- if self._overwrite_versions and not self.is_latest:
1216
- delete_in_storage = False
1217
- if storage:
1218
- logger.warning(
1219
- "Storage argument is ignored; can't delete storage on an previous version"
1220
- )
1221
- elif self.key is None or self._key_is_virtual:
1222
- # do not ask for confirmation also if storage is None
1223
- delete_in_storage = storage is None or storage
1224
- else:
1225
- # for artifacts with non-virtual semantic storage keys (key is not None)
1226
- # ask for extra-confirmation
1227
- if storage is None:
1228
- response = input(
1229
- f"Are you sure to want to delete {path}? (y/n) You can't undo"
1230
- " this action."
1231
- )
1232
- delete_in_storage = response == "y"
1233
- else:
1234
- delete_in_storage = storage
1235
- if not delete_in_storage:
1236
- logger.important(f"a file/folder remains here: {path}")
1237
- # we don't yet have logic to bring back the deleted metadata record
1238
- # in case storage deletion fails - this is important for ACID down the road
1239
- if delete_in_storage:
1240
- delete_msg = delete_storage(path, raise_file_not_found_error=False)
1241
- if delete_msg != "did-not-delete":
1242
- logger.success(f"deleted {colors.yellow(f'{path}')}")
1243
-
1244
-
1245
- def _delete_skip_storage(artifact, *args, **kwargs) -> None:
1246
- super(Artifact, artifact).delete(*args, **kwargs)
1247
-
1248
-
1249
- # docstring handled through attach_func_to_class_method
1250
- def save(self, upload: bool | None = None, **kwargs) -> Artifact:
1251
- state_was_adding = self._state.adding
1252
- print_progress = kwargs.pop("print_progress", True)
1253
- store_kwargs = kwargs.pop("store_kwargs", {}) # kwargs for .upload_from in the end
1254
- access_token = kwargs.pop("access_token", None)
1255
- local_path = None
1256
- if upload and setup_settings.instance.keep_artifacts_local:
1257
- # switch local storage location to cloud
1258
- local_path = self.path
1259
- self.storage_id = setup_settings.instance.storage.id
1260
- self._local_filepath = local_path
1261
- # switch to virtual storage key upon upload
1262
- # the local filepath is already cached at that point
1263
- self._key_is_virtual = True
1264
- # ensure that the artifact is uploaded
1265
- self._to_store = True
1266
-
1267
- self._save_skip_storage(**kwargs)
1268
-
1269
- from lamindb._save import check_and_attempt_clearing, check_and_attempt_upload
1270
-
1271
- using_key = None
1272
- if "using" in kwargs:
1273
- using_key = kwargs["using"]
1274
- exception_upload = check_and_attempt_upload(
1275
- self,
1276
- using_key,
1277
- access_token=access_token,
1278
- print_progress=print_progress,
1279
- **store_kwargs,
1280
- )
1281
- if exception_upload is not None:
1282
- # we do not want to raise file not found on cleanup if upload of a file failed
1283
- # often it is ACID in the filesystem itself
1284
- # for example, s3 won't have the failed file, so just skip the delete in this case
1285
- raise_file_not_found_error = False
1286
- self._delete_skip_storage()
1287
- else:
1288
- # this is the case when it is cleaned on .replace
1289
- raise_file_not_found_error = True
1290
- # this is triggered by an exception in check_and_attempt_upload or by replace.
1291
- exception_clear = check_and_attempt_clearing(
1292
- self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key
1293
- )
1294
- if exception_upload is not None:
1295
- raise RuntimeError(exception_upload)
1296
- if exception_clear is not None:
1297
- raise RuntimeError(exception_clear)
1298
- # this is only for keep_artifacts_local
1299
- if local_path is not None and not state_was_adding:
1300
- # only move the local artifact to cache if it was not newly created
1301
- local_path_cache = ln_setup.settings.cache_dir / local_path.name
1302
- # don't use Path.rename here because of cross-device link error
1303
- # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
1304
- shutil.move(
1305
- local_path, # type: ignore
1306
- local_path_cache,
1307
- )
1308
- logger.important(f"moved local artifact to cache: {local_path_cache}")
1309
- return self
1310
-
1311
-
1312
- def _save_skip_storage(file, **kwargs) -> None:
1313
- save_staged_feature_sets(file)
1314
- super(Artifact, file).save(**kwargs)
1315
- save_schema_links(file)
1316
-
1317
-
1318
- @property # type: ignore
1319
- @doc_args(Artifact.path.__doc__)
1320
- def path(self) -> Path | UPath:
1321
- """{}""" # noqa: D415
1322
- # return only the path, without StorageSettings
1323
- filepath, _ = filepath_from_artifact(self, using_key=settings._using_key)
1324
- return filepath
1325
-
1326
-
1327
- # get cache path without triggering sync
1328
- @property # type: ignore
1329
- def _cache_path(self) -> UPath:
1330
- filepath, cache_key = filepath_cache_key_from_artifact(
1331
- self, using_key=settings._using_key
1332
- )
1333
- if isinstance(filepath, LocalPathClasses):
1334
- return filepath
1335
- return setup_settings.paths.cloud_to_local_no_update(filepath, cache_key=cache_key)
1336
-
1337
-
1338
- # docstring handled through attach_func_to_class_method
1339
- def restore(self) -> None:
1340
- self._branch_code = 1
1341
- self.save()
1342
-
1343
-
1344
- METHOD_NAMES = [
1345
- "__init__",
1346
- "from_anndata",
1347
- "from_df",
1348
- "from_mudata",
1349
- "from_tiledbsoma",
1350
- "open",
1351
- "cache",
1352
- "load",
1353
- "delete",
1354
- "save",
1355
- "replace",
1356
- "from_dir",
1357
- "restore",
1358
- ]
1359
-
1360
- if ln_setup._TESTING:
1361
- from inspect import signature
1362
-
1363
- SIGS = {
1364
- name: signature(getattr(Artifact, name))
1365
- for name in METHOD_NAMES
1366
- if name != "__init__"
1367
- }
1368
-
1369
- for name in METHOD_NAMES:
1370
- attach_func_to_class_method(name, Artifact, globals())
1371
-
1372
- # privates currently dealt with separately
1373
- # mypy: ignore-errors
1374
- Artifact._delete_skip_storage = _delete_skip_storage
1375
- Artifact._save_skip_storage = _save_skip_storage
1376
- Artifact._cache_path = _cache_path
1377
- Artifact.path = path
1378
- Artifact.describe = describe
1379
- Artifact.view_lineage = view_lineage