lamindb 0.76.7__py3-none-any.whl → 0.76.8__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (61)
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1178
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +387 -387
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -295
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +574 -574
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -867
  27. lamindb/core/_label_manager.py +253 -253
  28. lamindb/core/_mapped_collection.py +597 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +571 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -77
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -164
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -204
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -141
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/METADATA +3 -3
  59. lamindb-0.76.8.dist-info/RECORD +60 -0
  60. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.7.dist-info/RECORD +0 -60
lamindb/_artifact.py CHANGED
@@ -1,1178 +1,1205 @@
1
- from __future__ import annotations
2
-
3
- import os
4
- import shutil
5
- from pathlib import Path, PurePath, PurePosixPath
6
- from typing import TYPE_CHECKING, Any, Mapping
7
-
8
- import fsspec
9
- import lamindb_setup as ln_setup
10
- import pandas as pd
11
- from anndata import AnnData
12
- from django.db.models import Q, QuerySet
13
- from lamin_utils import colors, logger
14
- from lamindb_setup import settings as setup_settings
15
- from lamindb_setup._init_instance import register_storage_in_instance
16
- from lamindb_setup.core._docs import doc_args
17
- from lamindb_setup.core._settings_storage import init_storage
18
- from lamindb_setup.core.hashing import hash_dir, hash_file
19
- from lamindb_setup.core.upath import (
20
- create_path,
21
- extract_suffix_from_path,
22
- get_stat_dir_cloud,
23
- get_stat_file_cloud,
24
- )
25
- from lnschema_core.models import Artifact, FeatureManager, ParamManager, Run, Storage
26
- from lnschema_core.types import (
27
- VisibilityChoice,
28
- )
29
-
30
- from lamindb._utils import attach_func_to_class_method
31
- from lamindb.core._data import _track_run_input, describe, view_lineage
32
- from lamindb.core._settings import settings
33
- from lamindb.core.exceptions import IntegrityError
34
- from lamindb.core.loaders import load_to_memory
35
- from lamindb.core.storage import (
36
- LocalPathClasses,
37
- UPath,
38
- delete_storage,
39
- infer_suffix,
40
- write_to_disk,
41
- )
42
- from lamindb.core.storage.paths import (
43
- auto_storage_key_from_artifact,
44
- auto_storage_key_from_artifact_uid,
45
- check_path_is_child_of_root,
46
- filepath_from_artifact,
47
- )
48
- from lamindb.core.versioning import (
49
- create_uid,
50
- message_update_key_in_version_family,
51
- )
52
-
53
- from .core._data import (
54
- add_transform_to_kwargs,
55
- get_run,
56
- save_feature_set_links,
57
- save_feature_sets,
58
- )
59
- from .core.storage.objects import _mudata_is_installed
60
- from .core.storage.paths import AUTO_KEY_PREFIX
61
-
62
- try:
63
- from .core.storage._zarr import zarr_is_adata
64
- except ImportError:
65
-
66
- def zarr_is_adata(storepath): # type: ignore
67
- raise ImportError("Please install zarr: pip install zarr")
68
-
69
-
70
- if TYPE_CHECKING:
71
- from lamindb_setup.core.types import UPathStr
72
- from mudata import MuData
73
- from tiledbsoma import Collection as SOMACollection
74
- from tiledbsoma import Experiment as SOMAExperiment
75
-
76
- from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
77
-
78
-
79
- def process_pathlike(
80
- filepath: UPath,
81
- default_storage: Storage,
82
- using_key: str | None,
83
- skip_existence_check: bool = False,
84
- ) -> tuple[Storage, bool]:
85
- if not skip_existence_check:
86
- try: # check if file exists
87
- if not filepath.exists():
88
- raise FileNotFoundError(filepath)
89
- except PermissionError:
90
- pass
91
- if isinstance(filepath, LocalPathClasses):
92
- filepath = filepath.resolve()
93
- if check_path_is_child_of_root(filepath, default_storage.root):
94
- use_existing_storage_key = True
95
- return default_storage, use_existing_storage_key
96
- else:
97
- # check whether the path is part of one of the existing
98
- # already-registered storage locations
99
- result = False
100
- # within the hub, we don't want to perform check_path_in_existing_storage
101
- if using_key is None:
102
- result = check_path_in_existing_storage(filepath, using_key)
103
- if isinstance(result, Storage):
104
- use_existing_storage_key = True
105
- return result, use_existing_storage_key
106
- else:
107
- # if the path is in the cloud, we have a good candidate
108
- # for the storage root: the bucket
109
- if not isinstance(filepath, LocalPathClasses):
110
- # for a cloud path, new_root is always the bucket name
111
- new_root = list(filepath.parents)[-1]
112
- # do not register remote storage locations on hub if the current instance
113
- # is not managed on the hub
114
- storage_settings, _ = init_storage(
115
- new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
116
- )
117
- storage_record = register_storage_in_instance(storage_settings)
118
- use_existing_storage_key = True
119
- return storage_record, use_existing_storage_key
120
- # if the filepath is local
121
- else:
122
- use_existing_storage_key = False
123
- # if the default storage is local we'll throw an error if the user
124
- # doesn't provide a key
125
- if default_storage.type == "local":
126
- return default_storage, use_existing_storage_key
127
- # if the default storage is in the cloud (the file is going to
128
- # be uploaded upon saving it), we treat the filepath as a cache
129
- else:
130
- return default_storage, use_existing_storage_key
131
-
132
-
133
- def process_data(
134
- provisional_uid: str,
135
- data: UPathStr | pd.DataFrame | AnnData,
136
- format: str | None,
137
- key: str | None,
138
- default_storage: Storage,
139
- using_key: str | None,
140
- skip_existence_check: bool = False,
141
- ) -> tuple[Any, Path | UPath, str, Storage, bool]:
142
- """Serialize a data object that's provided as file or in memory."""
143
- # if not overwritten, data gets stored in default storage
144
- if _mudata_is_installed():
145
- from mudata import MuData
146
-
147
- data_types = (pd.DataFrame, AnnData, MuData)
148
- else:
149
- data_types = (pd.DataFrame, AnnData) # type:ignore
150
-
151
- if isinstance(data, (str, Path, UPath)): # UPathStr, spelled out
152
- access_token = (
153
- default_storage._access_token
154
- if hasattr(default_storage, "_access_token")
155
- else None
156
- )
157
- path = create_path(data, access_token=access_token)
158
- storage, use_existing_storage_key = process_pathlike(
159
- path,
160
- default_storage=default_storage,
161
- using_key=using_key,
162
- skip_existence_check=skip_existence_check,
163
- )
164
- suffix = extract_suffix_from_path(path)
165
- memory_rep = None
166
- elif isinstance(data, data_types):
167
- storage = default_storage
168
- memory_rep = data
169
- if key is not None:
170
- key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
171
- # use suffix as the (adata) format if the format is not provided
172
- if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
173
- format = key_suffix[1:]
174
- else:
175
- key_suffix = None
176
- suffix = infer_suffix(data, format)
177
- if key_suffix is not None and key_suffix != suffix:
178
- raise ValueError(
179
- f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
180
- f" be '{suffix}'."
181
- )
182
- cache_name = f"{provisional_uid}{suffix}"
183
- path = settings._storage_settings.cache_dir / cache_name
184
- # Alex: I don't understand the line below
185
- if path.suffixes == []:
186
- path = path.with_suffix(suffix)
187
- write_to_disk(data, path)
188
- use_existing_storage_key = False
189
- else:
190
- raise NotImplementedError(
191
- f"Do not know how to create a artifact object from {data}, pass a path"
192
- " instead!"
193
- )
194
- return memory_rep, path, suffix, storage, use_existing_storage_key
195
-
196
-
197
- def get_stat_or_artifact(
198
- path: UPath,
199
- key: str | None = None,
200
- check_hash: bool = True,
201
- is_replace: bool = False,
202
- instance: str | None = None,
203
- ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
204
- n_objects = None
205
- if settings.creation.artifact_skip_size_hash:
206
- return None, None, None, n_objects, None
207
- stat = path.stat() # one network request
208
- if not isinstance(path, LocalPathClasses):
209
- size, hash, hash_type = None, None, None
210
- if stat is not None:
211
- # convert UPathStatResult to fsspec info dict
212
- stat = stat.as_info()
213
- if "ETag" in stat: # is file
214
- size, hash, hash_type = get_stat_file_cloud(stat)
215
- elif stat["type"] == "directory":
216
- size, hash, hash_type, n_objects = get_stat_dir_cloud(path)
217
- if hash is None:
218
- logger.warning(f"did not add hash for {path}")
219
- return size, hash, hash_type, n_objects, None
220
- else:
221
- if path.is_dir():
222
- size, hash, hash_type, n_objects = hash_dir(path)
223
- else:
224
- hash, hash_type = hash_file(path)
225
- size = stat.st_size
226
- if not check_hash:
227
- return size, hash, hash_type, n_objects, None
228
- previous_artifact_version = None
229
- if key is None or is_replace:
230
- result = Artifact.objects.using(instance).filter(hash=hash).all()
231
- artifact_with_same_hash_exists = len(result) > 0
232
- else:
233
- storage_id = settings.storage.id
234
- result = (
235
- Artifact.objects.using(instance)
236
- .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
237
- .order_by("-created_at")
238
- .all()
239
- )
240
- artifact_with_same_hash_exists = len(result.filter(hash=hash).all()) > 0
241
- if not artifact_with_same_hash_exists and len(result) > 0:
242
- logger.important(
243
- f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
244
- )
245
- previous_artifact_version = result[0]
246
- if artifact_with_same_hash_exists:
247
- if settings.creation.artifact_if_hash_exists == "error":
248
- msg = f"artifact with same hash exists: {result[0]}"
249
- hint = (
250
- "💡 you can make this error a warning:\n"
251
- " ln.settings.creation.artifact_if_hash_exists"
252
- )
253
- raise FileExistsError(f"{msg}\n{hint}")
254
- elif settings.creation.artifact_if_hash_exists == "warn_create_new":
255
- logger.warning(
256
- "creating new Artifact object despite existing artifact with same hash:"
257
- f" {result[0]}"
258
- )
259
- return size, hash, hash_type, n_objects, None
260
- else:
261
- if result[0].visibility == -1:
262
- raise FileExistsError(
263
- f"You're trying to re-create this artifact in trash: {result[0]}"
264
- "Either permanently delete it with `artifact.delete(permanent=True)` or restore it with `artifact.restore()`"
265
- )
266
- logger.important(f"returning existing artifact with same hash: {result[0]}")
267
- return result[0]
268
- else:
269
- return size, hash, hash_type, n_objects, previous_artifact_version
270
-
271
-
272
- def check_path_in_existing_storage(
273
- path: Path | UPath, using_key: str | None = None
274
- ) -> Storage | bool:
275
- for storage in Storage.objects.using(using_key).filter().all():
276
- # if path is part of storage, return it
277
- if check_path_is_child_of_root(path, root=storage.root):
278
- return storage
279
- return False
280
-
281
-
282
- def get_relative_path_to_directory(
283
- path: PurePath | Path | UPath, directory: PurePath | Path | UPath
284
- ) -> PurePath | Path:
285
- if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
286
- # UPath.relative_to() is not behaving as it should (2023-04-07)
287
- # need to lstrip otherwise inconsistent behavior across trailing slashes
288
- # see test_artifact.py: test_get_relative_path_to_directory
289
- relpath = PurePath(
290
- path.as_posix().replace(directory.as_posix(), "").lstrip("/")
291
- )
292
- elif isinstance(directory, Path):
293
- relpath = path.resolve().relative_to(directory.resolve()) # type: ignore
294
- elif isinstance(directory, PurePath):
295
- relpath = path.relative_to(directory)
296
- else:
297
- raise TypeError("Directory not of type Path or UPath")
298
- return relpath
299
-
300
-
301
- def get_artifact_kwargs_from_data(
302
- *,
303
- data: Path | UPath | str | pd.DataFrame | AnnData | MuData,
304
- key: str | None,
305
- run: Run | None,
306
- format: str | None,
307
- provisional_uid: str,
308
- version: str | None,
309
- default_storage: Storage,
310
- using_key: str | None = None,
311
- is_replace: bool = False,
312
- skip_check_exists: bool = False,
313
- ):
314
- run = get_run(run)
315
- memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
316
- provisional_uid,
317
- data,
318
- format,
319
- key,
320
- default_storage,
321
- using_key,
322
- skip_check_exists,
323
- )
324
- stat_or_artifact = get_stat_or_artifact(
325
- path=path,
326
- key=key,
327
- instance=using_key,
328
- is_replace=is_replace,
329
- )
330
- if isinstance(stat_or_artifact, Artifact):
331
- artifact = stat_or_artifact
332
- # update the run of the existing artifact
333
- if run is not None:
334
- # save the information that this artifact was previously
335
- # produced by another run
336
- if artifact.run is not None:
337
- artifact.run._output_artifacts_with_later_updates.add(artifact)
338
- # update the run of the artifact with the latest run
339
- stat_or_artifact.run = run
340
- stat_or_artifact.transform = run.transform
341
- return artifact, None
342
- else:
343
- size, hash, hash_type, n_objects, revises = stat_or_artifact
344
-
345
- if revises is not None: # update provisional_uid
346
- provisional_uid, revises = create_uid(revises=revises, version=version)
347
- if path.as_posix().startswith(settings._storage_settings.cache_dir.as_posix()):
348
- path = path.rename(f"{provisional_uid}{suffix}")
349
-
350
- check_path_in_storage = False
351
- if use_existing_storage_key:
352
- inferred_key = get_relative_path_to_directory(
353
- path=path, directory=UPath(storage.root)
354
- ).as_posix()
355
- if key is None:
356
- key = inferred_key
357
- else:
358
- if not key == inferred_key:
359
- raise ValueError(
360
- f"The path '{data}' is already in registered storage"
361
- f" '{storage.root}' with key '{inferred_key}'\nYou passed"
362
- f" conflicting key '{key}': please move the file before"
363
- " registering it."
364
- )
365
- check_path_in_storage = True
366
- else:
367
- storage = default_storage
368
-
369
- log_storage_hint(
370
- check_path_in_storage=check_path_in_storage,
371
- storage=storage,
372
- key=key,
373
- uid=provisional_uid,
374
- suffix=suffix,
375
- is_dir=n_objects is not None,
376
- )
377
-
378
- # do we use a virtual or an actual storage key?
379
- key_is_virtual = settings.creation._artifact_use_virtual_keys
380
-
381
- # if the file is already in storage, independent of the default
382
- # we use an actual storage key
383
- if check_path_in_storage:
384
- key_is_virtual = False
385
-
386
- kwargs = {
387
- "uid": provisional_uid,
388
- "suffix": suffix,
389
- "hash": hash,
390
- "_hash_type": hash_type,
391
- "key": key,
392
- "size": size,
393
- "storage_id": storage.id,
394
- # passing both the id and the object
395
- # to make them both available immediately
396
- # after object creation
397
- "n_objects": n_objects,
398
- "n_observations": None, # to implement
399
- "run_id": run.id if run is not None else None,
400
- "run": run,
401
- "_key_is_virtual": key_is_virtual,
402
- }
403
- if not isinstance(path, LocalPathClasses):
404
- local_filepath = None
405
- cloud_filepath = path
406
- else:
407
- local_filepath = path
408
- cloud_filepath = None
409
- privates = {
410
- "local_filepath": local_filepath,
411
- "cloud_filepath": cloud_filepath,
412
- "memory_rep": memory_rep,
413
- "check_path_in_storage": check_path_in_storage,
414
- }
415
- return kwargs, privates
416
-
417
-
418
- def log_storage_hint(
419
- *,
420
- check_path_in_storage: bool,
421
- storage: Storage | None,
422
- key: str | None,
423
- uid: str,
424
- suffix: str,
425
- is_dir: bool,
426
- ) -> None:
427
- hint = ""
428
- if check_path_in_storage:
429
- display_root = storage.root # type: ignore
430
- # check whether path is local
431
- if fsspec.utils.get_protocol(storage.root) == "file": # type: ignore
432
- # if it's a local path, check whether it's in the current working directory
433
- root_path = Path(storage.root) # type: ignore
434
- if check_path_is_child_of_root(root_path, Path.cwd()):
435
- # only display the relative path, not the fully resolved path
436
- display_root = root_path.relative_to(Path.cwd())
437
- hint += f"path in storage '{display_root}'" # type: ignore
438
- else:
439
- hint += "path content will be copied to default storage upon `save()`"
440
- if key is None:
441
- storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
442
- hint += f" with key `None` ('{storage_key}')"
443
- else:
444
- hint += f" with key '{key}'"
445
- logger.hint(hint)
446
-
447
-
448
- def data_is_anndata(data: AnnData | UPathStr) -> bool:
449
- if isinstance(data, AnnData):
450
- return True
451
- if isinstance(data, (str, Path, UPath)):
452
- data_path = UPath(data)
453
- if data_path.suffix == ".h5ad":
454
- return True
455
- elif data_path.suffix == ".zarr":
456
- # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
457
- if ".anndata" in data_path.suffixes:
458
- return True
459
- # check only for local, expensive for cloud
460
- if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
461
- return zarr_is_adata(data_path)
462
- else:
463
- logger.warning("We do not check if cloud zarr is AnnData or not.")
464
- return False
465
- return False
466
-
467
-
468
- def data_is_mudata(data: MuData | UPathStr) -> bool:
469
- if _mudata_is_installed():
470
- from mudata import MuData
471
-
472
- if isinstance(data, MuData):
473
- return True
474
- if isinstance(data, (str, Path)):
475
- return UPath(data).suffix in {".h5mu"}
476
- return False
477
-
478
-
479
- def _check_accessor_artifact(data: Any, accessor: str | None = None):
480
- if accessor is None:
481
- if isinstance(data, pd.DataFrame):
482
- logger.warning("data is a DataFrame, please use .from_df()")
483
- accessor = "DataFrame"
484
- return accessor
485
-
486
- data_is_path = isinstance(data, (str, Path))
487
- if data_is_anndata(data):
488
- if not data_is_path:
489
- logger.warning("data is an AnnData, please use .from_anndata()")
490
- accessor = "AnnData"
491
- elif data_is_mudata(data):
492
- if not data_is_path:
493
- logger.warning("data is a MuData, please use .from_mudata()")
494
- accessor = "MuData"
495
- elif not data_is_path: # UPath is a subclass of Path
496
- raise TypeError("data has to be a string, Path, UPath")
497
- return accessor
498
-
499
-
500
- def __init__(artifact: Artifact, *args, **kwargs):
501
- artifact.features = FeatureManager(artifact)
502
- artifact.params = ParamManager(artifact)
503
- # Below checks for the Django-internal call in from_db()
504
- # it'd be better if we could avoid this, but not being able to create a Artifact
505
- # from data with the default constructor renders the central class of the API
506
- # essentially useless
507
- # The danger below is not that a user might pass as many args (12 of it), but rather
508
- # that at some point the Django API might change; on the other hand, this
509
- # condition of for calling the constructor based on kwargs should always
510
- # stay robust
511
- if len(args) == len(artifact._meta.concrete_fields):
512
- super(Artifact, artifact).__init__(*args, **kwargs)
513
- return None
514
- # now we proceed with the user-facing constructor
515
- if len(args) > 1:
516
- raise ValueError("Only one non-keyword arg allowed: data")
517
-
518
- data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
519
- type: str = kwargs.pop("type") if "type" in kwargs else None
520
- key: str | None = kwargs.pop("key") if "key" in kwargs else None
521
- run: Run | None = kwargs.pop("run") if "run" in kwargs else None
522
- description: str | None = (
523
- kwargs.pop("description") if "description" in kwargs else None
524
- )
525
- revises: Artifact | None = kwargs.pop("revises") if "revises" in kwargs else None
526
- version: str | None = kwargs.pop("version") if "version" in kwargs else None
527
- visibility: int | None = (
528
- kwargs.pop("visibility")
529
- if "visibility" in kwargs
530
- else VisibilityChoice.default.value
531
- )
532
- format = kwargs.pop("format") if "format" in kwargs else None
533
- _is_internal_call = kwargs.pop("_is_internal_call", False)
534
- skip_check_exists = (
535
- kwargs.pop("skip_check_exists") if "skip_check_exists" in kwargs else False
536
- )
537
- if "default_storage" in kwargs:
538
- default_storage = kwargs.pop("default_storage")
539
- else:
540
- if setup_settings.instance.keep_artifacts_local:
541
- default_storage = setup_settings.instance.storage_local.record
542
- else:
543
- default_storage = setup_settings.instance.storage.record
544
- using_key = (
545
- kwargs.pop("using_key") if "using_key" in kwargs else settings._using_key
546
- )
547
- accessor = kwargs.pop("_accessor") if "_accessor" in kwargs else None
548
- accessor = _check_accessor_artifact(data=data, accessor=accessor)
549
- if "is_new_version_of" in kwargs:
550
- logger.warning("`is_new_version_of` will be removed soon, please use `revises`")
551
- revises = kwargs.pop("is_new_version_of")
552
- if not len(kwargs) == 0:
553
- raise ValueError(
554
- "Only data, key, run, description, version, revises, visibility"
555
- f" can be passed, you passed: {kwargs}"
556
- )
557
- if revises is not None and key is not None and revises.key != key:
558
- note = message_update_key_in_version_family(
559
- suid=revises.stem_uid,
560
- existing_key=revises.key,
561
- new_key=key,
562
- registry="Artifact",
563
- )
564
- raise ValueError(
565
- f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
566
- )
567
- if revises is not None:
568
- if not isinstance(revises, Artifact):
569
- raise TypeError("`revises` has to be of type `Artifact`")
570
- if description is None:
571
- description = revises.description
572
- if key is not None and AUTO_KEY_PREFIX in key:
573
- raise ValueError(
574
- f"Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
575
- )
576
- # below is for internal calls that require defining the storage location
577
- # ahead of constructing the Artifact
578
- if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
579
- if _is_internal_call:
580
- is_automanaged_path = True
581
- user_provided_key = key
582
- key = None
583
- else:
584
- raise ValueError(
585
- f"Do not pass path inside the `{AUTO_KEY_PREFIX}` directory."
586
- )
587
- else:
588
- is_automanaged_path = False
589
- provisional_uid, revises = create_uid(revises=revises, version=version)
590
- kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
591
- data=data,
592
- key=key,
593
- run=run,
594
- format=format,
595
- provisional_uid=provisional_uid,
596
- version=version,
597
- default_storage=default_storage,
598
- using_key=using_key,
599
- skip_check_exists=skip_check_exists,
600
- )
601
-
602
- # an object with the same hash already exists
603
- if isinstance(kwargs_or_artifact, Artifact):
604
- from ._record import init_self_from_db, update_attributes
605
-
606
- init_self_from_db(artifact, kwargs_or_artifact)
607
- # adding "key" here is dangerous because key might be auto-populated
608
- update_attributes(artifact, {"description": description})
609
- if artifact.key != key and key is not None:
610
- logger.warning(
611
- f"key {artifact.key} on existing artifact differs from passed key {key}"
612
- )
613
- return None
614
- else:
615
- kwargs = kwargs_or_artifact
616
-
617
- if data is not None:
618
- artifact._local_filepath = privates["local_filepath"]
619
- artifact._cloud_filepath = privates["cloud_filepath"]
620
- artifact._memory_rep = privates["memory_rep"]
621
- artifact._to_store = not privates["check_path_in_storage"]
622
-
623
- if is_automanaged_path and _is_internal_call:
624
- kwargs["_key_is_virtual"] = True
625
- assert AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101
626
- uid = kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
627
- kwargs["key"] = user_provided_key
628
- if revises is not None:
629
- assert uid.startswith(revises.stem_uid) # noqa: S101
630
- if len(uid) == 16:
631
- if revises is None:
632
- uid += "0000"
633
- else:
634
- uid, revises = create_uid(revises=revises, version=version)
635
- kwargs["uid"] = uid
636
-
637
- # only set key now so that we don't do a look-up on it in case revises is passed
638
- if revises is not None:
639
- kwargs["key"] = revises.key
640
-
641
- kwargs["type"] = type
642
- kwargs["version"] = version
643
- kwargs["description"] = description
644
- kwargs["visibility"] = visibility
645
- kwargs["_accessor"] = accessor
646
- kwargs["revises"] = revises
647
- # this check needs to come down here because key might be populated from an
648
- # existing file path during get_artifact_kwargs_from_data()
649
- if (
650
- kwargs["key"] is None
651
- and kwargs["description"] is None
652
- and kwargs["run"] is None
653
- ):
654
- raise ValueError("Pass one of key, run or description as a parameter")
655
-
656
- add_transform_to_kwargs(kwargs, kwargs["run"])
657
-
658
- super(Artifact, artifact).__init__(**kwargs)
659
-
660
-
661
- @classmethod # type: ignore
662
- @doc_args(Artifact.from_df.__doc__)
663
- def from_df(
664
- cls,
665
- df: pd.DataFrame,
666
- key: str | None = None,
667
- description: str | None = None,
668
- run: Run | None = None,
669
- revises: Artifact | None = None,
670
- **kwargs,
671
- ) -> Artifact:
672
- """{}""" # noqa: D415
673
- artifact = Artifact(
674
- data=df,
675
- key=key,
676
- run=run,
677
- description=description,
678
- revises=revises,
679
- _accessor="DataFrame",
680
- type="dataset",
681
- **kwargs,
682
- )
683
- return artifact
684
-
685
-
686
- @classmethod # type: ignore
687
- @doc_args(Artifact.from_anndata.__doc__)
688
- def from_anndata(
689
- cls,
690
- adata: AnnData | UPathStr,
691
- key: str | None = None,
692
- description: str | None = None,
693
- run: Run | None = None,
694
- revises: Artifact | None = None,
695
- **kwargs,
696
- ) -> Artifact:
697
- """{}""" # noqa: D415
698
- if not data_is_anndata(adata):
699
- raise ValueError("data has to be an AnnData object or a path to AnnData-like")
700
- artifact = Artifact(
701
- data=adata,
702
- key=key,
703
- run=run,
704
- description=description,
705
- revises=revises,
706
- _accessor="AnnData",
707
- type="dataset",
708
- **kwargs,
709
- )
710
- return artifact
711
-
712
-
713
- @classmethod # type: ignore
714
- @doc_args(Artifact.from_mudata.__doc__)
715
- def from_mudata(
716
- cls,
717
- mdata: MuData,
718
- key: str | None = None,
719
- description: str | None = None,
720
- run: Run | None = None,
721
- revises: Artifact | None = None,
722
- **kwargs,
723
- ) -> Artifact:
724
- """{}""" # noqa: D415
725
- artifact = Artifact(
726
- data=mdata,
727
- key=key,
728
- run=run,
729
- description=description,
730
- revises=revises,
731
- _accessor="MuData",
732
- type="dataset",
733
- **kwargs,
734
- )
735
- return artifact
736
-
737
-
738
- @classmethod # type: ignore
739
- @doc_args(Artifact.from_dir.__doc__)
740
- def from_dir(
741
- cls,
742
- path: UPathStr,
743
- key: str | None = None,
744
- *,
745
- run: Run | None = None,
746
- ) -> list[Artifact]:
747
- """{}""" # noqa: D415
748
- logger.warning(
749
- "this creates one artifact per file in the directory - consider"
750
- " ln.Artifact(dir_path) to get one artifact for the entire directory"
751
- )
752
- folderpath: UPath = create_path(path) # returns Path for local
753
- default_storage = settings._storage_settings.record
754
- using_key = settings._using_key
755
- storage, use_existing_storage = process_pathlike(
756
- folderpath, default_storage, using_key
757
- )
758
- folder_key_path: PurePath | Path
759
- if key is None:
760
- if not use_existing_storage:
761
- logger.warning(
762
- "folder is outside existing storage location, will copy files from"
763
- f" {path} to {storage.root}/{folderpath.name}"
764
- )
765
- folder_key_path = Path(folderpath.name)
766
- else:
767
- # maintain the hierachy within an existing storage location
768
- folder_key_path = get_relative_path_to_directory(
769
- folderpath, UPath(storage.root)
770
- )
771
- else:
772
- folder_key_path = Path(key)
773
-
774
- # always sanitize by stripping a trailing slash
775
- folder_key = folder_key_path.as_posix().rstrip("/")
776
-
777
- # TODO: (non-local) UPath doesn't list the first level artifacts and dirs with "*"
778
- pattern = "" if not isinstance(folderpath, LocalPathClasses) else "*"
779
-
780
- # silence fine-grained logging
781
- verbosity = settings.verbosity
782
- verbosity_int = settings._verbosity_int
783
- if verbosity_int >= 1:
784
- settings.verbosity = "warning"
785
- artifacts_dict = {}
786
- for filepath in folderpath.rglob(pattern):
787
- if filepath.is_file():
788
- relative_path = get_relative_path_to_directory(filepath, folderpath)
789
- artifact_key = folder_key + "/" + relative_path.as_posix()
790
- # if creating from rglob, we don't need to check for existence
791
- artifact = Artifact(
792
- filepath, run=run, key=artifact_key, skip_check_exists=True
793
- )
794
- artifacts_dict[artifact.uid] = artifact
795
- settings.verbosity = verbosity
796
-
797
- # run sanity check on hashes
798
- hashes = [
799
- artifact.hash
800
- for artifact in artifacts_dict.values()
801
- if artifact.hash is not None
802
- ]
803
- uids = artifacts_dict.keys()
804
- if len(set(hashes)) == len(hashes):
805
- artifacts = list(artifacts_dict.values())
806
- else:
807
- # consider exact duplicates (same id, same hash)
808
- # below can't happen anymore because artifacts is a dict now
809
- # if len(set(uids)) == len(set(hashes)):
810
- # logger.warning("dropping duplicate records in list of artifact records")
811
- # artifacts = list(set(uids))
812
- # consider false duplicates (different id, same hash)
813
- if not len(set(uids)) == len(set(hashes)):
814
- seen_hashes = set()
815
- non_unique_artifacts = {
816
- hash: artifact
817
- for hash, artifact in artifacts_dict.items()
818
- if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash) # type: ignore
819
- }
820
- display_non_unique = "\n ".join(
821
- f"{artifact}" for artifact in non_unique_artifacts
822
- )
823
- logger.warning(
824
- "there are multiple artifact uids with the same hashes, dropping"
825
- f" {len(non_unique_artifacts)} duplicates out of"
826
- f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
827
- )
828
- artifacts = [
829
- artifact
830
- for artifact in artifacts_dict.values()
831
- if artifact not in non_unique_artifacts.values()
832
- ]
833
- logger.success(
834
- f"created {len(artifacts)} artifacts from directory using storage"
835
- f" {storage.root} and key = {folder_key}/"
836
- )
837
- return artifacts
838
-
839
-
840
- # docstring handled through attach_func_to_class_method
841
- def replace(
842
- self,
843
- data: UPathStr,
844
- run: Run | None = None,
845
- format: str | None = None,
846
- ) -> None:
847
- default_storage = settings._storage_settings.record
848
- kwargs, privates = get_artifact_kwargs_from_data(
849
- provisional_uid=self.uid,
850
- data=data,
851
- key=self.key,
852
- run=run,
853
- format=format,
854
- default_storage=default_storage,
855
- version=None,
856
- is_replace=True,
857
- )
858
-
859
- # this artifact already exists
860
- if privates is None:
861
- return kwargs
862
-
863
- check_path_in_storage = privates["check_path_in_storage"]
864
- if check_path_in_storage:
865
- raise ValueError("Can only replace with a local file not in any Storage.")
866
-
867
- if self.key is not None and not self._key_is_virtual:
868
- key_path = PurePosixPath(self.key)
869
- new_filename = f"{key_path.stem}{kwargs['suffix']}"
870
- # the following will only be true if the suffix changes!
871
- if key_path.name != new_filename:
872
- self._clear_storagekey = self.key
873
- self.key = str(key_path.with_name(new_filename))
874
- logger.warning(
875
- f"replacing the file will replace key '{key_path}' with '{self.key}'"
876
- f" and delete '{key_path}' upon `save()`"
877
- )
878
- else:
879
- old_storage = auto_storage_key_from_artifact(self)
880
- is_dir = self.n_objects is not None
881
- new_storage = auto_storage_key_from_artifact_uid(
882
- self.uid, kwargs["suffix"], is_dir
883
- )
884
- if old_storage != new_storage:
885
- self._clear_storagekey = old_storage
886
- if self.key is not None:
887
- new_key_path = PurePosixPath(self.key).with_suffix(kwargs["suffix"])
888
- self.key = str(new_key_path)
889
-
890
- self.suffix = kwargs["suffix"]
891
- self.size = kwargs["size"]
892
- self.hash = kwargs["hash"]
893
- self._hash_type = kwargs["_hash_type"]
894
- self.run_id = kwargs["run_id"]
895
- self.run = kwargs["run"]
896
-
897
- self._local_filepath = privates["local_filepath"]
898
- self._cloud_filepath = privates["cloud_filepath"]
899
- self._memory_rep = privates["memory_rep"]
900
- # no need to upload if new file is already in storage
901
- self._to_store = not check_path_in_storage
902
-
903
-
904
- # docstring handled through attach_func_to_class_method
905
- def open(
906
- self, mode: str = "r", is_run_input: bool | None = None
907
- ) -> AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment:
908
- # ignore empty suffix for now
909
- suffixes = (".h5", ".hdf5", ".h5ad", ".zarr", ".tiledbsoma", "")
910
- if self.suffix not in suffixes:
911
- raise ValueError(
912
- "Artifact should have a zarr, h5 or tiledbsoma object as the underlying data, please"
913
- " use one of the following suffixes for the object name:"
914
- f" {', '.join(suffixes[:-1])}."
915
- )
916
- if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
917
- raise ValueError("Only a tiledbsoma store can be openened with `mode!='r'`.")
918
-
919
- from lamindb.core.storage._backed_access import _track_writes_factory, backed_access
920
-
921
- using_key = settings._using_key
922
- filepath = filepath_from_artifact(self, using_key=using_key)
923
- is_tiledbsoma_w = (
924
- filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
925
- ) and mode == "w"
926
- # consider the case where an object is already locally cached
927
- localpath = setup_settings.instance.storage.cloud_to_local_no_update(filepath)
928
- if not is_tiledbsoma_w and localpath.exists():
929
- access = backed_access(localpath, mode, using_key)
930
- else:
931
- access = backed_access(filepath, mode, using_key)
932
- if is_tiledbsoma_w:
933
-
934
- def finalize():
935
- nonlocal self, filepath, localpath
936
- if not isinstance(filepath, LocalPathClasses):
937
- _, hash, _, _ = get_stat_dir_cloud(filepath)
938
- else:
939
- # this can be very slow
940
- _, hash, _, _ = hash_dir(filepath)
941
- if self.hash != hash:
942
- from ._record import init_self_from_db
943
-
944
- new_version = Artifact(
945
- filepath, revises=self, _is_internal_call=True
946
- ).save()
947
- init_self_from_db(self, new_version)
948
-
949
- if localpath != filepath and localpath.exists():
950
- shutil.rmtree(localpath)
951
-
952
- access = _track_writes_factory(access, finalize)
953
- # only call if open is successfull
954
- _track_run_input(self, is_run_input)
955
- return access
956
-
957
-
958
- # can't really just call .cache in .load because of double tracking
959
- def _synchronize_cleanup_on_error(filepath: UPath) -> UPath:
960
- try:
961
- cache_path = setup_settings.instance.storage.cloud_to_local(
962
- filepath, print_progress=True
963
- )
964
- except Exception as e:
965
- if not isinstance(filepath, LocalPathClasses):
966
- cache_path = setup_settings.instance.storage.cloud_to_local_no_update(
967
- filepath
968
- )
969
- if cache_path.is_file():
970
- cache_path.unlink(missing_ok=True)
971
- elif cache_path.is_dir():
972
- shutil.rmtree(cache_path)
973
- raise e
974
- return cache_path
975
-
976
-
977
- # docstring handled through attach_func_to_class_method
978
- def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
979
- if hasattr(self, "_memory_rep") and self._memory_rep is not None:
980
- access_memory = self._memory_rep
981
- else:
982
- filepath = filepath_from_artifact(self, using_key=settings._using_key)
983
- cache_path = _synchronize_cleanup_on_error(filepath)
984
- access_memory = load_to_memory(cache_path, **kwargs)
985
- # only call if load is successfull
986
- _track_run_input(self, is_run_input)
987
- return access_memory
988
-
989
-
990
- # docstring handled through attach_func_to_class_method
991
- def cache(self, is_run_input: bool | None = None) -> Path:
992
- filepath = filepath_from_artifact(self, using_key=settings._using_key)
993
- cache_path = _synchronize_cleanup_on_error(filepath)
994
- # only call if sync is successfull
995
- _track_run_input(self, is_run_input)
996
- return cache_path
997
-
998
-
999
- # docstring handled through attach_func_to_class_method
1000
- def delete(
1001
- self,
1002
- permanent: bool | None = None,
1003
- storage: bool | None = None,
1004
- using_key: str | None = None,
1005
- ) -> None:
1006
- # this first check means an invalid delete fails fast rather than cascading through
1007
- # database and storage permission errors
1008
- if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
1009
- isettings = setup_settings.instance
1010
- if self.storage.instance_uid != isettings.uid and (storage or storage is None):
1011
- raise IntegrityError(
1012
- "Cannot simply delete artifacts outside of this instance's managed storage locations."
1013
- "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
1014
- f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
1015
- f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
1016
- )
1017
- # by default, we only move artifacts into the trash (visibility = -1)
1018
- trash_visibility = VisibilityChoice.trash.value
1019
- if self.visibility > trash_visibility and not permanent:
1020
- if storage is not None:
1021
- logger.warning("moving artifact to trash, storage arg is ignored")
1022
- # move to trash
1023
- self.visibility = trash_visibility
1024
- self.save()
1025
- logger.important(f"moved artifact to trash (visibility = {trash_visibility})")
1026
- return
1027
-
1028
- # if the artifact is already in the trash
1029
- # permanent delete skips the trash
1030
- if permanent is None:
1031
- # ask for confirmation of permanent delete
1032
- response = input(
1033
- "Artifact record is already in trash! Are you sure you want to permanently"
1034
- " delete it? (y/n) You can't undo this action."
1035
- )
1036
- delete_record = response == "y"
1037
- else:
1038
- assert permanent # noqa: S101
1039
- delete_record = True
1040
-
1041
- if delete_record:
1042
- # need to grab file path before deletion
1043
- try:
1044
- path = filepath_from_artifact(self, using_key)
1045
- except OSError:
1046
- # we can still delete the record
1047
- logger.warning("Could not get path")
1048
- storage = False
1049
- # only delete in storage if DB delete is successful
1050
- # DB delete might error because of a foreign key constraint violated etc.
1051
- self._delete_skip_storage()
1052
- if self.key is None or self._key_is_virtual:
1053
- # do not ask for confirmation also if storage is None
1054
- delete_in_storage = storage is None or storage
1055
- else:
1056
- # for artifacts with non-virtual semantic storage keys (key is not None)
1057
- # ask for extra-confirmation
1058
- if storage is None:
1059
- response = input(
1060
- f"Are you sure to want to delete {path}? (y/n) You can't undo"
1061
- " this action."
1062
- )
1063
- delete_in_storage = response == "y"
1064
- else:
1065
- delete_in_storage = storage
1066
- if not delete_in_storage:
1067
- logger.important(f"a file/folder remains here: {path}")
1068
- # we don't yet have logic to bring back the deleted metadata record
1069
- # in case storage deletion fails - this is important for ACID down the road
1070
- if delete_in_storage:
1071
- delete_msg = delete_storage(path, raise_file_not_found_error=False)
1072
- if delete_msg != "did-not-delete":
1073
- logger.success(f"deleted {colors.yellow(f'{path}')}")
1074
-
1075
-
1076
- def _delete_skip_storage(artifact, *args, **kwargs) -> None:
1077
- super(Artifact, artifact).delete(*args, **kwargs)
1078
-
1079
-
1080
- # docstring handled through attach_func_to_class_method
1081
- def save(self, upload: bool | None = None, **kwargs) -> Artifact:
1082
- state_was_adding = self._state.adding
1083
- print_progress = kwargs.pop("print_progress", True)
1084
- access_token = kwargs.pop("access_token", None)
1085
- local_path = None
1086
- if upload and setup_settings.instance.keep_artifacts_local:
1087
- # switch local storage location to cloud
1088
- local_path = self.path
1089
- self.storage_id = setup_settings.instance.storage.id
1090
- self._local_filepath = local_path
1091
- # switch to virtual storage key upon upload
1092
- # the local filepath is already cached at that point
1093
- self._key_is_virtual = True
1094
- # ensure that the artifact is uploaded
1095
- self._to_store = True
1096
-
1097
- self._save_skip_storage(**kwargs)
1098
-
1099
- from lamindb._save import check_and_attempt_clearing, check_and_attempt_upload
1100
-
1101
- using_key = None
1102
- if "using" in kwargs:
1103
- using_key = kwargs["using"]
1104
- exception = check_and_attempt_upload(
1105
- self, using_key, access_token=access_token, print_progress=print_progress
1106
- )
1107
- if exception is not None:
1108
- self._delete_skip_storage()
1109
- raise RuntimeError(exception)
1110
- exception = check_and_attempt_clearing(self, using_key)
1111
- if exception is not None:
1112
- raise RuntimeError(exception)
1113
- if local_path is not None and not state_was_adding:
1114
- # only move the local artifact to cache if it was not newly created
1115
- local_path_cache = ln_setup.settings.storage.cache_dir / local_path.name
1116
- # don't use Path.rename here because of cross-device link error
1117
- # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
1118
- shutil.move(
1119
- local_path, # type: ignore
1120
- local_path_cache,
1121
- )
1122
- logger.important(f"moved local artifact to cache: {local_path_cache}")
1123
- return self
1124
-
1125
-
1126
- def _save_skip_storage(file, **kwargs) -> None:
1127
- save_feature_sets(file)
1128
- super(Artifact, file).save(**kwargs)
1129
- save_feature_set_links(file)
1130
-
1131
-
1132
- @property # type: ignore
1133
- @doc_args(Artifact.path.__doc__)
1134
- def path(self) -> Path | UPath:
1135
- """{}""" # noqa: D415
1136
- using_key = settings._using_key
1137
- return filepath_from_artifact(self, using_key)
1138
-
1139
-
1140
- # docstring handled through attach_func_to_class_method
1141
- def restore(self) -> None:
1142
- self.visibility = VisibilityChoice.default.value
1143
- self.save()
1144
-
1145
-
1146
- METHOD_NAMES = [
1147
- "__init__",
1148
- "from_anndata",
1149
- "from_df",
1150
- "from_mudata",
1151
- "open",
1152
- "cache",
1153
- "load",
1154
- "delete",
1155
- "save",
1156
- "replace",
1157
- "from_dir",
1158
- "restore",
1159
- ]
1160
-
1161
- if ln_setup._TESTING:
1162
- from inspect import signature
1163
-
1164
- SIGS = {
1165
- name: signature(getattr(Artifact, name))
1166
- for name in METHOD_NAMES
1167
- if name != "__init__"
1168
- }
1169
-
1170
- for name in METHOD_NAMES:
1171
- attach_func_to_class_method(name, Artifact, globals())
1172
-
1173
- # privates currently dealt with separately
1174
- Artifact._delete_skip_storage = _delete_skip_storage
1175
- Artifact._save_skip_storage = _save_skip_storage
1176
- Artifact.path = path
1177
- Artifact.describe = describe
1178
- Artifact.view_lineage = view_lineage
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import shutil
5
+ from pathlib import Path, PurePath, PurePosixPath
6
+ from typing import TYPE_CHECKING, Any, Mapping
7
+
8
+ import fsspec
9
+ import lamindb_setup as ln_setup
10
+ import pandas as pd
11
+ from anndata import AnnData
12
+ from django.db.models import Q, QuerySet
13
+ from lamin_utils import colors, logger
14
+ from lamindb_setup import settings as setup_settings
15
+ from lamindb_setup._init_instance import register_storage_in_instance
16
+ from lamindb_setup.core._docs import doc_args
17
+ from lamindb_setup.core._settings_storage import init_storage
18
+ from lamindb_setup.core.hashing import hash_dir, hash_file
19
+ from lamindb_setup.core.upath import (
20
+ create_path,
21
+ extract_suffix_from_path,
22
+ get_stat_dir_cloud,
23
+ get_stat_file_cloud,
24
+ )
25
+ from lnschema_core.models import Artifact, FeatureManager, ParamManager, Run, Storage
26
+ from lnschema_core.types import (
27
+ VisibilityChoice,
28
+ )
29
+
30
+ from lamindb._utils import attach_func_to_class_method
31
+ from lamindb.core._data import _track_run_input, describe, view_lineage
32
+ from lamindb.core._settings import settings
33
+ from lamindb.core.exceptions import IntegrityError, InvalidArgument
34
+ from lamindb.core.loaders import load_to_memory
35
+ from lamindb.core.storage import (
36
+ LocalPathClasses,
37
+ UPath,
38
+ delete_storage,
39
+ infer_suffix,
40
+ write_to_disk,
41
+ )
42
+ from lamindb.core.storage.paths import (
43
+ auto_storage_key_from_artifact,
44
+ auto_storage_key_from_artifact_uid,
45
+ check_path_is_child_of_root,
46
+ filepath_cache_key_from_artifact,
47
+ filepath_from_artifact,
48
+ )
49
+ from lamindb.core.versioning import (
50
+ create_uid,
51
+ message_update_key_in_version_family,
52
+ )
53
+
54
+ from .core._data import (
55
+ add_transform_to_kwargs,
56
+ get_run,
57
+ save_feature_set_links,
58
+ save_feature_sets,
59
+ )
60
+ from .core.storage.objects import _mudata_is_installed
61
+ from .core.storage.paths import AUTO_KEY_PREFIX
62
+
63
+ try:
64
+ from .core.storage._zarr import zarr_is_adata
65
+ except ImportError:
66
+
67
+ def zarr_is_adata(storepath): # type: ignore
68
+ raise ImportError("Please install zarr: pip install zarr")
69
+
70
+
71
+ if TYPE_CHECKING:
72
+ from lamindb_setup.core.types import UPathStr
73
+ from mudata import MuData
74
+ from tiledbsoma import Collection as SOMACollection
75
+ from tiledbsoma import Experiment as SOMAExperiment
76
+
77
+ from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
78
+
79
+
80
+ def process_pathlike(
81
+ filepath: UPath,
82
+ default_storage: Storage,
83
+ using_key: str | None,
84
+ skip_existence_check: bool = False,
85
+ ) -> tuple[Storage, bool]:
86
+ if not skip_existence_check:
87
+ try: # check if file exists
88
+ if not filepath.exists():
89
+ raise FileNotFoundError(filepath)
90
+ except PermissionError:
91
+ pass
92
+ if check_path_is_child_of_root(filepath, default_storage.root):
93
+ use_existing_storage_key = True
94
+ return default_storage, use_existing_storage_key
95
+ else:
96
+ # check whether the path is part of one of the existing
97
+ # already-registered storage locations
98
+ result = False
99
+ # within the hub, we don't want to perform check_path_in_existing_storage
100
+ if using_key is None:
101
+ result = check_path_in_existing_storage(filepath, using_key)
102
+ if isinstance(result, Storage):
103
+ use_existing_storage_key = True
104
+ return result, use_existing_storage_key
105
+ else:
106
+ # if the path is in the cloud, we have a good candidate
107
+ # for the storage root: the bucket
108
+ if not isinstance(filepath, LocalPathClasses):
109
+ # for a cloud path, new_root is always the bucket name
110
+ new_root = list(filepath.parents)[-1]
111
+ # do not register remote storage locations on hub if the current instance
112
+ # is not managed on the hub
113
+ storage_settings, _ = init_storage(
114
+ new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
115
+ )
116
+ storage_record = register_storage_in_instance(storage_settings)
117
+ use_existing_storage_key = True
118
+ return storage_record, use_existing_storage_key
119
+ # if the filepath is local
120
+ else:
121
+ use_existing_storage_key = False
122
+ # if the default storage is local we'll throw an error if the user
123
+ # doesn't provide a key
124
+ if default_storage.type == "local":
125
+ return default_storage, use_existing_storage_key
126
+ # if the default storage is in the cloud (the file is going to
127
+ # be uploaded upon saving it), we treat the filepath as a cache
128
+ else:
129
+ return default_storage, use_existing_storage_key
130
+
131
+
132
+ def process_data(
133
+ provisional_uid: str,
134
+ data: UPathStr | pd.DataFrame | AnnData,
135
+ format: str | None,
136
+ key: str | None,
137
+ default_storage: Storage,
138
+ using_key: str | None,
139
+ skip_existence_check: bool = False,
140
+ ) -> tuple[Any, Path | UPath, str, Storage, bool]:
141
+ """Serialize a data object that's provided as file or in memory."""
142
+ # if not overwritten, data gets stored in default storage
143
+ if _mudata_is_installed():
144
+ from mudata import MuData
145
+
146
+ data_types = (pd.DataFrame, AnnData, MuData)
147
+ else:
148
+ data_types = (pd.DataFrame, AnnData) # type:ignore
149
+
150
+ if isinstance(data, (str, Path, UPath)): # UPathStr, spelled out
151
+ access_token = (
152
+ default_storage._access_token
153
+ if hasattr(default_storage, "_access_token")
154
+ else None
155
+ )
156
+ path = create_path(data, access_token=access_token).resolve()
157
+ storage, use_existing_storage_key = process_pathlike(
158
+ path,
159
+ default_storage=default_storage,
160
+ using_key=using_key,
161
+ skip_existence_check=skip_existence_check,
162
+ )
163
+ suffix = extract_suffix_from_path(path)
164
+ memory_rep = None
165
+ elif isinstance(data, data_types):
166
+ storage = default_storage
167
+ memory_rep = data
168
+ if key is not None:
169
+ key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
170
+ # use suffix as the (adata) format if the format is not provided
171
+ if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
172
+ format = key_suffix[1:]
173
+ else:
174
+ key_suffix = None
175
+ suffix = infer_suffix(data, format)
176
+ if key_suffix is not None and key_suffix != suffix:
177
+ raise InvalidArgument(
178
+ f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
179
+ f" be '{suffix}'."
180
+ )
181
+ cache_name = f"{provisional_uid}{suffix}"
182
+ path = settings.storage.cache_dir / cache_name
183
+ # Alex: I don't understand the line below
184
+ if path.suffixes == []:
185
+ path = path.with_suffix(suffix)
186
+ write_to_disk(data, path)
187
+ use_existing_storage_key = False
188
+ else:
189
+ raise NotImplementedError(
190
+ f"Do not know how to create a artifact object from {data}, pass a path"
191
+ " instead!"
192
+ )
193
+ return memory_rep, path, suffix, storage, use_existing_storage_key
194
+
195
+
196
+ def get_stat_or_artifact(
197
+ path: UPath,
198
+ key: str | None = None,
199
+ check_hash: bool = True,
200
+ is_replace: bool = False,
201
+ instance: str | None = None,
202
+ ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
203
+ n_objects = None
204
+ if settings.creation.artifact_skip_size_hash:
205
+ return None, None, None, n_objects, None
206
+ stat = path.stat() # one network request
207
+ if not isinstance(path, LocalPathClasses):
208
+ size, hash, hash_type = None, None, None
209
+ if stat is not None:
210
+ # convert UPathStatResult to fsspec info dict
211
+ stat = stat.as_info()
212
+ if "ETag" in stat: # is file
213
+ size, hash, hash_type = get_stat_file_cloud(stat)
214
+ elif stat["type"] == "directory":
215
+ size, hash, hash_type, n_objects = get_stat_dir_cloud(path)
216
+ if hash is None:
217
+ logger.warning(f"did not add hash for {path}")
218
+ return size, hash, hash_type, n_objects, None
219
+ else:
220
+ if path.is_dir():
221
+ size, hash, hash_type, n_objects = hash_dir(path)
222
+ else:
223
+ hash, hash_type = hash_file(path)
224
+ size = stat.st_size
225
+ if not check_hash:
226
+ return size, hash, hash_type, n_objects, None
227
+ previous_artifact_version = None
228
+ if key is None or is_replace:
229
+ result = Artifact.objects.using(instance).filter(hash=hash).all()
230
+ artifact_with_same_hash_exists = len(result) > 0
231
+ else:
232
+ storage_id = settings.storage.id
233
+ result = (
234
+ Artifact.objects.using(instance)
235
+ .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
236
+ .order_by("-created_at")
237
+ .all()
238
+ )
239
+ artifact_with_same_hash_exists = len(result.filter(hash=hash).all()) > 0
240
+ if not artifact_with_same_hash_exists and len(result) > 0:
241
+ logger.important(
242
+ f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
243
+ )
244
+ previous_artifact_version = result[0]
245
+ if artifact_with_same_hash_exists:
246
+ if settings.creation.artifact_if_hash_exists == "error":
247
+ msg = f"artifact with same hash exists: {result[0]}"
248
+ hint = (
249
+ "💡 you can make this error a warning:\n"
250
+ " ln.settings.creation.artifact_if_hash_exists"
251
+ )
252
+ raise FileExistsError(f"{msg}\n{hint}")
253
+ elif settings.creation.artifact_if_hash_exists == "warn_create_new":
254
+ logger.warning(
255
+ "creating new Artifact object despite existing artifact with same hash:"
256
+ f" {result[0]}"
257
+ )
258
+ return size, hash, hash_type, n_objects, None
259
+ else:
260
+ if result[0].visibility == -1:
261
+ raise FileExistsError(
262
+ f"You're trying to re-create this artifact in trash: {result[0]}"
263
+ "Either permanently delete it with `artifact.delete(permanent=True)` or restore it with `artifact.restore()`"
264
+ )
265
+ logger.important(f"returning existing artifact with same hash: {result[0]}")
266
+ return result[0]
267
+ else:
268
+ return size, hash, hash_type, n_objects, previous_artifact_version
269
+
270
+
271
+ def check_path_in_existing_storage(
+     path: Path | UPath, using_key: str | None = None
+ ) -> Storage | bool:
+     for storage in Storage.objects.using(using_key).filter().all():
+         # if path is part of storage, return it
+         if check_path_is_child_of_root(path, root=storage.root):
+             return storage
+     return False
+
+
+ def get_relative_path_to_directory(
+     path: PurePath | Path | UPath, directory: PurePath | Path | UPath
+ ) -> PurePath | Path:
+     if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
+         # UPath.relative_to() is not behaving as it should (2023-04-07)
+         # need to lstrip otherwise inconsistent behavior across trailing slashes
+         # see test_artifact.py: test_get_relative_path_to_directory
+         relpath = PurePath(
+             path.as_posix().replace(directory.as_posix(), "").lstrip("/")
+         )
+     elif isinstance(directory, Path):
+         relpath = path.resolve().relative_to(directory.resolve())  # type: ignore
+     elif isinstance(directory, PurePath):
+         relpath = path.relative_to(directory)
+     else:
+         raise TypeError("Directory not of type Path or UPath")
+     return relpath
+
+
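A small illustration of this helper's contract (hypothetical paths; the UPath branch strips the root as a plain string prefix, which is why the `lstrip` is needed to behave consistently across trailing slashes):

from pathlib import Path, PurePath

# local Path branch: both sides are resolved before relativizing
get_relative_path_to_directory(Path("/data/project/file.h5ad"), Path("/data"))
# -> PosixPath('project/file.h5ad')

# PurePath branch: plain relative_to() without resolution
get_relative_path_to_directory(PurePath("a/b/c.csv"), PurePath("a"))
# -> PurePath('b/c.csv')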
+ def get_artifact_kwargs_from_data(
+     *,
+     data: Path | UPath | str | pd.DataFrame | AnnData | MuData,
+     key: str | None,
+     run: Run | None,
+     format: str | None,
+     provisional_uid: str,
+     version: str | None,
+     default_storage: Storage,
+     using_key: str | None = None,
+     is_replace: bool = False,
+     skip_check_exists: bool = False,
+ ):
+     run = get_run(run)
+     memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
+         provisional_uid,
+         data,
+         format,
+         key,
+         default_storage,
+         using_key,
+         skip_check_exists,
+     )
+     stat_or_artifact = get_stat_or_artifact(
+         path=path,
+         key=key,
+         instance=using_key,
+         is_replace=is_replace,
+     )
+     if isinstance(stat_or_artifact, Artifact):
+         artifact = stat_or_artifact
+         # update the run of the existing artifact
+         if run is not None:
+             # save the information that this artifact was previously
+             # produced by another run
+             if artifact.run is not None:
+                 artifact.run._output_artifacts_with_later_updates.add(artifact)
+             # update the run of the artifact with the latest run
+             stat_or_artifact.run = run
+             stat_or_artifact.transform = run.transform
+         return artifact, None
+     else:
+         size, hash, hash_type, n_objects, revises = stat_or_artifact
+
+     if revises is not None:  # update provisional_uid
+         provisional_uid, revises = create_uid(revises=revises, version=version)
+         if settings.storage.cache_dir in path.parents:
+             path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
+
+     check_path_in_storage = False
+     if use_existing_storage_key:
+         inferred_key = get_relative_path_to_directory(
+             path=path, directory=UPath(storage.root)
+         ).as_posix()
+         if key is None:
+             key = inferred_key
+         else:
+             if not key == inferred_key:
+                 raise InvalidArgument(
+                     f"The path '{data}' is already in registered storage"
+                     f" '{storage.root}' with key '{inferred_key}'\nYou passed"
+                     f" conflicting key '{key}': please move the file before"
+                     " registering it."
+                 )
+         check_path_in_storage = True
+     else:
+         storage = default_storage
+
+     log_storage_hint(
+         check_path_in_storage=check_path_in_storage,
+         storage=storage,
+         key=key,
+         uid=provisional_uid,
+         suffix=suffix,
+         is_dir=n_objects is not None,
+     )
+
+     # do we use a virtual or an actual storage key?
+     key_is_virtual = settings.creation._artifact_use_virtual_keys
+
+     # if the file is already in storage, independent of the default
+     # we use an actual storage key
+     if check_path_in_storage:
+         key_is_virtual = False
+
+     kwargs = {
+         "uid": provisional_uid,
+         "suffix": suffix,
+         "hash": hash,
+         "_hash_type": hash_type,
+         "key": key,
+         "size": size,
+         "storage_id": storage.id,
+         # passing both the id and the object
+         # to make them both available immediately
+         # after object creation
+         "n_objects": n_objects,
+         "n_observations": None,  # to implement
+         "run_id": run.id if run is not None else None,
+         "run": run,
+         "_key_is_virtual": key_is_virtual,
+         "revises": revises,
+     }
+     if not isinstance(path, LocalPathClasses):
+         local_filepath = None
+         cloud_filepath = path
+     else:
+         local_filepath = path
+         cloud_filepath = None
+     privates = {
+         "local_filepath": local_filepath,
+         "cloud_filepath": cloud_filepath,
+         "memory_rep": memory_rep,
+         "check_path_in_storage": check_path_in_storage,
+     }
+     return kwargs, privates
+
+
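The virtual-key decision above is the crux of this function: a path that already sits in a registered storage location keeps an actual storage key and stays in place, while everything else gets a virtual key and is copied upon save(). A hedged sketch (assuming `s3://my-bucket` is a registered storage location and virtual keys are enabled, which is the default):

import lamindb as ln

# path already in registered storage: actual key, file is not moved
in_storage = ln.Artifact("s3://my-bucket/raw/batch1.h5ad")
assert not in_storage._key_is_virtual

# local path outside any storage location: virtual key, content copied upon save()
outside = ln.Artifact("./batch2.h5ad", key="raw/batch2.h5ad")
assert outside._key_is_virtual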
+ def log_storage_hint(
+     *,
+     check_path_in_storage: bool,
+     storage: Storage | None,
+     key: str | None,
+     uid: str,
+     suffix: str,
+     is_dir: bool,
+ ) -> None:
+     hint = ""
+     if check_path_in_storage:
+         display_root = storage.root  # type: ignore
+         # check whether path is local
+         if fsspec.utils.get_protocol(storage.root) == "file":  # type: ignore
+             # if it's a local path, check whether it's in the current working directory
+             root_path = Path(storage.root)  # type: ignore
+             if check_path_is_child_of_root(root_path, Path.cwd()):
+                 # only display the relative path, not the fully resolved path
+                 display_root = root_path.relative_to(Path.cwd())
+         hint += f"path in storage '{display_root}'"  # type: ignore
+     else:
+         hint += "path content will be copied to default storage upon `save()`"
+     if key is None:
+         storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
+         hint += f" with key `None` ('{storage_key}')"
+     else:
+         hint += f" with key '{key}'"
+     logger.hint(hint)
+
+
+ def data_is_anndata(data: AnnData | UPathStr) -> bool:
+     if isinstance(data, AnnData):
+         return True
+     if isinstance(data, (str, Path, UPath)):
+         data_path = UPath(data)
+         if data_path.suffix == ".h5ad":
+             return True
+         elif data_path.suffix == ".zarr":
+             # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
+             if ".anndata" in data_path.suffixes:
+                 return True
+             # check only for local, expensive for cloud
+             if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
+                 return zarr_is_adata(data_path)
+             else:
+                 logger.warning("We do not check whether a cloud zarr store is AnnData.")
+                 return False
+     return False
+
+
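The detection is name-driven except for the last branch, which inspects local zarr stores via `zarr_is_adata`; for instance:

data_is_anndata("obs.h5ad")            # True: suffix match
data_is_anndata("store.anndata.zarr")  # True: composite suffix match
data_is_anndata("store.zarr")          # local: store is inspected; cloud: False
data_is_anndata("table.csv")           # False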
+ def data_is_mudata(data: MuData | UPathStr) -> bool:
+     if _mudata_is_installed():
+         from mudata import MuData
+
+         if isinstance(data, MuData):
+             return True
+     if isinstance(data, (str, Path)):
+         return UPath(data).suffix in {".h5mu"}
+     return False
+
+
+ def _check_accessor_artifact(data: Any, accessor: str | None = None):
+     if accessor is None:
+         if isinstance(data, pd.DataFrame):
+             logger.warning("data is a DataFrame, please use .from_df()")
+             accessor = "DataFrame"
+             return accessor
+
+         data_is_path = isinstance(data, (str, Path))
+         if data_is_anndata(data):
+             if not data_is_path:
+                 logger.warning("data is an AnnData, please use .from_anndata()")
+             accessor = "AnnData"
+         elif data_is_mudata(data):
+             if not data_is_path:
+                 logger.warning("data is a MuData, please use .from_mudata()")
+             accessor = "MuData"
+         elif not data_is_path:  # UPath is a subclass of Path
+             raise TypeError("data has to be a string, a Path, or a UPath")
+     return accessor
+
+
+ def __init__(artifact: Artifact, *args, **kwargs):
+     artifact.features = FeatureManager(artifact)
+     artifact.params = ParamManager(artifact)
+     # Below checks for the Django-internal call in from_db()
+     # it'd be better if we could avoid this, but not being able to create an Artifact
+     # from data with the default constructor renders the central class of the API
+     # essentially useless
+     # The danger below is not that a user might pass as many args (12 of them), but rather
+     # that at some point the Django API might change; on the other hand, this
+     # condition for calling the constructor based on kwargs should always
+     # stay robust
+     if len(args) == len(artifact._meta.concrete_fields):
+         super(Artifact, artifact).__init__(*args, **kwargs)
+         return None
+     # now we proceed with the user-facing constructor
+     if len(args) > 1:
+         raise ValueError("Only one non-keyword arg allowed: data")
+
+     data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
+     type: str = kwargs.pop("type") if "type" in kwargs else None
+     key: str | None = kwargs.pop("key") if "key" in kwargs else None
+     run: Run | None = kwargs.pop("run") if "run" in kwargs else None
+     description: str | None = (
+         kwargs.pop("description") if "description" in kwargs else None
+     )
+     revises: Artifact | None = kwargs.pop("revises") if "revises" in kwargs else None
+     version: str | None = kwargs.pop("version") if "version" in kwargs else None
+     visibility: int | None = (
+         kwargs.pop("visibility")
+         if "visibility" in kwargs
+         else VisibilityChoice.default.value
+     )
+     format = kwargs.pop("format") if "format" in kwargs else None
+     _is_internal_call = kwargs.pop("_is_internal_call", False)
+     skip_check_exists = (
+         kwargs.pop("skip_check_exists") if "skip_check_exists" in kwargs else False
+     )
+     if "default_storage" in kwargs:
+         default_storage = kwargs.pop("default_storage")
+     else:
+         if setup_settings.instance.keep_artifacts_local:
+             default_storage = setup_settings.instance.storage_local.record
+         else:
+             default_storage = setup_settings.instance.storage.record
+     using_key = (
+         kwargs.pop("using_key") if "using_key" in kwargs else settings._using_key
+     )
+     accessor = kwargs.pop("_accessor") if "_accessor" in kwargs else None
+     accessor = _check_accessor_artifact(data=data, accessor=accessor)
+     if "is_new_version_of" in kwargs:
+         logger.warning("`is_new_version_of` will be removed soon, please use `revises`")
+         revises = kwargs.pop("is_new_version_of")
+     if not len(kwargs) == 0:
+         raise ValueError(
+             "Only data, key, run, description, version, revises, visibility"
+             f" can be passed, you passed: {kwargs}"
+         )
+     if revises is not None and key is not None and revises.key != key:
+         note = message_update_key_in_version_family(
+             suid=revises.stem_uid,
+             existing_key=revises.key,
+             new_key=key,
+             registry="Artifact",
+         )
+         raise ValueError(
+             f"`key` is {key}, but `revises.key` is '{revises.key}'\n\nEither do *not* pass `key`.\n\n{note}"
+         )
+     if revises is not None:
+         if not isinstance(revises, Artifact):
+             raise TypeError("`revises` has to be of type `Artifact`")
+         if description is None:
+             description = revises.description
+     if key is not None and AUTO_KEY_PREFIX in key:
+         raise ValueError(
+             f"Do not pass a key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
+         )
+     # below is for internal calls that require defining the storage location
+     # ahead of constructing the Artifact
+     if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
+         if _is_internal_call:
+             is_automanaged_path = True
+             user_provided_key = key
+             key = None
+         else:
+             raise ValueError(
+                 f"Do not pass a path inside the `{AUTO_KEY_PREFIX}` directory."
+             )
+     else:
+         is_automanaged_path = False
+     provisional_uid, revises = create_uid(revises=revises, version=version)
+     kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
+         data=data,
+         key=key,
+         run=run,
+         format=format,
+         provisional_uid=provisional_uid,
+         version=version,
+         default_storage=default_storage,
+         using_key=using_key,
+         skip_check_exists=skip_check_exists,
+     )
+
+     # an object with the same hash already exists
+     if isinstance(kwargs_or_artifact, Artifact):
+         from ._record import init_self_from_db, update_attributes
+
+         init_self_from_db(artifact, kwargs_or_artifact)
+         # adding "key" here is dangerous because key might be auto-populated
+         update_attributes(artifact, {"description": description})
+         if artifact.key != key and key is not None:
+             logger.warning(
+                 f"key {artifact.key} on existing artifact differs from passed key {key}"
+             )
+         return None
+     else:
+         kwargs = kwargs_or_artifact
+
+     if revises is None:
+         revises = kwargs_or_artifact.pop("revises")
+
+     if data is not None:
+         artifact._local_filepath = privates["local_filepath"]
+         artifact._cloud_filepath = privates["cloud_filepath"]
+         artifact._memory_rep = privates["memory_rep"]
+         artifact._to_store = not privates["check_path_in_storage"]
+
+     if is_automanaged_path and _is_internal_call:
+         kwargs["_key_is_virtual"] = True
+         assert AUTO_KEY_PREFIX in kwargs["key"]  # noqa: S101
+         uid = kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
+         kwargs["key"] = user_provided_key
+         if revises is not None:
+             assert uid.startswith(revises.stem_uid)  # noqa: S101
+         if len(uid) == 16:
+             if revises is None:
+                 uid += "0000"
+             else:
+                 uid, revises = create_uid(revises=revises, version=version)
+         kwargs["uid"] = uid
+
+     # only set key now so that we don't do a look-up on it in case revises is passed
+     if revises is not None:
+         kwargs["key"] = revises.key
+
+     kwargs["type"] = type
+     kwargs["version"] = version
+     kwargs["description"] = description
+     kwargs["visibility"] = visibility
+     kwargs["_accessor"] = accessor
+     kwargs["revises"] = revises
+     # this check needs to come down here because key might be populated from an
+     # existing file path during get_artifact_kwargs_from_data()
+     if (
+         kwargs["key"] is None
+         and kwargs["description"] is None
+         and kwargs["run"] is None
+     ):
+         raise ValueError("Pass one of key, run or description as a parameter")
+
+     add_transform_to_kwargs(kwargs, kwargs["run"])
+
+     super(Artifact, artifact).__init__(**kwargs)
+
+
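Putting the constructor branches together, typical user-facing calls look like this (a sketch assuming a tracked instance; per the final check above, at least one of key, description, or run must be passed):

import lamindb as ln

# register a local file under a semantic key
artifact = ln.Artifact("./results/summary.parquet", key="results/summary.parquet").save()

# register updated content as a new version; the key is inherited from `revises`
updated = ln.Artifact("./results/summary_v2.parquet", revises=artifact).save()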
+ @classmethod  # type: ignore
+ @doc_args(Artifact.from_df.__doc__)
+ def from_df(
+     cls,
+     df: pd.DataFrame,
+     key: str | None = None,
+     description: str | None = None,
+     run: Run | None = None,
+     revises: Artifact | None = None,
+     **kwargs,
+ ) -> Artifact:
+     """{}"""  # noqa: D415
+     artifact = Artifact(
+         data=df,
+         key=key,
+         run=run,
+         description=description,
+         revises=revises,
+         _accessor="DataFrame",
+         type="dataset",
+         **kwargs,
+     )
+     return artifact
+
+
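Usage is the plain constructor with accessor and dataset type pre-filled, e.g.:

import lamindb as ln
import pandas as pd

df = pd.DataFrame({"sample": ["s1", "s2"], "value": [0.1, 0.2]})
artifact = ln.Artifact.from_df(df, description="toy measurements").save()
assert artifact.suffix == ".parquet"  # DataFrames are serialized to parquet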
+ @classmethod  # type: ignore
+ @doc_args(Artifact.from_anndata.__doc__)
+ def from_anndata(
+     cls,
+     adata: AnnData | UPathStr,
+     key: str | None = None,
+     description: str | None = None,
+     run: Run | None = None,
+     revises: Artifact | None = None,
+     **kwargs,
+ ) -> Artifact:
+     """{}"""  # noqa: D415
+     if not data_is_anndata(adata):
+         raise ValueError("data has to be an AnnData object or a path to AnnData-like")
+     artifact = Artifact(
+         data=adata,
+         key=key,
+         run=run,
+         description=description,
+         revises=revises,
+         _accessor="AnnData",
+         type="dataset",
+         **kwargs,
+     )
+     return artifact
+
+
+ @classmethod  # type: ignore
+ @doc_args(Artifact.from_mudata.__doc__)
+ def from_mudata(
+     cls,
+     mdata: MuData,
+     key: str | None = None,
+     description: str | None = None,
+     run: Run | None = None,
+     revises: Artifact | None = None,
+     **kwargs,
+ ) -> Artifact:
+     """{}"""  # noqa: D415
+     artifact = Artifact(
+         data=mdata,
+         key=key,
+         run=run,
+         description=description,
+         revises=revises,
+         _accessor="MuData",
+         type="dataset",
+         **kwargs,
+     )
+     return artifact
+
+
+ @classmethod  # type: ignore
+ @doc_args(Artifact.from_dir.__doc__)
+ def from_dir(
+     cls,
+     path: UPathStr,
+     key: str | None = None,
+     *,
+     run: Run | None = None,
+ ) -> list[Artifact]:
+     """{}"""  # noqa: D415
+     logger.warning(
+         "this creates one artifact per file in the directory - consider"
+         " ln.Artifact(dir_path) to get one artifact for the entire directory"
+     )
+     folderpath: UPath = create_path(path)  # returns Path for local
+     default_storage = settings.storage.record
+     using_key = settings._using_key
+     storage, use_existing_storage = process_pathlike(
+         folderpath, default_storage, using_key
+     )
+     folder_key_path: PurePath | Path
+     if key is None:
+         if not use_existing_storage:
+             logger.warning(
+                 "folder is outside existing storage location, will copy files from"
+                 f" {path} to {storage.root}/{folderpath.name}"
+             )
+             folder_key_path = Path(folderpath.name)
+         else:
+             # maintain the hierarchy within an existing storage location
+             folder_key_path = get_relative_path_to_directory(
+                 folderpath, UPath(storage.root)
+             )
+     else:
+         folder_key_path = Path(key)
+
+     # always sanitize by stripping a trailing slash
+     folder_key = folder_key_path.as_posix().rstrip("/")
+
+     # TODO: (non-local) UPath doesn't list the first level artifacts and dirs with "*"
+     pattern = "" if not isinstance(folderpath, LocalPathClasses) else "*"
+
+     # silence fine-grained logging
+     verbosity = settings.verbosity
+     verbosity_int = settings._verbosity_int
+     if verbosity_int >= 1:
+         settings.verbosity = "warning"
+     artifacts_dict = {}
+     for filepath in folderpath.rglob(pattern):
+         if filepath.is_file():
+             relative_path = get_relative_path_to_directory(filepath, folderpath)
+             artifact_key = folder_key + "/" + relative_path.as_posix()
+             # if creating from rglob, we don't need to check for existence
+             artifact = Artifact(
+                 filepath, run=run, key=artifact_key, skip_check_exists=True
+             )
+             artifacts_dict[artifact.uid] = artifact
+     settings.verbosity = verbosity
+
+     # run sanity check on hashes
+     hashes = [
+         artifact.hash
+         for artifact in artifacts_dict.values()
+         if artifact.hash is not None
+     ]
+     uids = artifacts_dict.keys()
+     if len(set(hashes)) == len(hashes):
+         artifacts = list(artifacts_dict.values())
+     else:
+         # consider exact duplicates (same id, same hash)
+         # below can't happen anymore because artifacts is a dict now
+         # if len(set(uids)) == len(set(hashes)):
+         #     logger.warning("dropping duplicate records in list of artifact records")
+         #     artifacts = list(set(uids))
+         # consider false duplicates (different id, same hash)
+         if not len(set(uids)) == len(set(hashes)):
+             seen_hashes = set()
+             non_unique_artifacts = {
+                 hash: artifact
+                 for hash, artifact in artifacts_dict.items()
+                 if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash)  # type: ignore
+             }
+             display_non_unique = "\n    ".join(
+                 f"{artifact}" for artifact in non_unique_artifacts
+             )
+             logger.warning(
+                 "there are multiple artifact uids with the same hashes, dropping"
+                 f" {len(non_unique_artifacts)} duplicates out of"
+                 f" {len(artifacts_dict)} artifacts:\n    {display_non_unique}"
+             )
+             artifacts = [
+                 artifact
+                 for artifact in artifacts_dict.values()
+                 if artifact not in non_unique_artifacts.values()
+             ]
+     logger.success(
+         f"created {len(artifacts)} artifacts from directory using storage"
+         f" {storage.root} and key = {folder_key}/"
+     )
+     return artifacts
+
+
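A sketch of the one-artifact-per-file behavior (hypothetical directory layout):

import lamindb as ln

# ./images/ contains a.png and b.png
artifacts = ln.Artifact.from_dir("./images")
ln.save(artifacts)  # keys become "images/a.png" and "images/b.png"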
+ # docstring handled through attach_func_to_class_method
+ def replace(
+     self,
+     data: UPathStr,
+     run: Run | None = None,
+     format: str | None = None,
+ ) -> None:
+     default_storage = settings.storage.record
+     kwargs, privates = get_artifact_kwargs_from_data(
+         provisional_uid=self.uid,
+         data=data,
+         key=self.key,
+         run=run,
+         format=format,
+         default_storage=default_storage,
+         version=None,
+         is_replace=True,
+     )
+
+     # this artifact already exists
+     if privates is None:
+         return kwargs
+
+     check_path_in_storage = privates["check_path_in_storage"]
+     if check_path_in_storage:
+         raise ValueError("Can only replace with a local file not in any Storage.")
+
+     if self.key is not None and not self._key_is_virtual:
+         key_path = PurePosixPath(self.key)
+         new_filename = f"{key_path.stem}{kwargs['suffix']}"
+         # the following will only be true if the suffix changes!
+         if key_path.name != new_filename:
+             self._clear_storagekey = self.key
+             self.key = str(key_path.with_name(new_filename))
+             logger.warning(
+                 f"replacing the file will replace key '{key_path}' with '{self.key}'"
+                 f" and delete '{key_path}' upon `save()`"
+             )
+     else:
+         old_storage = auto_storage_key_from_artifact(self)
+         is_dir = self.n_objects is not None
+         new_storage = auto_storage_key_from_artifact_uid(
+             self.uid, kwargs["suffix"], is_dir
+         )
+         if old_storage != new_storage:
+             self._clear_storagekey = old_storage
+             if self.key is not None:
+                 new_key_path = PurePosixPath(self.key).with_suffix(kwargs["suffix"])
+                 self.key = str(new_key_path)
+
+     self.suffix = kwargs["suffix"]
+     self.size = kwargs["size"]
+     self.hash = kwargs["hash"]
+     self._hash_type = kwargs["_hash_type"]
+     self.run_id = kwargs["run_id"]
+     self.run = kwargs["run"]
+
+     self._local_filepath = privates["local_filepath"]
+     self._cloud_filepath = privates["cloud_filepath"]
+     self._memory_rep = privates["memory_rep"]
+     # no need to upload if new file is already in storage
+     self._to_store = not check_path_in_storage
+
+
+ # docstring handled through attach_func_to_class_method
+ def open(
+     self, mode: str = "r", is_run_input: bool | None = None
+ ) -> AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment:
+     # ignore empty suffix for now
+     suffixes = (".h5", ".hdf5", ".h5ad", ".zarr", ".tiledbsoma", "")
+     if self.suffix not in suffixes:
+         raise ValueError(
+             "Artifact should have a zarr, h5 or tiledbsoma object as the underlying data, please"
+             " use one of the following suffixes for the object name:"
+             f" {', '.join(suffixes[:-1])}."
+         )
+     if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
+         raise ValueError("Only a tiledbsoma store can be opened with `mode!='r'`.")
+
+     from lamindb.core.storage._backed_access import _track_writes_factory, backed_access
+
+     using_key = settings._using_key
+     filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
+     is_tiledbsoma_w = (
+         filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
+     ) and mode == "w"
+     # consider the case where an object is already locally cached
+     localpath = setup_settings.instance.storage.cloud_to_local_no_update(
+         filepath, cache_key=cache_key
+     )
+     if not is_tiledbsoma_w and localpath.exists():
+         access = backed_access(localpath, mode, using_key)
+     else:
+         access = backed_access(filepath, mode, using_key)
+         if is_tiledbsoma_w:
+
+             def finalize():
+                 nonlocal self, filepath, localpath
+                 if not isinstance(filepath, LocalPathClasses):
+                     _, hash, _, _ = get_stat_dir_cloud(filepath)
+                 else:
+                     # this can be very slow
+                     _, hash, _, _ = hash_dir(filepath)
+                 if self.hash != hash:
+                     from ._record import init_self_from_db
+
+                     new_version = Artifact(
+                         filepath, revises=self, _is_internal_call=True
+                     ).save()
+                     init_self_from_db(self, new_version)
+
+                     if localpath != filepath and localpath.exists():
+                         shutil.rmtree(localpath)
+
+             access = _track_writes_factory(access, finalize)
+     # only call if open is successful
+     _track_run_input(self, is_run_input)
+     return access
+
+
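On the read path, a typical backed-access session looks roughly as follows (a sketch; for `.h5ad` the returned object is an `AnnDataAccessor` that supports lazy slicing):

artifact = ln.Artifact.filter(suffix=".h5ad").first()
backed = artifact.open()  # mode="r", no full download
subset = backed[backed.obs["cell_type"] == "T cell"]
adata = subset.to_memory()  # materialize only the slice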
+ # can't really just call .cache in .load because of double tracking
+ def _synchronize_cleanup_on_error(
+     filepath: UPath, cache_key: str | None = None
+ ) -> UPath:
+     try:
+         cache_path = setup_settings.instance.storage.cloud_to_local(
+             filepath, cache_key=cache_key, print_progress=True
+         )
+     except Exception as e:
+         if not isinstance(filepath, LocalPathClasses):
+             cache_path = setup_settings.instance.storage.cloud_to_local_no_update(
+                 filepath, cache_key=cache_key
+             )
+             if cache_path.is_file():
+                 cache_path.unlink(missing_ok=True)
+             elif cache_path.is_dir():
+                 shutil.rmtree(cache_path)
+         raise e
+     return cache_path
+
+
+ # docstring handled through attach_func_to_class_method
+ def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
+     if hasattr(self, "_memory_rep") and self._memory_rep is not None:
+         access_memory = self._memory_rep
+     else:
+         filepath, cache_key = filepath_cache_key_from_artifact(
+             self, using_key=settings._using_key
+         )
+         cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
+         # cache_path is local so doesn't trigger any sync in load_to_memory
+         access_memory = load_to_memory(cache_path, **kwargs)
+     # only call if load is successful
+     _track_run_input(self, is_run_input)
+     return access_memory
+
+
+ # docstring handled through attach_func_to_class_method
+ def cache(self, is_run_input: bool | None = None) -> Path:
+     filepath, cache_key = filepath_cache_key_from_artifact(
+         self, using_key=settings._using_key
+     )
+     cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
+     # only call if sync is successful
+     _track_run_input(self, is_run_input)
+     return cache_path
+
+
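In practice the two entry points differ only in whether the synced file is parsed; both route through `_synchronize_cleanup_on_error` so a failed sync does not leave a corrupt cache entry:

adata = artifact.load()        # in-memory object, e.g. AnnData or DataFrame
local_path = artifact.cache()  # Path to the synced local copy, nothing parsed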
+ # docstring handled through attach_func_to_class_method
+ def delete(
+     self,
+     permanent: bool | None = None,
+     storage: bool | None = None,
+     using_key: str | None = None,
+ ) -> None:
+     # this first check means an invalid delete fails fast rather than cascading through
+     # database and storage permission errors
+     if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
+         isettings = setup_settings.instance
+         if self.storage.instance_uid != isettings.uid and (storage or storage is None):
+             raise IntegrityError(
+                 "Cannot simply delete artifacts outside of this instance's managed storage locations."
+                 "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
+                 f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
+                 f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
+             )
+     # by default, we only move artifacts into the trash (visibility = -1)
+     trash_visibility = VisibilityChoice.trash.value
+     if self.visibility > trash_visibility and not permanent:
+         if storage is not None:
+             logger.warning("moving artifact to trash, storage arg is ignored")
+         # move to trash
+         self.visibility = trash_visibility
+         self.save()
+         logger.important(f"moved artifact to trash (visibility = {trash_visibility})")
+         return
+
+     # if the artifact is already in the trash
+     # permanent delete skips the trash
+     if permanent is None:
+         # ask for confirmation of permanent delete
+         response = input(
+             "Artifact record is already in trash! Are you sure you want to permanently"
+             " delete it? (y/n) You can't undo this action."
+         )
+         delete_record = response == "y"
+     else:
+         assert permanent  # noqa: S101
+         delete_record = True
+
+     if delete_record:
+         # need to grab file path before deletion
+         try:
+             path, _ = filepath_from_artifact(self, using_key)
+         except OSError:
+             # we can still delete the record
+             logger.warning("Could not get path")
+             storage = False
+         # only delete in storage if DB delete is successful
+         # DB delete might error because of a foreign key constraint violated etc.
+         self._delete_skip_storage()
+         if self.key is None or self._key_is_virtual:
+             # do not ask for confirmation also if storage is None
+             delete_in_storage = storage is None or storage
+         else:
+             # for artifacts with non-virtual semantic storage keys (key is not None)
+             # ask for extra-confirmation
+             if storage is None:
+                 response = input(
+                     f"Are you sure you want to delete {path}? (y/n) You can't undo"
+                     " this action."
+                 )
+                 delete_in_storage = response == "y"
+             else:
+                 delete_in_storage = storage
+             if not delete_in_storage:
+                 logger.important(f"a file/folder remains here: {path}")
+         # we don't yet have logic to bring back the deleted metadata record
+         # in case storage deletion fails - this is important for ACID down the road
+         if delete_in_storage:
+             delete_msg = delete_storage(path, raise_file_not_found_error=False)
+             if delete_msg != "did-not-delete":
+                 logger.success(f"deleted {colors.yellow(f'{path}')}")
+
+
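From the user's side, the two-stage deletion reads as:

artifact.delete()                               # stage 1: move to trash (visibility = -1)
artifact.restore()                              # bring it back
artifact.delete(permanent=True)                 # stage 2: delete record and storage, no prompt
artifact.delete(permanent=True, storage=False)  # drop the record but keep the file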
+ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
+     super(Artifact, artifact).delete(*args, **kwargs)
+
+
+ # docstring handled through attach_func_to_class_method
+ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
+     state_was_adding = self._state.adding
+     print_progress = kwargs.pop("print_progress", True)
+     access_token = kwargs.pop("access_token", None)
+     local_path = None
+     if upload and setup_settings.instance.keep_artifacts_local:
+         # switch local storage location to cloud
+         local_path = self.path
+         self.storage_id = setup_settings.instance.storage.id
+         self._local_filepath = local_path
+         # switch to virtual storage key upon upload
+         # the local filepath is already cached at that point
+         self._key_is_virtual = True
+         # ensure that the artifact is uploaded
+         self._to_store = True
+
+     self._save_skip_storage(**kwargs)
+
+     from lamindb._save import check_and_attempt_clearing, check_and_attempt_upload
+
+     using_key = None
+     if "using" in kwargs:
+         using_key = kwargs["using"]
+     exception = check_and_attempt_upload(
+         self, using_key, access_token=access_token, print_progress=print_progress
+     )
+     if exception is not None:
+         self._delete_skip_storage()
+         raise RuntimeError(exception)
+     exception = check_and_attempt_clearing(self, using_key)
+     if exception is not None:
+         raise RuntimeError(exception)
+     if local_path is not None and not state_was_adding:
+         # only move the local artifact to cache if it was not newly created
+         local_path_cache = ln_setup.settings.storage.cache_dir / local_path.name
+         # don't use Path.rename here because of cross-device link error
+         # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
+         shutil.move(
+             local_path,  # type: ignore
+             local_path_cache,
+         )
+         logger.important(f"moved local artifact to cache: {local_path_cache}")
+     return self
+
+
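For instances configured with `keep_artifacts_local`, the `upload` flag drives the storage switch at the top of save(); a hedged sketch:

artifact = ln.Artifact("./qc_report.html", key="reports/qc_report.html").save()
# later: promote the locally kept artifact to the cloud storage location
artifact.save(upload=True)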
+ def _save_skip_storage(file, **kwargs) -> None:
+     save_feature_sets(file)
+     super(Artifact, file).save(**kwargs)
+     save_feature_set_links(file)
+
+
+ @property  # type: ignore
+ @doc_args(Artifact.path.__doc__)
+ def path(self) -> Path | UPath:
+     """{}"""  # noqa: D415
+     # return only the path, without StorageSettings
+     filepath, _ = filepath_from_artifact(self, using_key=settings._using_key)
+     return filepath
+
+
+
1153
+ # get cache path without triggering sync
1154
+ @property # type: ignore
1155
+ def _cache_path(self) -> UPath:
1156
+ filepath, cache_key = filepath_cache_key_from_artifact(
1157
+ self, using_key=settings._using_key
1158
+ )
1159
+ if isinstance(filepath, LocalPathClasses):
1160
+ return filepath
1161
+ return setup_settings.instance.storage.cloud_to_local_no_update(
1162
+ filepath, cache_key=cache_key
1163
+ )
1164
+
1165
+
1166
+ # docstring handled through attach_func_to_class_method
+ def restore(self) -> None:
+     self.visibility = VisibilityChoice.default.value
+     self.save()
+
+
+ METHOD_NAMES = [
+     "__init__",
+     "from_anndata",
+     "from_df",
+     "from_mudata",
+     "open",
+     "cache",
+     "load",
+     "delete",
+     "save",
+     "replace",
+     "from_dir",
+     "restore",
+ ]
+
+ if ln_setup._TESTING:
+     from inspect import signature
+
+     SIGS = {
+         name: signature(getattr(Artifact, name))
+         for name in METHOD_NAMES
+         if name != "__init__"
+     }
+
+ for name in METHOD_NAMES:
+     attach_func_to_class_method(name, Artifact, globals())
+
+ # privates currently dealt with separately
+ Artifact._delete_skip_storage = _delete_skip_storage
+ Artifact._save_skip_storage = _save_skip_storage
+ Artifact._cache_path = _cache_path
+ Artifact.path = path
+ Artifact.describe = describe
+ Artifact.view_lineage = view_lineage