lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- lamindb/__init__.py +31 -26
- lamindb/_finish.py +9 -1
- lamindb/_tracked.py +26 -3
- lamindb/_view.py +2 -3
- lamindb/base/__init__.py +1 -1
- lamindb/base/ids.py +1 -10
- lamindb/base/users.py +1 -4
- lamindb/core/__init__.py +7 -65
- lamindb/core/_context.py +41 -10
- lamindb/core/_mapped_collection.py +4 -2
- lamindb/core/_settings.py +6 -6
- lamindb/core/_sync_git.py +1 -1
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/_small.py +3 -3
- lamindb/core/loaders.py +22 -9
- lamindb/core/storage/_anndata_accessor.py +8 -3
- lamindb/core/storage/_backed_access.py +14 -7
- lamindb/core/storage/_pyarrow_dataset.py +24 -9
- lamindb/core/storage/_tiledbsoma.py +6 -4
- lamindb/core/storage/_zarr.py +32 -11
- lamindb/core/storage/objects.py +59 -26
- lamindb/core/storage/paths.py +16 -13
- lamindb/curators/__init__.py +173 -145
- lamindb/errors.py +1 -1
- lamindb/integrations/_vitessce.py +4 -4
- lamindb/migrations/0089_subsequent_runs.py +159 -0
- lamindb/migrations/0090_runproject_project_runs.py +73 -0
- lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
- lamindb/models/__init__.py +79 -0
- lamindb/{core → models}/_describe.py +3 -3
- lamindb/{core → models}/_django.py +8 -5
- lamindb/{core → models}/_feature_manager.py +103 -87
- lamindb/{_from_values.py → models/_from_values.py} +5 -2
- lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
- lamindb/{core → models}/_label_manager.py +10 -17
- lamindb/{core/relations.py → models/_relations.py} +8 -1
- lamindb/models/artifact.py +2601 -0
- lamindb/{_can_curate.py → models/can_curate.py} +349 -180
- lamindb/models/collection.py +683 -0
- lamindb/models/core.py +135 -0
- lamindb/models/feature.py +643 -0
- lamindb/models/flextable.py +163 -0
- lamindb/{_parents.py → models/has_parents.py} +55 -49
- lamindb/models/project.py +384 -0
- lamindb/{_query_manager.py → models/query_manager.py} +10 -8
- lamindb/{_query_set.py → models/query_set.py} +52 -30
- lamindb/models/record.py +1757 -0
- lamindb/models/run.py +563 -0
- lamindb/{_save.py → models/save.py} +18 -8
- lamindb/models/schema.py +732 -0
- lamindb/models/transform.py +360 -0
- lamindb/models/ulabel.py +249 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
- lamindb-1.2a2.dist-info/RECORD +94 -0
- lamindb/_artifact.py +0 -1361
- lamindb/_collection.py +0 -440
- lamindb/_feature.py +0 -316
- lamindb/_is_versioned.py +0 -40
- lamindb/_record.py +0 -1065
- lamindb/_run.py +0 -60
- lamindb/_schema.py +0 -347
- lamindb/_storage.py +0 -15
- lamindb/_transform.py +0 -170
- lamindb/_ulabel.py +0 -56
- lamindb/_utils.py +0 -9
- lamindb/base/validation.py +0 -63
- lamindb/core/_data.py +0 -491
- lamindb/core/fields.py +0 -12
- lamindb/models.py +0 -4435
- lamindb-1.1.0.dist-info/RECORD +0 -95
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
lamindb/_artifact.py
DELETED
@@ -1,1361 +0,0 @@
-from __future__ import annotations
-
-import os
-import shutil
-from pathlib import Path, PurePath, PurePosixPath
-from typing import TYPE_CHECKING, Any
-
-import fsspec
-import lamindb_setup as ln_setup
-import pandas as pd
-from anndata import AnnData
-from django.db.models import Q
-from lamin_utils import colors, logger
-from lamindb_setup import settings as setup_settings
-from lamindb_setup._init_instance import register_storage_in_instance
-from lamindb_setup.core._docs import doc_args
-from lamindb_setup.core._settings_storage import init_storage
-from lamindb_setup.core.hashing import hash_dir, hash_file
-from lamindb_setup.core.upath import (
-    create_path,
-    extract_suffix_from_path,
-    get_stat_dir_cloud,
-    get_stat_file_cloud,
-)
-
-from lamindb._record import _get_record_kwargs
-from lamindb.errors import FieldValidationError
-from lamindb.models import Artifact, FeatureManager, ParamManager, Run, Storage
-
-from ._parents import view_lineage
-from ._utils import attach_func_to_class_method
-from .core._data import (
-    _track_run_input,
-    describe,
-    get_run,
-    save_schema_links,
-    save_staged_feature_sets,
-)
-from .core._settings import settings
-from .core.loaders import load_to_memory
-from .core.storage import (
-    LocalPathClasses,
-    UPath,
-    delete_storage,
-    infer_suffix,
-    write_to_disk,
-)
-from .core.storage._anndata_accessor import _anndata_n_observations
-from .core.storage._pyarrow_dataset import PYARROW_SUFFIXES
-from .core.storage._tiledbsoma import _soma_n_observations
-from .core.storage.objects import _mudata_is_installed
-from .core.storage.paths import (
-    AUTO_KEY_PREFIX,
-    auto_storage_key_from_artifact,
-    auto_storage_key_from_artifact_uid,
-    check_path_is_child_of_root,
-    filepath_cache_key_from_artifact,
-    filepath_from_artifact,
-)
-from .core.versioning import (
-    create_uid,
-    message_update_key_in_version_family,
-)
-from .errors import IntegrityError, InvalidArgument
-
-try:
-    from .core.storage._zarr import zarr_is_adata
-except ImportError:
-
-    def zarr_is_adata(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr<=2.18.4")
-
-
-if TYPE_CHECKING:
-    from lamindb_setup.core.types import UPathStr
-    from mudata import MuData
-    from pyarrow.dataset import Dataset as PyArrowDataset
-    from tiledbsoma import Collection as SOMACollection
-    from tiledbsoma import Experiment as SOMAExperiment
-    from tiledbsoma import Measurement as SOMAMeasurement
-
-    from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
-
-
-def process_pathlike(
-    filepath: UPath,
-    default_storage: Storage,
-    using_key: str | None,
-    skip_existence_check: bool = False,
-) -> tuple[Storage, bool]:
-    """Determines the appropriate storage for a given path and whether to use an existing storage key."""
-    if not skip_existence_check:
-        try:  # check if file exists
-            if not filepath.exists():
-                raise FileNotFoundError(filepath)
-        except PermissionError:
-            pass
-    if check_path_is_child_of_root(filepath, default_storage.root):
-        use_existing_storage_key = True
-        return default_storage, use_existing_storage_key
-    else:
-        # check whether the path is part of one of the existing
-        # already-registered storage locations
-        result = False
-        # within the hub, we don't want to perform check_path_in_existing_storage
-        if using_key is None:
-            result = check_path_in_existing_storage(filepath, using_key)
-        if isinstance(result, Storage):
-            use_existing_storage_key = True
-            return result, use_existing_storage_key
-        else:
-            # if the path is in the cloud, we have a good candidate
-            # for the storage root: the bucket
-            if not isinstance(filepath, LocalPathClasses):
-                # for a cloud path, new_root is always the bucket name
-                if filepath.protocol == "hf":
-                    hf_path = filepath.fs.resolve_path(filepath.as_posix())
-                    hf_path.path_in_repo = ""
-                    new_root = "hf://" + hf_path.unresolve()
-                else:
-                    if filepath.protocol == "s3":
-                        # check that endpoint_url didn't propagate here
-                        # as a part of the path string
-                        assert "?" not in filepath.path  # noqa: S101
-                    new_root = list(filepath.parents)[-1]
-                # do not register remote storage locations on hub if the current instance
-                # is not managed on the hub
-                storage_settings, _ = init_storage(
-                    new_root, prevent_register_hub=not setup_settings.instance.is_on_hub
-                )
-                storage_record = register_storage_in_instance(storage_settings)
-                use_existing_storage_key = True
-                return storage_record, use_existing_storage_key
-            # if the filepath is local
-            else:
-                use_existing_storage_key = False
-                # if the default storage is local we'll throw an error if the user
-                # doesn't provide a key
-                if default_storage.type == "local":
-                    return default_storage, use_existing_storage_key
-                # if the default storage is in the cloud (the file is going to
-                # be uploaded upon saving it), we treat the filepath as a cache
-                else:
-                    return default_storage, use_existing_storage_key
-
-
-def process_data(
-    provisional_uid: str,
-    data: UPathStr | pd.DataFrame | AnnData,
-    format: str | None,
-    key: str | None,
-    default_storage: Storage,
-    using_key: str | None,
-    skip_existence_check: bool = False,
-) -> tuple[Any, Path | UPath, str, Storage, bool]:
-    """Serialize a data object that's provided as file or in memory."""
-    # if not overwritten, data gets stored in default storage
-    if _mudata_is_installed():
-        from mudata import MuData
-
-        data_types = (pd.DataFrame, AnnData, MuData)
-    else:
-        data_types = (pd.DataFrame, AnnData)  # type:ignore
-
-    if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
-        access_token = (
-            default_storage._access_token
-            if hasattr(default_storage, "_access_token")
-            else None
-        )
-        path = create_path(data, access_token=access_token).resolve()
-        storage, use_existing_storage_key = process_pathlike(
-            path,
-            default_storage=default_storage,
-            using_key=using_key,
-            skip_existence_check=skip_existence_check,
-        )
-        suffix = extract_suffix_from_path(path)
-        memory_rep = None
-    elif isinstance(data, data_types):
-        storage = default_storage
-        memory_rep = data
-        if key is not None:
-            key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
-            # use suffix as the (adata) format if the format is not provided
-            if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
-                format = key_suffix[1:]
-        else:
-            key_suffix = None
-        suffix = infer_suffix(data, format)
-        if key_suffix is not None and key_suffix != suffix:
-            raise InvalidArgument(
-                f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
-                f" be '{suffix}'."
-            )
-        cache_name = f"{provisional_uid}{suffix}"
-        path = settings.cache_dir / cache_name
-        # Alex: I don't understand the line below
-        if path.suffixes == []:
-            path = path.with_suffix(suffix)
-        write_to_disk(data, path)
-        use_existing_storage_key = False
-    else:
-        raise NotImplementedError(
-            f"Do not know how to create a artifact object from {data}, pass a path instead!"
-        )
-    return memory_rep, path, suffix, storage, use_existing_storage_key
-
-
-def get_stat_or_artifact(
-    path: UPath,
-    key: str | None = None,
-    check_hash: bool = True,
-    is_replace: bool = False,
-    instance: str | None = None,
-) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
-    """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
-    n_files = None
-    if settings.creation.artifact_skip_size_hash:
-        return None, None, None, n_files, None
-    stat = path.stat()  # one network request
-    if not isinstance(path, LocalPathClasses):
-        size, hash, hash_type = None, None, None
-        if stat is not None:
-            # convert UPathStatResult to fsspec info dict
-            stat = stat.as_info()
-            if (store_type := stat["type"]) == "file":
-                size, hash, hash_type = get_stat_file_cloud(stat)
-            elif store_type == "directory":
-                size, hash, hash_type, n_files = get_stat_dir_cloud(path)
-        if hash is None:
-            logger.warning(f"did not add hash for {path}")
-            return size, hash, hash_type, n_files, None
-    else:
-        if path.is_dir():
-            size, hash, hash_type, n_files = hash_dir(path)
-        else:
-            hash, hash_type = hash_file(path)
-            size = stat.st_size
-    if not check_hash:
-        return size, hash, hash_type, n_files, None
-    previous_artifact_version = None
-    if key is None or is_replace:
-        result = Artifact.objects.using(instance).filter(hash=hash).all()
-        artifact_with_same_hash_exists = len(result) > 0
-    else:
-        storage_id = settings.storage.id
-        result = (
-            Artifact.objects.using(instance)
-            .filter(Q(hash=hash) | Q(key=key, storage_id=storage_id))
-            .order_by("-created_at")
-            .all()
-        )
-        artifact_with_same_hash_exists = result.filter(hash=hash).count() > 0
-        if not artifact_with_same_hash_exists and len(result) > 0:
-            logger.important(
-                f"creating new artifact version for key='{key}' (storage: '{settings.storage.root_as_str}')"
-            )
-            previous_artifact_version = result[0]
-    if artifact_with_same_hash_exists:
-        message = "found artifact with same hash"
-        if result[0]._branch_code == -1:
-            result[0].restore()
-            message = "restored artifact with same hash from trash"
-        logger.important(
-            f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
-        )
-        return result[0]
-    else:
-        return size, hash, hash_type, n_files, previous_artifact_version
-
-
-def check_path_in_existing_storage(
-    path: Path | UPath, using_key: str | None = None
-) -> Storage | bool:
-    for storage in Storage.objects.using(using_key).filter().all():
-        # if path is part of storage, return it
-        if check_path_is_child_of_root(path, root=storage.root):
-            return storage
-    return False
-
-
-def get_relative_path_to_directory(
-    path: PurePath | Path | UPath, directory: PurePath | Path | UPath
-) -> PurePath | Path:
-    if isinstance(directory, UPath) and not isinstance(directory, LocalPathClasses):
-        # UPath.relative_to() is not behaving as it should (2023-04-07)
-        # need to lstrip otherwise inconsistent behavior across trailing slashes
-        # see test_artifact.py: test_get_relative_path_to_directory
-        relpath = PurePath(
-            path.as_posix().replace(directory.as_posix(), "").lstrip("/")
-        )
-    elif isinstance(directory, Path):
-        relpath = path.resolve().relative_to(directory.resolve())  # type: ignore
-    elif isinstance(directory, PurePath):
-        relpath = path.relative_to(directory)
-    else:
-        raise TypeError("Directory not of type Path or UPath")
-    return relpath
-
-
-def get_artifact_kwargs_from_data(
-    *,
-    data: Path | UPath | str | pd.DataFrame | AnnData | MuData,
-    key: str | None,
-    run: Run | None,
-    format: str | None,
-    provisional_uid: str,
-    version: str | None,
-    default_storage: Storage,
-    using_key: str | None = None,
-    is_replace: bool = False,
-    skip_check_exists: bool = False,
-):
-    run = get_run(run)
-    memory_rep, path, suffix, storage, use_existing_storage_key = process_data(
-        provisional_uid,
-        data,
-        format,
-        key,
-        default_storage,
-        using_key,
-        skip_check_exists,
-    )
-    stat_or_artifact = get_stat_or_artifact(
-        path=path,
-        key=key,
-        instance=using_key,
-        is_replace=is_replace,
-    )
-    if isinstance(stat_or_artifact, Artifact):
-        artifact = stat_or_artifact
-        # update the run of the existing artifact
-        if run is not None:
-            # save the information that this artifact was previously produced by
-            # another run
-            # note: same logic exists for _output_collections_with_later_updates
-            if artifact.run is not None and artifact.run != run:
-                artifact.run._output_artifacts_with_later_updates.add(artifact)
-            # update the run of the artifact with the latest run
-            stat_or_artifact.run = run
-        return artifact, None
-    else:
-        size, hash, hash_type, n_files, revises = stat_or_artifact
-
-    if revises is not None:  # update provisional_uid
-        provisional_uid, revises = create_uid(revises=revises, version=version)
-        if settings.cache_dir in path.parents:
-            path = path.rename(path.with_name(f"{provisional_uid}{suffix}"))
-
-    check_path_in_storage = False
-    if use_existing_storage_key:
-        inferred_key = get_relative_path_to_directory(
-            path=path, directory=UPath(storage.root)
-        ).as_posix()
-        if key is None:
-            key = inferred_key
-        else:
-            if not key == inferred_key:
-                raise InvalidArgument(
-                    f"The path '{data}' is already in registered storage"
-                    f" '{storage.root}' with key '{inferred_key}'\nYou passed"
-                    f" conflicting key '{key}': please move the file before"
-                    " registering it."
-                )
-        check_path_in_storage = True
-    else:
-        storage = default_storage
-
-    log_storage_hint(
-        check_path_in_storage=check_path_in_storage,
-        storage=storage,
-        key=key,
-        uid=provisional_uid,
-        suffix=suffix,
-        is_dir=n_files is not None,
-    )
-
-    # do we use a virtual or an actual storage key?
-    key_is_virtual = settings.creation._artifact_use_virtual_keys
-
-    # if the file is already in storage, independent of the default
-    # we use an actual storage key
-    if check_path_in_storage:
-        key_is_virtual = False
-
-    kwargs = {
-        "uid": provisional_uid,
-        "suffix": suffix,
-        "hash": hash,
-        "_hash_type": hash_type,
-        "key": key,
-        "size": size,
-        "storage_id": storage.id,
-        # passing both the id and the object
-        # to make them both available immediately
-        # after object creation
-        "n_files": n_files,
-        "_overwrite_versions": n_files is not None,  # True for folder, False for file
-        "n_observations": None,  # to implement
-        "run_id": run.id if run is not None else None,
-        "run": run,
-        "_key_is_virtual": key_is_virtual,
-        "revises": revises,
-    }
-    if not isinstance(path, LocalPathClasses):
-        local_filepath = None
-        cloud_filepath = path
-    else:
-        local_filepath = path
-        cloud_filepath = None
-    privates = {
-        "local_filepath": local_filepath,
-        "cloud_filepath": cloud_filepath,
-        "memory_rep": memory_rep,
-        "check_path_in_storage": check_path_in_storage,
-    }
-    return kwargs, privates
-
-
-def log_storage_hint(
-    *,
-    check_path_in_storage: bool,
-    storage: Storage | None,
-    key: str | None,
-    uid: str,
-    suffix: str,
-    is_dir: bool,
-) -> None:
-    hint = ""
-    if check_path_in_storage:
-        display_root = storage.root  # type: ignore
-        # check whether path is local
-        if fsspec.utils.get_protocol(storage.root) == "file":  # type: ignore
-            # if it's a local path, check whether it's in the current working directory
-            root_path = Path(storage.root)  # type: ignore
-            if check_path_is_child_of_root(root_path, Path.cwd()):
-                # only display the relative path, not the fully resolved path
-                display_root = root_path.relative_to(Path.cwd())  # type: ignore
-        hint += f"path in storage '{display_root}'"  # type: ignore
-    else:
-        hint += "path content will be copied to default storage upon `save()`"
-    if key is None:
-        storage_key = auto_storage_key_from_artifact_uid(uid, suffix, is_dir)
-        hint += f" with key `None` ('{storage_key}')"
-    else:
-        hint += f" with key '{key}'"
-    logger.hint(hint)
-
-
-def data_is_anndata(data: AnnData | UPathStr) -> bool:
-    if isinstance(data, AnnData):
-        return True
-    if isinstance(data, (str, Path, UPath)):
-        data_path = UPath(data)
-        if data_path.suffix == ".h5ad":
-            return True
-        elif data_path.suffix == ".zarr":
-            # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
-            if ".anndata" in data_path.suffixes:
-                return True
-            # check only for local, expensive for cloud
-            if fsspec.utils.get_protocol(data_path.as_posix()) == "file":
-                return zarr_is_adata(data_path)
-            else:
-                logger.warning("We do not check if cloud zarr is AnnData or not")
-                return False
-    return False
-
-
-def data_is_mudata(data: MuData | UPathStr) -> bool:
-    if _mudata_is_installed():
-        from mudata import MuData
-
-        if isinstance(data, MuData):
-            return True
-    if isinstance(data, (str, Path)):
-        return UPath(data).suffix == ".h5mu"
-    return False
-
-
-def _check_otype_artifact(data: Any, otype: str | None = None):
-    if otype is None:
-        if isinstance(data, pd.DataFrame):
-            logger.warning("data is a DataFrame, please use .from_df()")
-            otype = "DataFrame"
-            return otype
-
-        data_is_path = isinstance(data, (str, Path))
-        if data_is_anndata(data):
-            if not data_is_path:
-                logger.warning("data is an AnnData, please use .from_anndata()")
-            otype = "AnnData"
-        elif data_is_mudata(data):
-            if not data_is_path:
-                logger.warning("data is a MuData, please use .from_mudata()")
-            otype = "MuData"
-        elif not data_is_path:  # UPath is a subclass of Path
-            raise TypeError("data has to be a string, Path, UPath")
-    return otype
-
-
-def __init__(artifact: Artifact, *args, **kwargs):
-    artifact.features = FeatureManager(artifact)  # type: ignore
-    artifact.params = ParamManager(artifact)  # type: ignore
-    # Below checks for the Django-internal call in from_db()
-    # it'd be better if we could avoid this, but not being able to create a Artifact
-    # from data with the default constructor renders the central class of the API
-    # essentially useless
-    # The danger below is not that a user might pass as many args (12 of it), but rather
-    # that at some point the Django API might change; on the other hand, this
-    # condition of for calling the constructor based on kwargs should always
-    # stay robust
-    if len(args) == len(artifact._meta.concrete_fields):
-        super(Artifact, artifact).__init__(*args, **kwargs)
-        return None
-    # now we proceed with the user-facing constructor
-    if len(args) > 1:
-        raise ValueError("Only one non-keyword arg allowed: data")
-
-    data: str | Path = kwargs.pop("data") if len(args) == 0 else args[0]
-    kind: str = kwargs.pop("kind") if "kind" in kwargs else None
-    key: str | None = kwargs.pop("key") if "key" in kwargs else None
-    run: Run | None = kwargs.pop("run") if "run" in kwargs else None
-    description: str | None = (
-        kwargs.pop("description") if "description" in kwargs else None
-    )
-    revises: Artifact | None = kwargs.pop("revises") if "revises" in kwargs else None
-    version: str | None = kwargs.pop("version") if "version" in kwargs else None
-    if "visibility" in kwargs:
-        _branch_code = kwargs.pop("visibility")
-    elif "_branch_code" in kwargs:
-        _branch_code = kwargs.pop("_branch_code")
-    else:
-        _branch_code = 1
-    format = kwargs.pop("format") if "format" in kwargs else None
-    _is_internal_call = kwargs.pop("_is_internal_call", False)
-    skip_check_exists = (
-        kwargs.pop("skip_check_exists") if "skip_check_exists" in kwargs else False
-    )
-    if "default_storage" in kwargs:
-        default_storage = kwargs.pop("default_storage")
-    else:
-        if setup_settings.instance.keep_artifacts_local:
-            default_storage = setup_settings.instance.storage_local.record
-        else:
-            default_storage = setup_settings.instance.storage.record
-    using_key = (
-        kwargs.pop("using_key") if "using_key" in kwargs else settings._using_key
-    )
-    otype = kwargs.pop("otype") if "otype" in kwargs else None
-    otype = _check_otype_artifact(data=data, otype=otype)
-    if "type" in kwargs:
-        logger.warning("`type` will be removed soon, please use `kind`")
-        kind = kwargs.pop("type")
-    if not len(kwargs) == 0:
-        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
-        raise FieldValidationError(
-            f"Only {valid_keywords} can be passed, you passed: {kwargs}"
-        )
-    if revises is not None and key is not None and revises.key != key:
-        note = message_update_key_in_version_family(
-            suid=revises.stem_uid,
-            existing_key=revises.key,
-            new_key=key,
-            registry="Artifact",
-        )
-        raise ValueError(
-            f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
-        )
-    if revises is not None:
-        if not isinstance(revises, Artifact):
-            raise TypeError("`revises` has to be of type `Artifact`")
-        if description is None:
-            description = revises.description
-    if key is not None and AUTO_KEY_PREFIX in key:
-        raise ValueError(
-            f"Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
-        )
-    # below is for internal calls that require defining the storage location
-    # ahead of constructing the Artifact
-    if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
-        if _is_internal_call:
-            is_automanaged_path = True
-            user_provided_key = key
-            key = None
-        else:
-            raise ValueError(
-                f"Do not pass path inside the `{AUTO_KEY_PREFIX}` directory."
-            )
-    else:
-        is_automanaged_path = False
-    provisional_uid, revises = create_uid(revises=revises, version=version)
-    kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
-        data=data,
-        key=key,
-        run=run,
-        format=format,
-        provisional_uid=provisional_uid,
-        version=version,
-        default_storage=default_storage,
-        using_key=using_key,
-        skip_check_exists=skip_check_exists,
-    )
-
-    # an object with the same hash already exists
-    if isinstance(kwargs_or_artifact, Artifact):
-        from ._record import init_self_from_db, update_attributes
-
-        init_self_from_db(artifact, kwargs_or_artifact)
-        # adding "key" here is dangerous because key might be auto-populated
-        attr_to_update = {"description": description}
-        if kwargs_or_artifact._key_is_virtual and kwargs_or_artifact.key is None:
-            attr_to_update["key"] = key
-        elif artifact.key != key and key is not None:
-            logger.warning(
-                f"key {artifact.key} on existing artifact differs from passed key {key}"
-            )
-        update_attributes(artifact, attr_to_update)
-        return None
-    else:
-        kwargs = kwargs_or_artifact
-
-    if revises is None:
-        revises = kwargs_or_artifact.pop("revises")
-
-    if data is not None:
-        artifact._local_filepath = privates["local_filepath"]
-        artifact._cloud_filepath = privates["cloud_filepath"]
-        artifact._memory_rep = privates["memory_rep"]
-        artifact._to_store = not privates["check_path_in_storage"]
-
-    if is_automanaged_path and _is_internal_call:
-        kwargs["_key_is_virtual"] = True
-        assert AUTO_KEY_PREFIX in kwargs["key"]  # noqa: S101
-        uid = kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
-        kwargs["key"] = user_provided_key
-        if revises is not None:
-            assert uid.startswith(revises.stem_uid)  # noqa: S101
-        if len(uid) == 16:
-            if revises is None:
-                uid += "0000"
-            else:
-                uid, revises = create_uid(revises=revises, version=version)
-        kwargs["uid"] = uid
-
-    # only set key now so that we don't do a look-up on it in case revises is passed
-    if revises is not None:
-        kwargs["key"] = revises.key
-
-    kwargs["kind"] = kind
-    kwargs["version"] = version
-    kwargs["description"] = description
-    kwargs["_branch_code"] = _branch_code
-    kwargs["otype"] = otype
-    kwargs["revises"] = revises
-    # this check needs to come down here because key might be populated from an
-    # existing file path during get_artifact_kwargs_from_data()
-    if (
-        kwargs["key"] is None
-        and kwargs["description"] is None
-        and kwargs["run"] is None
-    ):
-        raise ValueError("Pass one of key, run or description as a parameter")
-
-    super(Artifact, artifact).__init__(**kwargs)
-
-
-@classmethod  # type: ignore
-@doc_args(Artifact.from_df.__doc__)
-def from_df(
-    cls,
-    df: pd.DataFrame,
-    *,
-    key: str | None = None,
-    description: str | None = None,
-    run: Run | None = None,
-    revises: Artifact | None = None,
-    **kwargs,
-) -> Artifact:
-    """{}"""  # noqa: D415
-    artifact = Artifact(  # type: ignore
-        data=df,
-        key=key,
-        run=run,
-        description=description,
-        revises=revises,
-        otype="DataFrame",
-        kind="dataset",
-        **kwargs,
-    )
-    return artifact
-
-
-@classmethod  # type: ignore
-@doc_args(Artifact.from_anndata.__doc__)
-def from_anndata(
-    cls,
-    adata: AnnData | UPathStr,
-    *,
-    key: str | None = None,
-    description: str | None = None,
-    run: Run | None = None,
-    revises: Artifact | None = None,
-    **kwargs,
-) -> Artifact:
-    """{}"""  # noqa: D415
-    if not data_is_anndata(adata):
-        raise ValueError("data has to be an AnnData object or a path to AnnData-like")
-    _anndata_n_observations(adata)
-    artifact = Artifact(  # type: ignore
-        data=adata,
-        key=key,
-        run=run,
-        description=description,
-        revises=revises,
-        otype="AnnData",
-        kind="dataset",
-        **kwargs,
-    )
-    # this is done instead of _anndata_n_observations(adata)
-    # because we need a proper path through create_path for cloud paths
-    # for additional upath options etc that create_path adds
-    obj_for_obs: AnnData | UPath
-    if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
-        obj_for_obs = artifact._memory_rep
-    else:
-        # returns ._local_filepath for local files
-        # and the proper path through create_path for cloud paths
-        obj_for_obs = artifact.path
-    artifact.n_observations = _anndata_n_observations(obj_for_obs)
-    return artifact
-
-
-@classmethod  # type: ignore
-@doc_args(Artifact.from_mudata.__doc__)
-def from_mudata(
-    cls,
-    mdata: MuData,
-    *,
-    key: str | None = None,
-    description: str | None = None,
-    run: Run | None = None,
-    revises: Artifact | None = None,
-    **kwargs,
-) -> Artifact:
-    """{}"""  # noqa: D415
-    artifact = Artifact(  # type: ignore
-        data=mdata,
-        key=key,
-        run=run,
-        description=description,
-        revises=revises,
-        otype="MuData",
-        kind="dataset",
-        **kwargs,
-    )
-    artifact.n_observations = mdata.n_obs
-    return artifact
-
-
-@classmethod  # type: ignore
-@doc_args(Artifact.from_tiledbsoma.__doc__)
-def from_tiledbsoma(
-    cls,
-    path: UPathStr,
-    *,
-    key: str | None = None,
-    description: str | None = None,
-    run: Run | None = None,
-    revises: Artifact | None = None,
-    **kwargs,
-) -> Artifact:
-    """{}"""  # noqa: D415
-    if UPath(path).suffix != ".tiledbsoma":
-        raise ValueError(
-            "A tiledbsoma store should have .tiledbsoma suffix to be registered."
-        )
-    artifact = Artifact(  # type: ignore
-        data=path,
-        key=key,
-        run=run,
-        description=description,
-        revises=revises,
-        otype="tiledbsoma",
-        kind="dataset",
-        **kwargs,
-    )
-    artifact.n_observations = _soma_n_observations(artifact.path)
-    return artifact
-
-
-@classmethod  # type: ignore
-@doc_args(Artifact.from_dir.__doc__)
-def from_dir(
-    cls,
-    path: UPathStr,
-    *,
-    key: str | None = None,
-    run: Run | None = None,
-) -> list[Artifact]:
-    """{}"""  # noqa: D415
-    folderpath: UPath = create_path(path)  # returns Path for local
-    default_storage = settings.storage.record
-    using_key = settings._using_key
-    storage, use_existing_storage = process_pathlike(
-        folderpath, default_storage, using_key
-    )
-    folder_key_path: PurePath | Path
-    if key is None:
-        if not use_existing_storage:
-            logger.warning(
-                "folder is outside existing storage location, will copy files from"
-                f" {path} to {storage.root}/{folderpath.name}"
-            )
-            folder_key_path = Path(folderpath.name)
-        else:
-            # maintain the hierachy within an existing storage location
-            folder_key_path = get_relative_path_to_directory(
-                folderpath, UPath(storage.root)
-            )
-    else:
-        folder_key_path = Path(key)
-
-    folder_key = folder_key_path.as_posix()
-    # silence fine-grained logging
-    verbosity = settings.verbosity
-    verbosity_int = settings._verbosity_int
-    if verbosity_int >= 1:
-        settings.verbosity = "warning"
-    artifacts_dict = {}
-    for filepath in folderpath.rglob("*"):
-        if filepath.is_file():
-            relative_path = get_relative_path_to_directory(filepath, folderpath)
-            artifact_key = folder_key + "/" + relative_path.as_posix()
-            # if creating from rglob, we don't need to check for existence
-            artifact = Artifact(
-                filepath, run=run, key=artifact_key, skip_check_exists=True
-            )
-            artifacts_dict[artifact.uid] = artifact
-    settings.verbosity = verbosity
-
-    # run sanity check on hashes
-    hashes = [
-        artifact.hash
-        for artifact in artifacts_dict.values()
-        if artifact.hash is not None
-    ]
-    uids = artifacts_dict.keys()
-    n_unique_hashes = len(set(hashes))
-    if n_unique_hashes == len(hashes):
-        artifacts = list(artifacts_dict.values())
-    else:
-        # consider exact duplicates (same id, same hash)
-        # below can't happen anymore because artifacts is a dict now
-        # if len(set(uids)) == len(set(hashes)):
-        #     logger.warning("dropping duplicate records in list of artifact records")
-        #     artifacts = list(set(uids))
-        # consider false duplicates (different id, same hash)
-        if not len(set(uids)) == n_unique_hashes:
-            seen_hashes = set()
-            non_unique_artifacts = {
-                hash: artifact
-                for hash, artifact in artifacts_dict.items()
-                if artifact.hash in seen_hashes or seen_hashes.add(artifact.hash)  # type: ignore
-            }
-            display_non_unique = "\n ".join(
-                f"{artifact}" for artifact in non_unique_artifacts
-            )
-            logger.warning(
-                "there are multiple artifact uids with the same hashes, dropping"
-                f" {len(non_unique_artifacts)} duplicates out of"
-                f" {len(artifacts_dict)} artifacts:\n {display_non_unique}"
-            )
-            artifacts = [
-                artifact
-                for artifact in artifacts_dict.values()
-                if artifact not in non_unique_artifacts.values()
-            ]
-    logger.success(
-        f"created {len(artifacts)} artifacts from directory using storage"
-        f" {storage.root} and key = {folder_key}/"
-    )
-    return artifacts
-
-
-# docstring handled through attach_func_to_class_method
-def replace(
-    self,
-    data: UPathStr | pd.DataFrame | AnnData | MuData,
-    run: Run | None = None,
-    format: str | None = None,
-) -> None:
-    default_storage = settings.storage.record
-    kwargs, privates = get_artifact_kwargs_from_data(
-        provisional_uid=self.uid,
-        data=data,
-        key=self.key,
-        run=run,
-        format=format,
-        default_storage=default_storage,
-        version=None,
-        is_replace=True,
-    )
-
-    # this artifact already exists
-    if privates is None:
-        return kwargs
-
-    check_path_in_storage = privates["check_path_in_storage"]
-    if check_path_in_storage:
-        err_msg = (
-            "Can only replace with a local path not in any Storage. "
-            f"This data is in {Storage.objects.get(id=kwargs['storage_id'])}."
-        )
-        raise ValueError(err_msg)
-
-    _overwrite_versions = kwargs["_overwrite_versions"]
-    if self._overwrite_versions != _overwrite_versions:
-        err_msg = "It is not allowed to replace "
-        err_msg += "a folder" if self._overwrite_versions else "a file"
-        err_msg += " with " + ("a folder." if _overwrite_versions else "a file.")
-        raise ValueError(err_msg)
-
-    if self.key is not None and not self._key_is_virtual:
-        key_path = PurePosixPath(self.key)
-        new_filename = f"{key_path.stem}{kwargs['suffix']}"
-        # the following will only be true if the suffix changes!
-        if key_path.name != new_filename:
-            self._clear_storagekey = self.key
-            self.key = str(key_path.with_name(new_filename))
-            # update old key with the new one so that checks in record pass
-            self._old_key = self.key
-            logger.warning(
-                f"replacing the file will replace key '{key_path}' with '{self.key}'"
-                f" and delete '{key_path}' upon `save()`"
-            )
-    else:
-        old_storage = auto_storage_key_from_artifact(self)
-        is_dir = self.n_files is not None
-        new_storage = auto_storage_key_from_artifact_uid(
-            self.uid, kwargs["suffix"], is_dir
-        )
-        if old_storage != new_storage:
-            self._clear_storagekey = old_storage
-            if self.key is not None:
-                new_key_path = PurePosixPath(self.key).with_suffix(kwargs["suffix"])
-                self.key = str(new_key_path)
-                # update old key with the new one so that checks in record pass
-                self._old_key = self.key
-
-    self.suffix = kwargs["suffix"]
-    self.size = kwargs["size"]
-    self.hash = kwargs["hash"]
-    self._hash_type = kwargs["_hash_type"]
-    self.run_id = kwargs["run_id"]
-    self.run = kwargs["run"]
-    self.n_files = kwargs["n_files"]
-
-    self._local_filepath = privates["local_filepath"]
-    self._cloud_filepath = privates["cloud_filepath"]
-    self._memory_rep = privates["memory_rep"]
-    # no need to upload if new file is already in storage
-    self._to_store = not check_path_in_storage
-
-
-inconsistent_state_msg = (
-    "Trying to read a folder artifact from an outdated version, "
-    "this can result in an incosistent state.\n"
-    "Read from the latest version: artifact.versions.filter(is_latest=True).one()"
-)
-
-
-# docstring handled through attach_func_to_class_method
-def open(
-    self, mode: str = "r", is_run_input: bool | None = None
-) -> (
-    AnnDataAccessor
-    | BackedAccessor
-    | SOMACollection
-    | SOMAExperiment
-    | SOMAMeasurement
-    | PyArrowDataset
-):
-    if self._overwrite_versions and not self.is_latest:
-        raise ValueError(inconsistent_state_msg)
-    # ignore empty suffix for now
-    suffixes = (
-        "",
-        ".h5",
-        ".hdf5",
-        ".h5ad",
-        ".zarr",
-        ".anndata.zarr",
-        ".tiledbsoma",
-    ) + PYARROW_SUFFIXES
-    if self.suffix not in suffixes:
-        raise ValueError(
-            "Artifact should have a zarr, h5, tiledbsoma object"
-            " or a compatible `pyarrow.dataset.dataset` directory"
-            " as the underlying data, please use one of the following suffixes"
-            f" for the object name: {', '.join(suffixes[1:])}."
-            f" Or no suffix for a folder with {', '.join(PYARROW_SUFFIXES)} files"
-            " (no mixing allowed)."
-        )
-    if self.suffix != ".tiledbsoma" and self.key != "soma" and mode != "r":
-        raise ValueError("Only a tiledbsoma store can be openened with `mode!='r'`.")
-
-    from lamindb.core.storage._backed_access import _track_writes_factory, backed_access
-
-    using_key = settings._using_key
-    filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
-    is_tiledbsoma_w = (
-        filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
-    ) and mode == "w"
-    # consider the case where an object is already locally cached
-    localpath = setup_settings.paths.cloud_to_local_no_update(
-        filepath, cache_key=cache_key
-    )
-    if is_tiledbsoma_w:
-        open_cache = False
-    else:
-        open_cache = not isinstance(
-            filepath, LocalPathClasses
-        ) and not filepath.synchronize(localpath, just_check=True)
-    if open_cache:
-        try:
-            access = backed_access(localpath, mode, using_key)
-        except Exception as e:
-            if isinstance(filepath, LocalPathClasses):
-                raise e
-            logger.warning(
-                f"The cache might be corrupted: {e}. Trying to open directly."
-            )
-            access = backed_access(filepath, mode, using_key)
-            # happens only if backed_access has been successful
-            # delete the corrupted cache
-            if localpath.is_dir():
-                shutil.rmtree(localpath)
-            else:
-                localpath.unlink(missing_ok=True)
-    else:
-        access = backed_access(filepath, mode, using_key)
-        if is_tiledbsoma_w:
-
-            def finalize():
-                nonlocal self, filepath, localpath
-                if not isinstance(filepath, LocalPathClasses):
-                    _, hash, _, _ = get_stat_dir_cloud(filepath)
-                else:
-                    # this can be very slow
-                    _, hash, _, _ = hash_dir(filepath)
-                if self.hash != hash:
-                    from ._record import init_self_from_db
-
-                    new_version = Artifact(
-                        filepath, revises=self, _is_internal_call=True
-                    ).save()
-                    init_self_from_db(self, new_version)
-
-                if localpath != filepath and localpath.exists():
-                    shutil.rmtree(localpath)
-
-            access = _track_writes_factory(access, finalize)
-    # only call if open is successfull
-    _track_run_input(self, is_run_input)
-    return access
-
-
-# can't really just call .cache in .load because of double tracking
-def _synchronize_cleanup_on_error(
-    filepath: UPath, cache_key: str | None = None
-) -> UPath:
-    try:
-        cache_path = setup_settings.paths.cloud_to_local(
-            filepath, cache_key=cache_key, print_progress=True
-        )
-    except Exception as e:
-        if not isinstance(filepath, LocalPathClasses):
-            cache_path = setup_settings.paths.cloud_to_local_no_update(
-                filepath, cache_key=cache_key
-            )
-            if cache_path.is_dir():
-                shutil.rmtree(cache_path)
-            else:
-                cache_path.unlink(missing_ok=True)
-        raise e
-    return cache_path
-
-
-# docstring handled through attach_func_to_class_method
-def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
-    if self._overwrite_versions and not self.is_latest:
-        raise ValueError(inconsistent_state_msg)
-
-    if hasattr(self, "_memory_rep") and self._memory_rep is not None:
-        access_memory = self._memory_rep
-    else:
-        filepath, cache_key = filepath_cache_key_from_artifact(
-            self, using_key=settings._using_key
-        )
-        cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
-        try:
-            # cache_path is local so doesn't trigger any sync in load_to_memory
-            access_memory = load_to_memory(cache_path, **kwargs)
-        except Exception as e:
-            # just raise the exception if the original path is local
-            if isinstance(filepath, LocalPathClasses):
-                raise e
-            logger.warning(
-                f"The cache might be corrupted: {e}. Retrying to synchronize."
-            )
-            # delete the existing cache
-            if cache_path.is_dir():
-                shutil.rmtree(cache_path)
-            else:
-                cache_path.unlink(missing_ok=True)
-            # download again and try to load into memory
-            cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
-            access_memory = load_to_memory(cache_path, **kwargs)
-    # only call if load is successfull
-    _track_run_input(self, is_run_input)
-    return access_memory
-
-
-# docstring handled through attach_func_to_class_method
-def cache(self, is_run_input: bool | None = None) -> Path:
-    if self._overwrite_versions and not self.is_latest:
-        raise ValueError(inconsistent_state_msg)
-
-    filepath, cache_key = filepath_cache_key_from_artifact(
-        self, using_key=settings._using_key
-    )
-    cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
-    # only call if sync is successfull
-    _track_run_input(self, is_run_input)
-    return cache_path
-
-
-# docstring handled through attach_func_to_class_method
-def delete(
-    self,
-    permanent: bool | None = None,
-    storage: bool | None = None,
-    using_key: str | None = None,
-) -> None:
-    # this first check means an invalid delete fails fast rather than cascading through
-    # database and storage permission errors
-    if os.getenv("LAMINDB_MULTI_INSTANCE") is None:
-        isettings = setup_settings.instance
-        if self.storage.instance_uid != isettings.uid and (storage or storage is None):
-            raise IntegrityError(
-                "Cannot simply delete artifacts outside of this instance's managed storage locations."
-                "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
-                f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
-                f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).df()}"
-            )
-    # by default, we only move artifacts into the trash (_branch_code = -1)
-    trash__branch_code = -1
-    if self._branch_code > trash__branch_code and not permanent:
-        if storage is not None:
-            logger.warning("moving artifact to trash, storage arg is ignored")
-        # move to trash
-        self._branch_code = trash__branch_code
-        self.save()
-        logger.important(
-            f"moved artifact to trash (_branch_code = {trash__branch_code})"
-        )
-        return
-
-    # if the artifact is already in the trash
-    # permanent delete skips the trash
-    if permanent is None:
-        # ask for confirmation of permanent delete
-        response = input(
-            "Artifact record is already in trash! Are you sure you want to permanently"
-            " delete it? (y/n) You can't undo this action."
-        )
-        delete_record = response == "y"
-    else:
-        assert permanent  # noqa: S101
-        delete_record = True
-
-    if delete_record:
-        # need to grab file path before deletion
-        try:
-            path, _ = filepath_from_artifact(self, using_key)
-        except OSError:
-            # we can still delete the record
-            logger.warning("Could not get path")
-            storage = False
-        # only delete in storage if DB delete is successful
-        # DB delete might error because of a foreign key constraint violated etc.
-        if self._overwrite_versions and self.is_latest:
-            # includes self
-            for version in self.versions.all():
-                _delete_skip_storage(version)
-        else:
-            self._delete_skip_storage()
-        # by default do not delete storage if deleting only a previous version
-        # and the underlying store is mutable
-        if self._overwrite_versions and not self.is_latest:
-            delete_in_storage = False
-            if storage:
-                logger.warning(
-                    "Storage argument is ignored; can't delete storage on an previous version"
-                )
-        elif self.key is None or self._key_is_virtual:
-            # do not ask for confirmation also if storage is None
-            delete_in_storage = storage is None or storage
-        else:
-            # for artifacts with non-virtual semantic storage keys (key is not None)
-            # ask for extra-confirmation
-            if storage is None:
-                response = input(
-                    f"Are you sure to want to delete {path}? (y/n) You can't undo"
-                    " this action."
-                )
-                delete_in_storage = response == "y"
-            else:
-                delete_in_storage = storage
-            if not delete_in_storage:
-                logger.important(f"a file/folder remains here: {path}")
-        # we don't yet have logic to bring back the deleted metadata record
-        # in case storage deletion fails - this is important for ACID down the road
-        if delete_in_storage:
-            delete_msg = delete_storage(path, raise_file_not_found_error=False)
-            if delete_msg != "did-not-delete":
-                logger.success(f"deleted {colors.yellow(f'{path}')}")
-
-
-def _delete_skip_storage(artifact, *args, **kwargs) -> None:
-    super(Artifact, artifact).delete(*args, **kwargs)
-
-
-# docstring handled through attach_func_to_class_method
-def save(self, upload: bool | None = None, **kwargs) -> Artifact:
-    state_was_adding = self._state.adding
-    print_progress = kwargs.pop("print_progress", True)
-    access_token = kwargs.pop("access_token", None)
-    local_path = None
-    if upload and setup_settings.instance.keep_artifacts_local:
-        # switch local storage location to cloud
-        local_path = self.path
-        self.storage_id = setup_settings.instance.storage.id
-        self._local_filepath = local_path
-        # switch to virtual storage key upon upload
-        # the local filepath is already cached at that point
-        self._key_is_virtual = True
-        # ensure that the artifact is uploaded
-        self._to_store = True
-
-    self._save_skip_storage(**kwargs)
-
-    from lamindb._save import check_and_attempt_clearing, check_and_attempt_upload
-
-    using_key = None
-    if "using" in kwargs:
-        using_key = kwargs["using"]
-    exception_upload = check_and_attempt_upload(
-        self, using_key, access_token=access_token, print_progress=print_progress
-    )
-    if exception_upload is not None:
-        # we do not want to raise file not found on cleanup if upload of a file failed
-        # often it is ACID in the filesystem itself
-        # for example, s3 won't have the failed file, so just skip the delete in this case
-        raise_file_not_found_error = False
-        self._delete_skip_storage()
-    else:
-        # this is the case when it is cleaned on .replace
-        raise_file_not_found_error = True
-    # this is triggered by an exception in check_and_attempt_upload or by replace.
-    exception_clear = check_and_attempt_clearing(
-        self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key
-    )
-    if exception_upload is not None:
-        raise RuntimeError(exception_upload)
-    if exception_clear is not None:
-        raise RuntimeError(exception_clear)
-    # this is only for keep_artifacts_local
-    if local_path is not None and not state_was_adding:
-        # only move the local artifact to cache if it was not newly created
-        local_path_cache = ln_setup.settings.cache_dir / local_path.name
-        # don't use Path.rename here because of cross-device link error
-        # https://laminlabs.slack.com/archives/C04A0RMA0SC/p1710259102686969
-        shutil.move(
-            local_path,  # type: ignore
-            local_path_cache,
-        )
-        logger.important(f"moved local artifact to cache: {local_path_cache}")
-    return self
-
-
-def _save_skip_storage(file, **kwargs) -> None:
-    save_staged_feature_sets(file)
-    super(Artifact, file).save(**kwargs)
-    save_schema_links(file)
-
-
-@property  # type: ignore
-@doc_args(Artifact.path.__doc__)
-def path(self) -> Path | UPath:
-    """{}"""  # noqa: D415
-    # return only the path, without StorageSettings
-    filepath, _ = filepath_from_artifact(self, using_key=settings._using_key)
-    return filepath
-
-
-# get cache path without triggering sync
-@property  # type: ignore
-def _cache_path(self) -> UPath:
-    filepath, cache_key = filepath_cache_key_from_artifact(
-        self, using_key=settings._using_key
-    )
-    if isinstance(filepath, LocalPathClasses):
-        return filepath
-    return setup_settings.paths.cloud_to_local_no_update(filepath, cache_key=cache_key)
-
-
-# docstring handled through attach_func_to_class_method
-def restore(self) -> None:
-    self._branch_code = 1
-    self.save()
-
-
-METHOD_NAMES = [
-    "__init__",
-    "from_anndata",
-    "from_df",
-    "from_mudata",
-    "from_tiledbsoma",
-    "open",
-    "cache",
-    "load",
-    "delete",
-    "save",
-    "replace",
-    "from_dir",
-    "restore",
-]
-
-if ln_setup._TESTING:
-    from inspect import signature
-
-    SIGS = {
-        name: signature(getattr(Artifact, name))
-        for name in METHOD_NAMES
-        if name != "__init__"
-    }
-
-for name in METHOD_NAMES:
-    attach_func_to_class_method(name, Artifact, globals())
-
-# privates currently dealt with separately
-# mypy: ignore-errors
-Artifact._delete_skip_storage = _delete_skip_storage
-Artifact._save_skip_storage = _save_skip_storage
-Artifact._cache_path = _cache_path
-Artifact.path = path
-Artifact.describe = describe
-Artifact.view_lineage = view_lineage