lamindb 1.0.5__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +17 -6
- lamindb/_artifact.py +202 -87
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +86 -52
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +21 -7
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +78 -18
- lamindb/_record.py +170 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +42 -11
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +129 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/users.py +1 -4
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +14 -9
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +53 -43
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +24 -9
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +67 -21
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +21 -15
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +49 -3
- lamindb/core/storage/_backed_access.py +12 -7
- lamindb/core/storage/_pyarrow_dataset.py +40 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +30 -24
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2193 -846
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +475 -168
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/METADATA +9 -7
- lamindb-1.1.1.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.5.dist-info/RECORD +0 -102
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
@@ -1,12 +1,13 @@
 """A data framework for biology.
 
-Tracking notebooks &
+Tracking notebooks, scripts & functions.
 
 .. autosummary::
    :toctree: .
 
    track
    finish
+   tracked
 
 Registries.
 
@@ -20,7 +21,7 @@ Registries.
    User
    Storage
    Feature
-   FeatureSet
+   Schema
    Param
    Collection
    Project
@@ -33,7 +34,6 @@ Key functionality.
    :toctree: .
 
    connect
-   Curator
    view
    save
 
@@ -44,23 +44,33 @@ Modules and settings.
 
    integrations
    context
+   curators
    settings
+   errors
    setup
    UPath
    base
    core
 
+Backward compatibility.
+
+.. autosummary::
+   :toctree: .
+
+   FeatureSet
+   Curator
+
 """
 
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.0.5"
+__version__ = "1.1.1"
 
 from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
 from lamindb_setup._check_setup import _check_instance_setup
 from lamindb_setup._connect_instance import connect
 from lamindb_setup.core.upath import UPath
 
-from . import base, setup
+from . import base, errors, setup
 
 
 def __getattr__(name):
@@ -86,10 +96,11 @@ if _check_instance_setup(from_module="lamindb"):
         integrations,
     )
     from ._save import save
+    from ._tracked import tracked
     from ._view import view
     from .core._context import context
     from .core._settings import settings
-    from .curators import Curator
+    from .curators import CatManager as Curator
     from .models import (
         Artifact,
         Collection,
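Taken together, the `__init__.py` changes above keep `ln.Curator` importable as a backward-compatibility alias of the new `CatManager` class and move exception classes into a top-level `lamindb.errors` module (previously `lamindb.core.exceptions`, per the file summary). A minimal sketch of what that means for user code, assuming a connected instance; everything beyond the names shown in the diff is illustrative:

```python
import lamindb as ln  # the registry imports below only run when an instance is connected

# `Curator` now resolves to the categorical curator class, per
# `from .curators import CatManager as Curator`
print(ln.Curator is ln.curators.CatManager)  # True

# exception classes now live in `lamindb.errors`
from lamindb.errors import FieldValidationError, InvalidArgument  # noqa: F401
```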
lamindb/_artifact.py
CHANGED
@@ -23,6 +23,8 @@ from lamindb_setup.core.upath import (
     get_stat_file_cloud,
 )
 
+from lamindb._record import _get_record_kwargs
+from lamindb.errors import FieldValidationError
 from lamindb.models import Artifact, FeatureManager, ParamManager, Run, Storage
 
 from ._parents import view_lineage
@@ -32,10 +34,9 @@ from .core._data import (
     describe,
     get_run,
     save_schema_links,
-
+    save_staged_feature_sets,
 )
 from .core._settings import settings
-from .core.exceptions import IntegrityError, InvalidArgument
 from .core.loaders import load_to_memory
 from .core.storage import (
     LocalPathClasses,
@@ -44,7 +45,9 @@ from .core.storage import (
     infer_suffix,
     write_to_disk,
 )
+from .core.storage._anndata_accessor import _anndata_n_observations
 from .core.storage._pyarrow_dataset import PYARROW_SUFFIXES
+from .core.storage._tiledbsoma import _soma_n_observations
 from .core.storage.objects import _mudata_is_installed
 from .core.storage.paths import (
     AUTO_KEY_PREFIX,
@@ -58,6 +61,7 @@ from .core.versioning import (
     create_uid,
     message_update_key_in_version_family,
 )
+from .errors import IntegrityError, InvalidArgument
 
 try:
     from .core.storage._zarr import zarr_is_adata
@@ -73,6 +77,7 @@ if TYPE_CHECKING:
     from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement
 
     from lamindb.core.storage._backed_access import AnnDataAccessor, BackedAccessor
 
@@ -83,6 +88,7 @@ def process_pathlike(
     using_key: str | None,
     skip_existence_check: bool = False,
 ) -> tuple[Storage, bool]:
+    """Determines the appropriate storage for a given path and whether to use an existing storage key."""
     if not skip_existence_check:
         try:  # check if file exists
             if not filepath.exists():
@@ -112,6 +118,10 @@ def process_pathlike(
             hf_path.path_in_repo = ""
             new_root = "hf://" + hf_path.unresolve()
         else:
+            if filepath.protocol == "s3":
+                # check that endpoint_url didn't propagate here
+                # as a part of the path string
+                assert "?" not in filepath.path  # noqa: S101
             new_root = list(filepath.parents)[-1]
             # do not register remote storage locations on hub if the current instance
             # is not managed on the hub
@@ -142,6 +152,7 @@ def process_data(
     default_storage: Storage,
     using_key: str | None,
     skip_existence_check: bool = False,
+    is_replace: bool = False,
 ) -> tuple[Any, Path | UPath, str, Storage, bool]:
     """Serialize a data object that's provided as file or in memory."""
     # if not overwritten, data gets stored in default storage
@@ -151,14 +162,24 @@ def process_data(
         data_types = (pd.DataFrame, AnnData, MuData)
     else:
         data_types = (pd.DataFrame, AnnData)  # type:ignore
-
+    if key is not None:
+        key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
+        # use suffix as the (adata) format if the format is not provided
+        if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
+            format = key_suffix[1:]
+    else:
+        key_suffix = None
     if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
         access_token = (
             default_storage._access_token
             if hasattr(default_storage, "_access_token")
             else None
         )
-        path = create_path(data, access_token=access_token)
+        path = create_path(data, access_token=access_token)
+        # we don't resolve http links because they can resolve into a different domain
+        # for example into a temporary url
+        if path.protocol not in {"http", "https"}:
+            path = path.resolve()
         storage, use_existing_storage_key = process_pathlike(
             path,
             default_storage=default_storage,
@@ -170,31 +191,23 @@ def process_data(
     elif isinstance(data, data_types):
         storage = default_storage
         memory_rep = data
-        if key is not None:
-            key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
-            # use suffix as the (adata) format if the format is not provided
-            if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
-                format = key_suffix[1:]
-        else:
-            key_suffix = None
         suffix = infer_suffix(data, format)
-        if key_suffix is not None and key_suffix != suffix:
-            raise InvalidArgument(
-                f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
-                f" be '{suffix}'."
-            )
-        cache_name = f"{provisional_uid}{suffix}"
-        path = settings.cache_dir / cache_name
-        # Alex: I don't understand the line below
-        if path.suffixes == []:
-            path = path.with_suffix(suffix)
-        write_to_disk(data, path)
-        use_existing_storage_key = False
     else:
         raise NotImplementedError(
-            f"Do not know how to create a artifact object from {data}, pass a path"
-            " instead!"
+            f"Do not know how to create a artifact object from {data}, pass a path instead!"
         )
+    if key_suffix is not None and key_suffix != suffix and not is_replace:
+        # consciously omitting a trailing period
+        if isinstance(data, (str, Path, UPath)):
+            message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
+        else:
+            message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
+        raise InvalidArgument(message)
+    # in case we have an in-memory representation, we need to write it to disk
+    if isinstance(data, data_types):
+        path = settings.cache_dir / f"{provisional_uid}{suffix}"
+        write_to_disk(data, path)
+        use_existing_storage_key = False
     return memory_rep, path, suffix, storage, use_existing_storage_key
 
 
@@ -205,6 +218,7 @@ def get_stat_or_artifact(
     is_replace: bool = False,
     instance: str | None = None,
 ) -> tuple[int, str | None, str | None, int | None, Artifact | None] | Artifact:
+    """Retrieves file statistics or an existing artifact based on the path, hash, and key."""
     n_files = None
     if settings.creation.artifact_skip_size_hash:
         return None, None, None, n_files, None
@@ -248,29 +262,14 @@ def get_stat_or_artifact(
         )
         previous_artifact_version = result[0]
     if artifact_with_same_hash_exists:
-
-
-
-
-
-        )
-
-
-            logger.warning(
-                "creating new Artifact object despite existing artifact with same hash:"
-                f" {result[0]}"
-            )
-            return size, hash, hash_type, n_files, None
-        else:
-            if result[0]._branch_code == -1:
-                raise FileExistsError(
-                    f"You're trying to re-create this artifact in trash: {result[0]}"
-                    "Either permanently delete it with `artifact.delete(permanent=True)` or restore it with `artifact.restore()`"
-                )
-            logger.important(
-                f"returning existing artifact with same hash: {result[0]}; if you intended to query to track this artifact as an input, use: ln.Artifact.get()"
-            )
-            return result[0]
+        message = "found artifact with same hash"
+        if result[0]._branch_code == -1:
+            result[0].restore()
+            message = "restored artifact with same hash from trash"
+        logger.important(
+            f"{message}: {result[0]}; to track this artifact as an input, use: ln.Artifact.get()"
+        )
+        return result[0]
     else:
         return size, hash, hash_type, n_files, previous_artifact_version
 
@@ -326,6 +325,7 @@ def get_artifact_kwargs_from_data(
         default_storage,
         using_key,
         skip_check_exists,
+        is_replace=is_replace,
     )
     stat_or_artifact = get_stat_or_artifact(
         path=path,
@@ -441,7 +441,7 @@ def log_storage_hint(
     root_path = Path(storage.root)  # type: ignore
     if check_path_is_child_of_root(root_path, Path.cwd()):
         # only display the relative path, not the fully resolved path
-        display_root = root_path.relative_to(Path.cwd())
+        display_root = root_path.relative_to(Path.cwd())  # type: ignore
         hint += f"path in storage '{display_root}'"  # type: ignore
     else:
         hint += "path content will be copied to default storage upon `save()`"
@@ -458,7 +458,7 @@ def data_is_anndata(data: AnnData | UPathStr) -> bool:
         return True
     if isinstance(data, (str, Path, UPath)):
         data_path = UPath(data)
-        if data_path.
+        if ".h5ad" in data_path.suffixes:  # ".h5ad.gz" is a valid suffix
             return True
         elif data_path.suffix == ".zarr":
             # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
@@ -480,7 +480,7 @@ def data_is_mudata(data: MuData | UPathStr) -> bool:
     if isinstance(data, MuData):
         return True
     if isinstance(data, (str, Path)):
-        return UPath(data).suffix
+        return UPath(data).suffix == ".h5mu"
     return False
 
 
@@ -506,8 +506,8 @@ def _check_otype_artifact(data: Any, otype: str | None = None):
 
 
 def __init__(artifact: Artifact, *args, **kwargs):
-    artifact.features = FeatureManager(artifact)
-    artifact.params = ParamManager(artifact)
+    artifact.features = FeatureManager(artifact)  # type: ignore
+    artifact.params = ParamManager(artifact)  # type: ignore
     # Below checks for the Django-internal call in from_db()
     # it'd be better if we could avoid this, but not being able to create a Artifact
     # from data with the default constructor renders the central class of the API
@@ -559,9 +559,9 @@ def __init__(artifact: Artifact, *args, **kwargs):
         logger.warning("`type` will be removed soon, please use `kind`")
         kind = kwargs.pop("type")
     if not len(kwargs) == 0:
-
-
-            f" can be passed, you passed: {kwargs}"
+        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Artifact)])
+        raise FieldValidationError(
+            f"Only {valid_keywords} can be passed, you passed: {kwargs}"
         )
     if revises is not None and key is not None and revises.key != key:
         note = message_update_key_in_version_family(
@@ -676,6 +676,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
 def from_df(
     cls,
     df: pd.DataFrame,
+    *,
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
@@ -683,7 +684,7 @@ def from_df(
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
-    artifact = Artifact(
+    artifact = Artifact(  # type: ignore
         data=df,
         key=key,
         run=run,
@@ -693,6 +694,7 @@ def from_df(
         kind="dataset",
         **kwargs,
     )
+    artifact.n_observations = len(df)
     return artifact
 
 
|
@@ -701,6 +703,7 @@ def from_df(
|
|
701
703
|
def from_anndata(
|
702
704
|
cls,
|
703
705
|
adata: AnnData | UPathStr,
|
706
|
+
*,
|
704
707
|
key: str | None = None,
|
705
708
|
description: str | None = None,
|
706
709
|
run: Run | None = None,
|
@@ -710,7 +713,8 @@ def from_anndata(
     """{}"""  # noqa: D415
     if not data_is_anndata(adata):
         raise ValueError("data has to be an AnnData object or a path to AnnData-like")
-    artifact = Artifact(
+    _anndata_n_observations(adata)
+    artifact = Artifact(  # type: ignore
         data=adata,
         key=key,
         run=run,
@@ -720,6 +724,17 @@ def from_anndata(
         kind="dataset",
         **kwargs,
     )
+    # this is done instead of _anndata_n_observations(adata)
+    # because we need a proper path through create_path for cloud paths
+    # for additional upath options etc that create_path adds
+    obj_for_obs: AnnData | UPath
+    if hasattr(artifact, "_memory_rep") and artifact._memory_rep is not None:
+        obj_for_obs = artifact._memory_rep
+    else:
+        # returns ._local_filepath for local files
+        # and the proper path through create_path for cloud paths
+        obj_for_obs = artifact.path
+    artifact.n_observations = _anndata_n_observations(obj_for_obs)
     return artifact
 
 
@@ -728,6 +743,7 @@ def from_anndata(
 def from_mudata(
     cls,
     mdata: MuData,
+    *,
     key: str | None = None,
     description: str | None = None,
     run: Run | None = None,
@@ -735,7 +751,7 @@ def from_mudata(
     **kwargs,
 ) -> Artifact:
     """{}"""  # noqa: D415
-    artifact = Artifact(
+    artifact = Artifact(  # type: ignore
         data=mdata,
         key=key,
         run=run,
@@ -745,6 +761,38 @@ def from_mudata(
         kind="dataset",
         **kwargs,
     )
+    artifact.n_observations = mdata.n_obs
+    return artifact
+
+
+@classmethod  # type: ignore
+@doc_args(Artifact.from_tiledbsoma.__doc__)
+def from_tiledbsoma(
+    cls,
+    path: UPathStr,
+    *,
+    key: str | None = None,
+    description: str | None = None,
+    run: Run | None = None,
+    revises: Artifact | None = None,
+    **kwargs,
+) -> Artifact:
+    """{}"""  # noqa: D415
+    if UPath(path).suffix != ".tiledbsoma":
+        raise ValueError(
+            "A tiledbsoma store should have .tiledbsoma suffix to be registered."
+        )
+    artifact = Artifact(  # type: ignore
+        data=path,
+        key=key,
+        run=run,
+        description=description,
+        revises=revises,
+        otype="tiledbsoma",
+        kind="dataset",
+        **kwargs,
+    )
+    artifact.n_observations = _soma_n_observations(artifact.path)
     return artifact
 
 
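The hunk above introduces a dedicated constructor for tiledbsoma stores. A minimal usage sketch, assuming a connected instance and the tiledbsoma extra installed; the store path and key are made-up examples:

```python
import lamindb as ln

# the path must end in ".tiledbsoma", otherwise the constructor raises ValueError
artifact = ln.Artifact.from_tiledbsoma(
    "s3://my-bucket/scrna.tiledbsoma",  # hypothetical store location
    key="soma/scrna.tiledbsoma",        # key, description, run, revises are keyword-only
    description="tiledbsoma store with per-observation counts",
).save()
print(artifact.n_observations)  # populated via _soma_n_observations on the store
```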
@@ -753,8 +801,8 @@ def from_mudata(
 def from_dir(
     cls,
     path: UPathStr,
-    key: str | None = None,
     *,
+    key: str | None = None,
     run: Run | None = None,
 ) -> list[Artifact]:
     """{}"""  # noqa: D415
@@ -931,22 +979,34 @@ inconsistent_state_msg = (
 
 # docstring handled through attach_func_to_class_method
 def open(
-    self, mode: str = "r", is_run_input: bool | None = None
+    self, mode: str = "r", is_run_input: bool | None = None, **kwargs
 ) -> (
-    AnnDataAccessor
+    AnnDataAccessor
+    | BackedAccessor
+    | SOMACollection
+    | SOMAExperiment
+    | SOMAMeasurement
+    | PyArrowDataset
 ):
     if self._overwrite_versions and not self.is_latest:
         raise ValueError(inconsistent_state_msg)
+    # all hdf5 suffixes including gzipped
+    h5_suffixes = [".h5", ".hdf5", ".h5ad"]
+    h5_suffixes += [s + ".gz" for s in h5_suffixes]
     # ignore empty suffix for now
     suffixes = (
-
-
-
-
-
-
-
-
+        (
+            "",
+            ".zarr",
+            ".anndata.zarr",
+            ".tiledbsoma",
+        )
+        + tuple(h5_suffixes)
+        + PYARROW_SUFFIXES
+        + tuple(
+            s + ".gz" for s in PYARROW_SUFFIXES
+        )  # this doesn't work for externally gzipped files, REMOVE LATER
+    )
     if self.suffix not in suffixes:
         raise ValueError(
             "Artifact should have a zarr, h5, tiledbsoma object"
@@ -964,16 +1024,36 @@ def open(
     using_key = settings._using_key
     filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
     is_tiledbsoma_w = (
-        filepath.name == "soma" or
+        filepath.name == "soma" or self.suffix == ".tiledbsoma"
     ) and mode == "w"
     # consider the case where an object is already locally cached
     localpath = setup_settings.paths.cloud_to_local_no_update(
         filepath, cache_key=cache_key
     )
-    if
-
+    if is_tiledbsoma_w:
+        open_cache = False
     else:
-
+        open_cache = not isinstance(
+            filepath, LocalPathClasses
+        ) and not filepath.synchronize(localpath, just_check=True)
+    if open_cache:
+        try:
+            access = backed_access(localpath, mode, using_key, **kwargs)
+        except Exception as e:
+            if isinstance(filepath, LocalPathClasses):
+                raise e
+            logger.warning(
+                f"The cache might be corrupted: {e}. Trying to open directly."
+            )
+            access = backed_access(filepath, mode, using_key, **kwargs)
+            # happens only if backed_access has been successful
+            # delete the corrupted cache
+            if localpath.is_dir():
+                shutil.rmtree(localpath)
+            else:
+                localpath.unlink(missing_ok=True)
+    else:
+        access = backed_access(filepath, mode, using_key, **kwargs)
     if is_tiledbsoma_w:
 
         def finalize():
@@ -1013,10 +1093,10 @@ def _synchronize_cleanup_on_error(
         cache_path = setup_settings.paths.cloud_to_local_no_update(
             filepath, cache_key=cache_key
         )
-        if cache_path.
-            cache_path.unlink(missing_ok=True)
-        elif cache_path.is_dir():
+        if cache_path.is_dir():
             shutil.rmtree(cache_path)
+        else:
+            cache_path.unlink(missing_ok=True)
         raise e
     return cache_path
 
|
         self, using_key=settings._using_key
     )
     cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
-
-
+    try:
+        # cache_path is local so doesn't trigger any sync in load_to_memory
+        access_memory = load_to_memory(cache_path, **kwargs)
+    except Exception as e:
+        # just raise the exception if the original path is local
+        if isinstance(filepath, LocalPathClasses):
+            raise e
+        logger.warning(
+            f"The cache might be corrupted: {e}. Retrying to synchronize."
+        )
+        # delete the existing cache
+        if cache_path.is_dir():
+            shutil.rmtree(cache_path)
+        else:
+            cache_path.unlink(missing_ok=True)
+        # download again and try to load into memory
+        cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
+        access_memory = load_to_memory(cache_path, **kwargs)
     # only call if load is successfull
     _track_run_input(self, is_run_input)
     return access_memory
@@ -1154,6 +1250,7 @@ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
 def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     state_was_adding = self._state.adding
     print_progress = kwargs.pop("print_progress", True)
+    store_kwargs = kwargs.pop("store_kwargs", {})  # kwargs for .upload_from in the end
     access_token = kwargs.pop("access_token", None)
     local_path = None
     if upload and setup_settings.instance.keep_artifacts_local:
@@ -1174,15 +1271,31 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     using_key = None
     if "using" in kwargs:
         using_key = kwargs["using"]
-
-        self,
+    exception_upload = check_and_attempt_upload(
+        self,
+        using_key,
+        access_token=access_token,
+        print_progress=print_progress,
+        **store_kwargs,
     )
-    if
+    if exception_upload is not None:
+        # we do not want to raise file not found on cleanup if upload of a file failed
+        # often it is ACID in the filesystem itself
+        # for example, s3 won't have the failed file, so just skip the delete in this case
+        raise_file_not_found_error = False
         self._delete_skip_storage()
-
-
-
-
+    else:
+        # this is the case when it is cleaned on .replace
+        raise_file_not_found_error = True
+    # this is triggered by an exception in check_and_attempt_upload or by replace.
+    exception_clear = check_and_attempt_clearing(
+        self, raise_file_not_found_error=raise_file_not_found_error, using_key=using_key
+    )
+    if exception_upload is not None:
+        raise RuntimeError(exception_upload)
+    if exception_clear is not None:
+        raise RuntimeError(exception_clear)
+    # this is only for keep_artifacts_local
     if local_path is not None and not state_was_adding:
         # only move the local artifact to cache if it was not newly created
         local_path_cache = ln_setup.settings.cache_dir / local_path.name
@@ -1197,7 +1310,7 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
 
 
 def _save_skip_storage(file, **kwargs) -> None:
-
+    save_staged_feature_sets(file)
     super(Artifact, file).save(**kwargs)
     save_schema_links(file)
 
@@ -1233,6 +1346,7 @@ METHOD_NAMES = [
     "from_anndata",
     "from_df",
     "from_mudata",
+    "from_tiledbsoma",
     "open",
     "cache",
     "load",
@@ -1256,6 +1370,7 @@ for name in METHOD_NAMES:
     attach_func_to_class_method(name, Artifact, globals())
 
 # privates currently dealt with separately
+# mypy: ignore-errors
 Artifact._delete_skip_storage = _delete_skip_storage
 Artifact._save_skip_storage = _save_skip_storage
 Artifact._cache_path = _cache_path