lamindb 1.0.5__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +17 -6
- lamindb/_artifact.py +202 -87
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +86 -52
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +21 -7
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +78 -18
- lamindb/_record.py +170 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +42 -11
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +129 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/users.py +1 -4
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +14 -9
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +53 -43
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +24 -9
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +67 -21
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +21 -15
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +49 -3
- lamindb/core/storage/_backed_access.py +12 -7
- lamindb/core/storage/_pyarrow_dataset.py +40 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +30 -24
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2193 -846
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +475 -168
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/METADATA +9 -7
- lamindb-1.1.1.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.5.dist-info/RECORD +0 -102
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
- {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/curators/__init__.py
CHANGED
@@ -1,21 +1,54 @@
|
|
1
|
+
"""Curators.
|
2
|
+
|
3
|
+
.. versionadded:: 1.1.0
|
4
|
+
|
5
|
+
.. autosummary::
|
6
|
+
:toctree: .
|
7
|
+
|
8
|
+
Curator
|
9
|
+
DataFrameCurator
|
10
|
+
AnnDataCurator
|
11
|
+
|
12
|
+
"""
|
13
|
+
|
1
14
|
from __future__ import annotations
|
2
15
|
|
3
16
|
import copy
|
4
|
-
import
|
17
|
+
import random
|
18
|
+
import re
|
19
|
+
from importlib import resources
|
5
20
|
from itertools import chain
|
6
|
-
from typing import TYPE_CHECKING
|
21
|
+
from typing import TYPE_CHECKING, Any, Literal
|
7
22
|
|
8
23
|
import anndata as ad
|
9
24
|
import lamindb_setup as ln_setup
|
10
25
|
import pandas as pd
|
26
|
+
import pandera
|
11
27
|
import pyarrow as pa
|
12
28
|
from lamin_utils import colors, logger
|
29
|
+
from lamindb_setup.core import deprecated, upath
|
13
30
|
from lamindb_setup.core._docs import doc_args
|
14
31
|
from lamindb_setup.core.upath import UPath
|
15
32
|
|
33
|
+
from lamindb.core.storage._backed_access import backed_access
|
34
|
+
|
35
|
+
from ._cellxgene_schemas import _read_schema_versions
|
36
|
+
|
37
|
+
if TYPE_CHECKING:
|
38
|
+
from anndata import AnnData
|
39
|
+
from lamindb_setup.core.types import UPathStr
|
40
|
+
|
41
|
+
from lamindb.base.types import FieldAttr
|
42
|
+
from lamindb.models import Record
|
43
|
+
from lamindb._feature import parse_dtype, parse_dtype_single_cat
|
16
44
|
from lamindb.base.types import FieldAttr # noqa
|
45
|
+
from lamindb.core._data import add_labels
|
46
|
+
from lamindb.core._feature_manager import parse_staged_feature_sets_from_anndata
|
47
|
+
from lamindb.core._settings import settings
|
17
48
|
from lamindb.models import (
|
18
49
|
Artifact,
|
50
|
+
CanCurate,
|
51
|
+
Collection,
|
19
52
|
Feature,
|
20
53
|
Record,
|
21
54
|
Run,
|
@@ -23,15 +56,25 @@ from lamindb.models import (
|
|
23
56
|
ULabel,
|
24
57
|
)
|
25
58
|
|
59
|
+
from .._artifact import data_is_anndata
|
26
60
|
from .._from_values import _format_values
|
27
|
-
from ..
|
61
|
+
from ..errors import InvalidArgument, ValidationError
|
28
62
|
|
29
63
|
if TYPE_CHECKING:
|
30
|
-
from collections.abc import Iterable
|
64
|
+
from collections.abc import Iterable, MutableMapping
|
31
65
|
from typing import Any
|
32
66
|
|
33
67
|
from lamindb_setup.core.types import UPathStr
|
34
68
|
from mudata import MuData
|
69
|
+
from spatialdata import SpatialData
|
70
|
+
|
71
|
+
from lamindb._query_set import RecordList
|
72
|
+
|
73
|
+
|
74
|
+
def strip_ansi_codes(text):
|
75
|
+
# This pattern matches ANSI escape sequences
|
76
|
+
ansi_pattern = re.compile(r"\x1b\[[0-9;]*m")
|
77
|
+
return ansi_pattern.sub("", text)
|
35
78
|
|
36
79
|
|
37
80
|
class CurateLookup:
|
@@ -40,8 +83,6 @@ class CurateLookup:
|
|
40
83
|
Args:
|
41
84
|
categoricals: A dictionary of categorical fields to lookup.
|
42
85
|
slots: A dictionary of slot fields to lookup.
|
43
|
-
using_key: The key of the instance to lookup from. Defaults to the
|
44
|
-
current instance if not specified.
|
45
86
|
public: Whether to lookup from the public instance. Defaults to False.
|
46
87
|
|
47
88
|
Example:
|
@@ -55,48 +96,43 @@ class CurateLookup:
|
|
55
96
|
self,
|
56
97
|
categoricals: dict[str, FieldAttr],
|
57
98
|
slots: dict[str, FieldAttr] = None,
|
58
|
-
using_key: str | None = None,
|
59
99
|
public: bool = False,
|
60
100
|
) -> None:
|
61
101
|
slots = slots or {}
|
62
|
-
self.
|
63
|
-
self._using_key = None if using_key == "default" else using_key
|
64
|
-
self._using_key_name = self._using_key or ln_setup.settings.instance.slug
|
102
|
+
self._categoricals = {**categoricals, **slots}
|
65
103
|
self._public = public
|
66
|
-
debug_message = f"Lookup objects from {colors.italic(self._using_key_name)}"
|
67
|
-
logger.debug(debug_message)
|
68
104
|
|
69
105
|
def __getattr__(self, name):
|
70
|
-
if name in self.
|
71
|
-
registry = self.
|
106
|
+
if name in self._categoricals:
|
107
|
+
registry = self._categoricals[name].field.model
|
72
108
|
if self._public and hasattr(registry, "public"):
|
73
109
|
return registry.public().lookup()
|
74
110
|
else:
|
75
|
-
return
|
111
|
+
return registry.lookup()
|
76
112
|
raise AttributeError(
|
77
113
|
f'"{self.__class__.__name__}" object has no attribute "{name}"'
|
78
114
|
)
|
79
115
|
|
80
116
|
def __getitem__(self, name):
|
81
|
-
if name in self.
|
82
|
-
registry = self.
|
117
|
+
if name in self._categoricals:
|
118
|
+
registry = self._categoricals[name].field.model
|
83
119
|
if self._public and hasattr(registry, "public"):
|
84
120
|
return registry.public().lookup()
|
85
121
|
else:
|
86
|
-
return
|
122
|
+
return registry.lookup()
|
87
123
|
raise AttributeError(
|
88
124
|
f'"{self.__class__.__name__}" object has no attribute "{name}"'
|
89
125
|
)
|
90
126
|
|
91
127
|
def __repr__(self) -> str:
|
92
|
-
if len(self.
|
128
|
+
if len(self._categoricals) > 0:
|
93
129
|
getattr_keys = "\n ".join(
|
94
|
-
[f".{key}" for key in self.
|
130
|
+
[f".{key}" for key in self._categoricals if key.isidentifier()]
|
95
131
|
)
|
96
132
|
getitem_keys = "\n ".join(
|
97
|
-
[str([key]) for key in self.
|
133
|
+
[str([key]) for key in self._categoricals if not key.isidentifier()]
|
98
134
|
)
|
99
|
-
ref = "public" if self._public else
|
135
|
+
ref = "public" if self._public else "registries"
|
100
136
|
return (
|
101
137
|
f"Lookup objects from the {colors.italic(ref)}:\n "
|
102
138
|
f"{colors.green(getattr_keys)}\n "
|
@@ -105,21 +141,442 @@ class CurateLookup:
|
|
105
141
|
" → categories.alveolar_type_1_fibroblast_cell\n\n"
|
106
142
|
"To look up public ontologies, use .lookup(public=True)"
|
107
143
|
)
|
108
|
-
else: #
|
144
|
+
else:  # pragma: no cover
|
109
145
|
return colors.warning("No fields are found!")
|
110
146
|
|
111
147
|
|
112
|
-
|
113
|
-
|
148
|
+
CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
|
149
|
+
|
150
|
+
|
151
|
+
SLOTS_DOCSTRING = """Curator objects by slot.
|
152
|
+
|
153
|
+
.. versionadded:: 1.1.1
|
154
|
+
"""
|
155
|
+
|
156
|
+
|
157
|
+
VALIDATE_DOCSTRING = """Validate dataset.
|
158
|
+
|
159
|
+
Raises:
|
160
|
+
lamindb.errors.ValidationError: If validation fails.
|
161
|
+
"""
|
162
|
+
|
163
|
+
SAVE_ARTIFACT_DOCSTRING = """Save an annotated artifact.
|
164
|
+
|
165
|
+
Args:
|
166
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
|
167
|
+
description: A description.
|
168
|
+
revises: Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
|
169
|
+
run: The run that creates the artifact.
|
170
|
+
|
171
|
+
Returns:
|
172
|
+
A saved artifact record.
|
173
|
+
"""
|
174
|
+
|
175
|
+
|
176
|
+
class Curator:
|
177
|
+
"""Dataset curator.
|
178
|
+
|
179
|
+
A `Curator` object makes it easy to validate, standardize & annotate datasets.
|
180
|
+
|
181
|
+
.. versionadded:: 1.1.0
|
182
|
+
|
183
|
+
See:
|
184
|
+
- :class:`~lamindb.curators.DataFrameCurator`
|
185
|
+
- :class:`~lamindb.curators.AnnDataCurator`
|
186
|
+
"""
|
187
|
+
|
188
|
+
def __init__(self, dataset: Any, schema: Schema | None = None):
|
189
|
+
self._artifact: Artifact = None # pass the dataset as an artifact
|
190
|
+
self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
|
191
|
+
if isinstance(self._dataset, Artifact):
|
192
|
+
self._artifact = self._dataset
|
193
|
+
if self._artifact.otype in {"DataFrame", "AnnData"}:
|
194
|
+
self._dataset = self._dataset.load()
|
195
|
+
self._schema: Schema | None = schema
|
196
|
+
self._is_validated: bool = False
|
197
|
+
self._cat_manager: CatManager = None # is None for CatManager curators
|
198
|
+
|
199
|
+
@doc_args(VALIDATE_DOCSTRING)
|
200
|
+
def validate(self) -> bool | str:
|
201
|
+
"""{}""" # noqa: D415
|
202
|
+
pass  # pragma: no cover
|
203
|
+
|
204
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
205
|
+
def save_artifact(
|
206
|
+
self,
|
207
|
+
*,
|
208
|
+
key: str | None = None,
|
209
|
+
description: str | None = None,
|
210
|
+
revises: Artifact | None = None,
|
211
|
+
run: Run | None = None,
|
212
|
+
) -> Artifact:
|
213
|
+
"""{}""" # noqa: D415
|
214
|
+
# Note that this docstring has to be consistent with the Artifact()
|
215
|
+
# constructor signature
|
216
|
+
pass
|
217
|
+
|
218
|
+
|
219
|
+
class DataFrameCurator(Curator):
|
220
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
221
|
+
"""Curator for a DataFrame object.
|
222
|
+
|
223
|
+
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
224
|
+
|
225
|
+
.. versionadded:: 1.1.0
|
226
|
+
|
227
|
+
Args:
|
228
|
+
dataset: The DataFrame-like object to validate & annotate.
|
229
|
+
schema: A `Schema` object that defines the validation constraints.
|
230
|
+
|
231
|
+
Example::
|
232
|
+
|
233
|
+
import lamindb as ln
|
234
|
+
import bionty as bt
|
235
|
+
|
236
|
+
# define valid labels
|
237
|
+
perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
|
238
|
+
ln.ULabel(name="DMSO", type=perturbation).save()
|
239
|
+
ln.ULabel(name="IFNG", type=perturbation).save()
|
240
|
+
bt.CellType.from_source(name="B cell").save()
|
241
|
+
bt.CellType.from_source(name="T cell").save()
|
242
|
+
|
243
|
+
# define schema
|
244
|
+
schema = ln.Schema(
|
245
|
+
name="small_dataset1_obs_level_metadata",
|
246
|
+
features=[
|
247
|
+
ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
|
248
|
+
ln.Feature(name="sample_note", dtype=str).save(),
|
249
|
+
ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
|
250
|
+
ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
|
251
|
+
],
|
252
|
+
).save()
|
253
|
+
|
254
|
+
# curate a DataFrame
|
255
|
+
df = datasets.small_dataset1(otype="DataFrame")
|
256
|
+
curator = ln.curators.DataFrameCurator(df, schema)
|
257
|
+
artifact = curator.save_artifact(key="example_datasets/dataset1.parquet")
|
258
|
+
assert artifact.schema == schema
|
259
|
+
"""
|
260
|
+
|
261
|
+
def __init__(
|
262
|
+
self,
|
263
|
+
dataset: pd.DataFrame | Artifact,
|
264
|
+
schema: Schema,
|
265
|
+
) -> None:
|
266
|
+
super().__init__(dataset=dataset, schema=schema)
|
267
|
+
categoricals = {}
|
268
|
+
if schema.n > 0:
|
269
|
+
# populate features
|
270
|
+
pandera_columns = {}
|
271
|
+
for feature in schema.features.all():
|
272
|
+
pandera_dtype = (
|
273
|
+
feature.dtype if not feature.dtype.startswith("cat") else "category"
|
274
|
+
)
|
275
|
+
pandera_columns[feature.name] = pandera.Column(
|
276
|
+
pandera_dtype, nullable=feature.nullable
|
277
|
+
)
|
278
|
+
if feature.dtype.startswith("cat"):
|
279
|
+
categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
|
280
|
+
self._pandera_schema = pandera.DataFrameSchema(
|
281
|
+
pandera_columns, coerce=schema.coerce_dtype
|
282
|
+
)
|
283
|
+
else:
|
284
|
+
assert schema.itype is not None # noqa: S101
|
285
|
+
self._cat_manager = DataFrameCatManager(
|
286
|
+
self._dataset,
|
287
|
+
columns=parse_dtype_single_cat(schema.itype, is_itype=True)["field"],
|
288
|
+
categoricals=categoricals,
|
289
|
+
)
|
290
|
+
|
291
|
+
@property
|
292
|
+
@doc_args(CAT_MANAGER_DOCSTRING)
|
293
|
+
def cat(self) -> CatManager:
|
294
|
+
"""{}""" # noqa: D415
|
295
|
+
return self._cat_manager
|
296
|
+
|
297
|
+
def standardize(self) -> None:
|
298
|
+
"""Standardize the dataset.
|
299
|
+
|
300
|
+
- Adds missing columns for features
|
301
|
+
- Fills missing values for features with default values
|
302
|
+
"""
|
303
|
+
for feature in self._schema.members:
|
304
|
+
if feature.name not in self._dataset.columns:
|
305
|
+
if feature.default_value is not None or feature.nullable:
|
306
|
+
fill_value = (
|
307
|
+
feature.default_value
|
308
|
+
if feature.default_value is not None
|
309
|
+
else pd.NA
|
310
|
+
)
|
311
|
+
if feature.dtype.startswith("cat"):
|
312
|
+
self._dataset[feature.name] = pd.Categorical(
|
313
|
+
[fill_value] * len(self._dataset)
|
314
|
+
)
|
315
|
+
else:
|
316
|
+
self._dataset[feature.name] = fill_value
|
317
|
+
logger.important(
|
318
|
+
f"added column {feature.name} with fill value {fill_value}"
|
319
|
+
)
|
320
|
+
else:
|
321
|
+
raise ValidationError(
|
322
|
+
f"Missing column {feature.name} cannot be added because is not nullable and has no default value"
|
323
|
+
)
|
324
|
+
else:
|
325
|
+
if feature.default_value is not None:
|
326
|
+
if isinstance(
|
327
|
+
self._dataset[feature.name].dtype, pd.CategoricalDtype
|
328
|
+
):
|
329
|
+
if (
|
330
|
+
feature.default_value
|
331
|
+
not in self._dataset[feature.name].cat.categories
|
332
|
+
):
|
333
|
+
self._dataset[feature.name] = self._dataset[
|
334
|
+
feature.name
|
335
|
+
].cat.add_categories(feature.default_value)
|
336
|
+
self._dataset[feature.name] = self._dataset[feature.name].fillna(
|
337
|
+
feature.default_value
|
338
|
+
)
|
339
|
+
|
340
|
+
def _cat_manager_validate(self) -> None:
|
341
|
+
self._cat_manager.validate()
|
342
|
+
if self._cat_manager._is_validated:
|
343
|
+
self._is_validated = True
|
344
|
+
else:
|
345
|
+
self._is_validated = False
|
346
|
+
raise ValidationError(self._cat_manager._validate_category_error_messages)
|
347
|
+
|
348
|
+
@doc_args(VALIDATE_DOCSTRING)
|
349
|
+
def validate(self) -> None:
|
350
|
+
"""{}""" # noqa: D415
|
351
|
+
if self._schema.n > 0:
|
352
|
+
try:
|
353
|
+
# first validate through pandera
|
354
|
+
self._pandera_schema.validate(self._dataset)
|
355
|
+
# then validate lamindb categoricals
|
356
|
+
self._cat_manager_validate()
|
357
|
+
except pandera.errors.SchemaError as err:
|
358
|
+
self._is_validated = False
|
359
|
+
# .exconly() doesn't exist on SchemaError
|
360
|
+
raise ValidationError(str(err)) from err
|
361
|
+
else:
|
362
|
+
self._cat_manager_validate()
|
363
|
+
|
364
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
365
|
+
def save_artifact(
|
366
|
+
self,
|
367
|
+
*,
|
368
|
+
key: str | None = None,
|
369
|
+
description: str | None = None,
|
370
|
+
revises: Artifact | None = None,
|
371
|
+
run: Run | None = None,
|
372
|
+
):
|
373
|
+
"""{}""" # noqa: D415
|
374
|
+
if not self._is_validated:
|
375
|
+
self.validate() # raises ValidationError if doesn't validate
|
376
|
+
result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
|
377
|
+
return save_artifact( # type: ignore
|
378
|
+
self._dataset,
|
379
|
+
description=description,
|
380
|
+
fields=self._cat_manager.categoricals,
|
381
|
+
columns_field=result["field"],
|
382
|
+
key=key,
|
383
|
+
artifact=self._artifact,
|
384
|
+
revises=revises,
|
385
|
+
run=run,
|
386
|
+
schema=self._schema,
|
387
|
+
)
|
388
|
+
|
389
|
+
|
390
|
+
class AnnDataCurator(Curator):
|
391
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
392
|
+
"""Curator for a DataFrame object.
|
393
|
+
|
394
|
+
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
395
|
+
|
396
|
+
.. versionadded:: 1.1.0
|
397
|
+
|
398
|
+
Args:
|
399
|
+
dataset: The AnnData-like object to validate & annotate.
|
400
|
+
schema: A `Schema` object that defines the validation constraints.
|
401
|
+
|
402
|
+
Example::
|
403
|
+
|
404
|
+
import lamindb as ln
|
405
|
+
import bionty as bt
|
406
|
+
|
407
|
+
# define valid labels
|
408
|
+
perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
|
409
|
+
ln.ULabel(name="DMSO", type=perturbation).save()
|
410
|
+
ln.ULabel(name="IFNG", type=perturbation).save()
|
411
|
+
bt.CellType.from_source(name="B cell").save()
|
412
|
+
bt.CellType.from_source(name="T cell").save()
|
413
|
+
|
414
|
+
# define obs schema
|
415
|
+
obs_schema = ln.Schema(
|
416
|
+
name="small_dataset1_obs_level_metadata",
|
417
|
+
features=[
|
418
|
+
ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
|
419
|
+
ln.Feature(name="sample_note", dtype=str).save(),
|
420
|
+
ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
|
421
|
+
ln.Feature(name="cell_type_by_model", dtype=bt.CellType").save(),
|
422
|
+
],
|
423
|
+
).save()
|
424
|
+
|
425
|
+
# define var schema
|
426
|
+
var_schema = ln.Schema(
|
427
|
+
name="scRNA_seq_var_schema",
|
428
|
+
itype=bt.Gene.ensembl_gene_id,
|
429
|
+
dtype=int,
|
430
|
+
).save()
|
431
|
+
|
432
|
+
# define composite schema
|
433
|
+
anndata_schema = ln.Schema(
|
434
|
+
name="small_dataset1_anndata_schema",
|
435
|
+
otype="AnnData",
|
436
|
+
components={"obs": obs_schema, "var": var_schema},
|
437
|
+
).save()
|
438
|
+
|
439
|
+
# curate an AnnData
|
440
|
+
adata = datasets.small_dataset1(otype="AnnData")
|
441
|
+
curator = ln.curators.AnnDataCurator(adata, anndata_schema)
|
442
|
+
artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
|
443
|
+
assert artifact.schema == anndata_schema
|
444
|
+
"""
|
445
|
+
|
446
|
+
def __init__(
|
447
|
+
self,
|
448
|
+
dataset: AnnData | Artifact,
|
449
|
+
schema: Schema,
|
450
|
+
) -> None:
|
451
|
+
super().__init__(dataset=dataset, schema=schema)
|
452
|
+
if not data_is_anndata(self._dataset):
|
453
|
+
raise InvalidArgument("dataset must be AnnData-like.")
|
454
|
+
if schema.otype != "AnnData":
|
455
|
+
raise InvalidArgument("Schema otype must be 'AnnData'.")
|
456
|
+
self._obs_curator = DataFrameCurator(
|
457
|
+
self._dataset.obs, schema._get_component("obs")
|
458
|
+
)
|
459
|
+
self._var_curator = DataFrameCurator(
|
460
|
+
self._dataset.var.T, schema._get_component("var")
|
461
|
+
)
|
462
|
+
|
463
|
+
@property
|
464
|
+
@doc_args(SLOTS_DOCSTRING)
|
465
|
+
def slots(self) -> dict[str, DataFrameCurator]:
|
466
|
+
"""{}""" # noqa: D415
|
467
|
+
return {"obs": self._obs_curator, "var": self._var_curator}
|
468
|
+
|
469
|
+
@doc_args(VALIDATE_DOCSTRING)
|
470
|
+
def validate(self) -> None:
|
471
|
+
"""{}""" # noqa: D415
|
472
|
+
self._obs_curator.validate()
|
473
|
+
self._var_curator.validate()
|
474
|
+
|
475
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
476
|
+
def save_artifact(
|
477
|
+
self,
|
478
|
+
*,
|
479
|
+
key: str | None = None,
|
480
|
+
description: str | None = None,
|
481
|
+
revises: Artifact | None = None,
|
482
|
+
run: Run | None = None,
|
483
|
+
):
|
484
|
+
"""{}""" # noqa: D415
|
485
|
+
if not self._is_validated:
|
486
|
+
self.validate() # raises ValidationError if doesn't validate
|
487
|
+
result = parse_dtype_single_cat(self._var_curator._schema.itype, is_itype=True)
|
488
|
+
return save_artifact( # type: ignore
|
489
|
+
self._dataset,
|
490
|
+
description=description,
|
491
|
+
fields=self._obs_curator._cat_manager.categoricals,
|
492
|
+
columns_field=result["field"],
|
493
|
+
key=key,
|
494
|
+
artifact=self._artifact,
|
495
|
+
revises=revises,
|
496
|
+
run=run,
|
497
|
+
schema=self._schema,
|
498
|
+
)
|
499
|
+
|
500
|
+
|
501
|
+
class CatManager:
|
502
|
+
"""Manage valid categoricals by updating registries.
|
503
|
+
|
504
|
+
A `CatManager` object makes it easy to validate, standardize & annotate datasets.
|
505
|
+
|
506
|
+
Example:
|
507
|
+
|
508
|
+
>>> cat_manager = ln.CatManager(
|
509
|
+
>>> dataset,
|
510
|
+
>>> # define validation criteria as mappings
|
511
|
+
>>> columns=Feature.name, # map column names
|
512
|
+
>>> categoricals={"perturbation": ULabel.name}, # map categories
|
513
|
+
>>> )
|
514
|
+
>>> cat_manager.validate() # validate the dataframe
|
515
|
+
>>> artifact = cat_manager.save_artifact(description="my RNA-seq")
|
516
|
+
>>> artifact.describe() # see annotations
|
517
|
+
|
518
|
+
`cat_manager.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
|
519
|
+
|
520
|
+
If you find non-validated values, you have several options:
|
521
|
+
|
522
|
+
- new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCatManager.add_new_from`
|
523
|
+
- non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCatManager.non_validated` and addressed manually
|
524
|
+
"""
|
525
|
+
|
526
|
+
def __init__(
|
527
|
+
self, *, dataset, categoricals, sources, organism, exclude, columns_field=None
|
528
|
+
):
|
529
|
+
# the below is shared with Curator
|
530
|
+
self._artifact: Artifact = None # pass the dataset as an artifact
|
531
|
+
self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
|
532
|
+
if isinstance(self._dataset, Artifact):
|
533
|
+
self._artifact = self._dataset
|
534
|
+
if self._artifact.otype in {"DataFrame", "AnnData"}:
|
535
|
+
self._dataset = self._dataset.load()
|
536
|
+
self._is_validated: bool = False
|
537
|
+
# shared until here
|
538
|
+
self._categoricals = categoricals or {}
|
539
|
+
self._non_validated = None
|
540
|
+
self._organism = organism
|
541
|
+
self._sources = sources or {}
|
542
|
+
self._exclude = exclude or {}
|
543
|
+
self._columns_field = columns_field
|
544
|
+
self._validate_category_error_messages: str = ""
|
545
|
+
|
546
|
+
@property
|
547
|
+
def non_validated(self) -> dict[str, list[str]]:
|
548
|
+
"""Return the non-validated features and labels."""
|
549
|
+
if self._non_validated is None:
|
550
|
+
raise ValidationError("Please run validate() first!")
|
551
|
+
return self._non_validated
|
114
552
|
|
115
|
-
|
116
|
-
|
117
|
-
|
553
|
+
@property
|
554
|
+
def categoricals(self) -> dict:
|
555
|
+
"""Return the columns fields to validate against."""
|
556
|
+
return self._categoricals
|
118
557
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
558
|
+
def _replace_synonyms(
|
559
|
+
self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
|
560
|
+
):
|
561
|
+
# replace the values in df
|
562
|
+
std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
|
563
|
+
# remove the standardized values from self.non_validated
|
564
|
+
non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
|
565
|
+
if len(non_validated) == 0:
|
566
|
+
self._non_validated.pop(key, None) # type: ignore
|
567
|
+
else:
|
568
|
+
self._non_validated[key] = non_validated # type: ignore
|
569
|
+
# logging
|
570
|
+
n = len(syn_mapper)
|
571
|
+
if n > 0:
|
572
|
+
syn_mapper_print = _format_values(
|
573
|
+
[f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
|
574
|
+
)
|
575
|
+
s = "s" if n > 1 else ""
|
576
|
+
logger.success(
|
577
|
+
f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
|
578
|
+
)
|
579
|
+
return std_values
|
123
580
|
|
124
581
|
def validate(self) -> bool:
|
125
582
|
"""Validate dataset.
|
@@ -127,9 +584,9 @@ class BaseCurator:
|
|
127
584
|
This method also registers the validated records in the current instance.
|
128
585
|
|
129
586
|
Returns:
|
130
|
-
|
587
|
+
The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
|
131
588
|
"""
|
132
|
-
pass
|
589
|
+
pass
|
133
590
|
|
134
591
|
def standardize(self, key: str) -> None:
|
135
592
|
"""Replace synonyms with standardized values.
|
@@ -142,30 +599,48 @@ class BaseCurator:
|
|
142
599
|
Returns:
|
143
600
|
None
|
144
601
|
"""
|
145
|
-
pass #
|
602
|
+
pass  # pragma: no cover
|
146
603
|
|
604
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
147
605
|
def save_artifact(
|
148
606
|
self,
|
149
|
-
|
607
|
+
*,
|
150
608
|
key: str | None = None,
|
609
|
+
description: str | None = None,
|
151
610
|
revises: Artifact | None = None,
|
152
611
|
run: Run | None = None,
|
153
612
|
) -> Artifact:
|
154
|
-
"""
|
613
|
+
"""{}""" # noqa: D415
|
614
|
+
from lamindb.core._settings import settings
|
155
615
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
run: The run that creates the artifact.
|
616
|
+
if not self._is_validated:
|
617
|
+
self.validate() # returns True or False
|
618
|
+
if not self._is_validated: # need to raise error manually
|
619
|
+
raise ValidationError("Dataset does not validate. Please curate.")
|
161
620
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
621
|
+
# Make sure all labels are saved in the current instance
|
622
|
+
verbosity = settings.verbosity
|
623
|
+
try:
|
624
|
+
settings.verbosity = "warning"
|
625
|
+
self._artifact = save_artifact( # type: ignore
|
626
|
+
self._dataset,
|
627
|
+
description=description,
|
628
|
+
fields=self.categoricals,
|
629
|
+
columns_field=self._columns_field,
|
630
|
+
key=key,
|
631
|
+
artifact=self._artifact,
|
632
|
+
revises=revises,
|
633
|
+
run=run,
|
634
|
+
schema=None,
|
635
|
+
organism=self._organism,
|
636
|
+
)
|
637
|
+
finally:
|
638
|
+
settings.verbosity = verbosity
|
639
|
+
|
640
|
+
return self._artifact
|
166
641
|
|
167
642
|
|
168
|
-
class
|
643
|
+
class DataFrameCatManager(CatManager):
|
169
644
|
"""Curation flow for a DataFrame object.
|
170
645
|
|
171
646
|
See also :class:`~lamindb.Curator`.
|
@@ -174,7 +649,6 @@ class DataFrameCurator(BaseCurator):
|
|
174
649
|
df: The DataFrame object to curate.
|
175
650
|
columns: The field attribute for the feature column.
|
176
651
|
categoricals: A dictionary mapping column names to registry_field.
|
177
|
-
using_key: The reference instance containing registries to validate against.
|
178
652
|
verbosity: The verbosity level.
|
179
653
|
organism: The organism name.
|
180
654
|
sources: A dictionary mapping column names to Source records.
|
@@ -191,165 +665,103 @@ class DataFrameCurator(BaseCurator):
|
|
191
665
|
... df,
|
192
666
|
... categoricals={
|
193
667
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
194
|
-
... "donor_id":
|
668
|
+
... "donor_id": ULabel.name
|
195
669
|
... }
|
196
670
|
... )
|
197
671
|
"""
|
198
672
|
|
199
673
|
def __init__(
|
200
674
|
self,
|
201
|
-
df: pd.DataFrame,
|
675
|
+
df: pd.DataFrame | Artifact,
|
202
676
|
columns: FieldAttr = Feature.name,
|
203
677
|
categoricals: dict[str, FieldAttr] | None = None,
|
204
|
-
using_key: str | None = None,
|
205
678
|
verbosity: str = "hint",
|
206
679
|
organism: str | None = None,
|
207
680
|
sources: dict[str, Record] | None = None,
|
208
681
|
exclude: dict | None = None,
|
209
|
-
check_valid_keys: bool = True,
|
210
682
|
) -> None:
|
211
683
|
from lamindb.core._settings import settings
|
212
684
|
|
213
685
|
if organism is not None and not isinstance(organism, str):
|
214
686
|
raise ValueError("organism must be a string such as 'human' or 'mouse'!")
|
215
687
|
|
216
|
-
self._df = df
|
217
|
-
self._fields = categoricals or {}
|
218
|
-
self._columns_field = columns
|
219
|
-
self._using_key = using_key
|
220
|
-
# TODO: change verbosity back
|
221
688
|
settings.verbosity = verbosity
|
222
|
-
self._artifact = None
|
223
|
-
self._collection = None
|
224
|
-
self._validated = False
|
225
|
-
self._kwargs = {"organism": organism} if organism else {}
|
226
|
-
self._sources = sources or {}
|
227
|
-
self._exclude = exclude or {}
|
228
689
|
self._non_validated = None
|
229
|
-
|
230
|
-
|
690
|
+
super().__init__(
|
691
|
+
dataset=df,
|
692
|
+
columns_field=columns,
|
693
|
+
organism=organism,
|
694
|
+
categoricals=categoricals,
|
695
|
+
sources=sources,
|
696
|
+
exclude=exclude,
|
697
|
+
)
|
231
698
|
self._save_columns()
|
232
699
|
|
233
|
-
|
234
|
-
def non_validated(self) -> dict[str, list[str]]:
|
235
|
-
"""Return the non-validated features and labels."""
|
236
|
-
if self._non_validated is None:
|
237
|
-
raise ValidationError("Please run validate() first!")
|
238
|
-
return self._non_validated
|
239
|
-
|
240
|
-
@property
|
241
|
-
def fields(self) -> dict:
|
242
|
-
"""Return the columns fields to validate against."""
|
243
|
-
return self._fields
|
244
|
-
|
245
|
-
def lookup(
|
246
|
-
self, using_key: str | None = None, public: bool = False
|
247
|
-
) -> CurateLookup:
|
700
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
248
701
|
"""Lookup categories.
|
249
702
|
|
250
703
|
Args:
|
251
|
-
|
252
|
-
if "public", the lookup is performed on the public reference.
|
704
|
+
public: If "public", the lookup is performed on the public reference.
|
253
705
|
"""
|
254
706
|
return CurateLookup(
|
255
|
-
categoricals=self.
|
707
|
+
categoricals=self._categoricals,
|
256
708
|
slots={"columns": self._columns_field},
|
257
|
-
using_key=using_key or self._using_key,
|
258
709
|
public=public,
|
259
710
|
)
|
260
711
|
|
261
|
-
def _check_valid_keys(self, extra: set | None = None) -> None:
|
262
|
-
extra = extra or set()
|
263
|
-
for name, d in {
|
264
|
-
"categoricals": self._fields,
|
265
|
-
"sources": self._sources,
|
266
|
-
"exclude": self._exclude,
|
267
|
-
}.items():
|
268
|
-
if not isinstance(d, dict):
|
269
|
-
raise TypeError(f"{name} must be a dictionary!")
|
270
|
-
valid_keys = set(self._df.columns) | {"columns"} | extra
|
271
|
-
nonval_keys = [key for key in d.keys() if key not in valid_keys]
|
272
|
-
n = len(nonval_keys)
|
273
|
-
s = "s" if n > 1 else ""
|
274
|
-
are = "are" if n > 1 else "is"
|
275
|
-
if len(nonval_keys) > 0:
|
276
|
-
raise ValidationError(
|
277
|
-
f"key{s} passed to {name} {are} not present in columns: {colors.yellow(_format_values(nonval_keys))}"
|
278
|
-
)
|
279
|
-
|
280
712
|
def _save_columns(self, validated_only: bool = True) -> None:
|
281
713
|
"""Save column name records."""
|
282
714
|
# Always save features specified as the fields keys
|
283
715
|
update_registry(
|
284
|
-
values=list(self.
|
716
|
+
values=list(self.categoricals.keys()),
|
285
717
|
field=self._columns_field,
|
286
718
|
key="columns",
|
287
|
-
using_key=self._using_key,
|
288
719
|
validated_only=False,
|
289
720
|
source=self._sources.get("columns"),
|
290
721
|
exclude=self._exclude.get("columns"),
|
291
|
-
**self._kwargs, # type: ignore
|
292
722
|
)
|
293
723
|
|
294
724
|
# Save the rest of the columns based on validated_only
|
295
|
-
additional_columns = set(self.
|
725
|
+
additional_columns = set(self._dataset.columns) - set(self.categoricals.keys())
|
296
726
|
if additional_columns:
|
297
727
|
update_registry(
|
298
728
|
values=list(additional_columns),
|
299
729
|
field=self._columns_field,
|
300
730
|
key="columns",
|
301
|
-
using_key=self._using_key,
|
302
731
|
validated_only=validated_only,
|
303
|
-
df=self.
|
732
|
+
df=self._dataset, # Get the Feature type from df
|
304
733
|
source=self._sources.get("columns"),
|
305
734
|
exclude=self._exclude.get("columns"),
|
306
|
-
**self._kwargs, # type: ignore
|
307
735
|
)
|
308
736
|
|
309
|
-
|
310
|
-
|
737
|
+
@deprecated(new_name="is run by default")
|
738
|
+
def add_new_from_columns(self, organism: str | None = None, **kwargs):
|
739
|
+
pass
|
740
|
+
|
741
|
+
def validate(self) -> bool:
|
742
|
+
"""Validate variables and categorical observations.
|
743
|
+
|
744
|
+
This method also registers the validated records in the current instance:
|
745
|
+
- from public sources
|
311
746
|
|
312
747
|
Args:
|
313
|
-
key: The key referencing the slot in the DataFrame from which to draw terms.
|
314
748
|
organism: The organism name.
|
315
|
-
|
749
|
+
|
750
|
+
Returns:
|
751
|
+
Whether the DataFrame is validated.
|
316
752
|
"""
|
317
|
-
|
318
|
-
|
319
|
-
self.
|
320
|
-
self.
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
stacklevel=2,
|
753
|
+
# add all validated records to the current instance
|
754
|
+
self._update_registry_all()
|
755
|
+
self._validate_category_error_messages = "" # reset the error messages
|
756
|
+
self._is_validated, self._non_validated = validate_categories_in_df( # type: ignore
|
757
|
+
self._dataset,
|
758
|
+
fields=self.categoricals,
|
759
|
+
sources=self._sources,
|
760
|
+
exclude=self._exclude,
|
761
|
+
curator=self,
|
762
|
+
organism=self._organism,
|
328
763
|
)
|
329
|
-
|
330
|
-
|
331
|
-
def _replace_synonyms(
|
332
|
-
self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
|
333
|
-
):
|
334
|
-
# replace the values in df
|
335
|
-
std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
|
336
|
-
# remove the standardized values from self.non_validated
|
337
|
-
non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
|
338
|
-
if len(non_validated) == 0:
|
339
|
-
self._non_validated.pop(key, None) # type: ignore
|
340
|
-
else:
|
341
|
-
self._non_validated[key] = non_validated # type: ignore
|
342
|
-
# logging
|
343
|
-
n = len(syn_mapper)
|
344
|
-
if n > 0:
|
345
|
-
syn_mapper_print = _format_values(
|
346
|
-
[f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
|
347
|
-
)
|
348
|
-
s = "s" if n > 1 else ""
|
349
|
-
logger.success(
|
350
|
-
f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
|
351
|
-
)
|
352
|
-
return std_values
|
764
|
+
return self._is_validated
|
353
765
|
|
354
766
|
def standardize(self, key: str) -> None:
|
355
767
|
"""Replace synonyms with standardized values.
|
@@ -359,6 +771,8 @@ class DataFrameCurator(BaseCurator):
|
|
359
771
|
Args:
|
360
772
|
key: The key referencing the column in the DataFrame to standardize.
|
361
773
|
"""
|
774
|
+
if self._artifact is not None:
|
775
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
362
776
|
# list is needed to avoid RuntimeError: dictionary changed size during iteration
|
363
777
|
avail_keys = list(self.non_validated.keys())
|
364
778
|
if len(avail_keys) == 0:
|
@@ -367,137 +781,74 @@ class DataFrameCurator(BaseCurator):
|
|
367
781
|
|
368
782
|
if key == "all":
|
369
783
|
for k in avail_keys:
|
370
|
-
if k in self.
|
784
|
+
if k in self._categoricals: # needed to exclude var_index
|
371
785
|
syn_mapper = standardize_categories(
|
372
786
|
self.non_validated[k],
|
373
|
-
field=self.
|
374
|
-
using_key=self._using_key,
|
787
|
+
field=self._categoricals[k],
|
375
788
|
source=self._sources.get(k),
|
376
|
-
**self._kwargs,
|
377
789
|
)
|
378
|
-
self.
|
790
|
+
self._dataset[k] = self._replace_synonyms(
|
791
|
+
k, syn_mapper, self._dataset[k]
|
792
|
+
)
|
379
793
|
else:
|
380
794
|
if key not in avail_keys:
|
381
|
-
if key in self.
|
795
|
+
if key in self._categoricals:
|
382
796
|
logger.info(f"No unstandardized values found for {key!r}")
|
383
797
|
else:
|
384
798
|
raise KeyError(
|
385
799
|
f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
|
386
800
|
)
|
387
801
|
else:
|
388
|
-
if key in self.
|
802
|
+
if key in self._categoricals: # needed to exclude var_index
|
389
803
|
syn_mapper = standardize_categories(
|
390
804
|
self.non_validated[key],
|
391
|
-
field=self.
|
392
|
-
using_key=self._using_key,
|
805
|
+
field=self._categoricals[key],
|
393
806
|
source=self._sources.get(key),
|
394
|
-
|
807
|
+
organism=self._organism,
|
395
808
|
)
|
396
|
-
self.
|
397
|
-
key, syn_mapper, self.
|
809
|
+
self._dataset[key] = self._replace_synonyms(
|
810
|
+
key, syn_mapper, self._dataset[key]
|
398
811
|
)
|
399
812
|
|
813
|
+
def _update_registry_all(self, validated_only: bool = True, **kwargs):
|
814
|
+
"""Save labels for all features."""
|
815
|
+
for name in self.categoricals.keys():
|
816
|
+
self._update_registry(name, validated_only=validated_only, **kwargs)
|
817
|
+
|
400
818
|
def _update_registry(
|
401
819
|
self, categorical: str, validated_only: bool = True, **kwargs
|
402
820
|
) -> None:
|
403
821
|
if categorical == "all":
|
404
822
|
self._update_registry_all(validated_only=validated_only, **kwargs)
|
405
823
|
else:
|
406
|
-
if categorical not in self.
|
824
|
+
if categorical not in self.categoricals:
|
407
825
|
raise ValidationError(
|
408
826
|
f"Feature {categorical} is not part of the fields!"
|
409
827
|
)
|
410
828
|
update_registry(
|
411
|
-
values=_flatten_unique(self.
|
412
|
-
field=self.
|
829
|
+
values=_flatten_unique(self._dataset[categorical]),
|
830
|
+
field=self.categoricals[categorical],
|
413
831
|
key=categorical,
|
414
|
-
using_key=self._using_key,
|
415
832
|
validated_only=validated_only,
|
416
833
|
source=self._sources.get(categorical),
|
417
834
|
exclude=self._exclude.get(categorical),
|
418
|
-
|
835
|
+
organism=self._organism,
|
419
836
|
)
|
420
837
|
# adding new records removes them from non_validated
|
421
838
|
if not validated_only and self._non_validated:
|
422
839
|
self._non_validated.pop(categorical, None) # type: ignore
|
423
840
|
|
424
|
-
def
|
425
|
-
"""
|
426
|
-
for name in self.fields.keys():
|
427
|
-
self._update_registry(name, validated_only=validated_only, **kwargs)
|
428
|
-
|
429
|
-
def validate(self, organism: str | None = None) -> bool:
|
430
|
-
"""Validate variables and categorical observations.
|
431
|
-
|
432
|
-
This method also registers the validated records in the current instance:
|
433
|
-
- from public sources
|
434
|
-
- from the using_key instance
|
841
|
+
def add_new_from(self, key: str, **kwargs):
|
842
|
+
"""Add validated & new categories.
|
435
843
|
|
436
844
|
Args:
|
845
|
+
key: The key referencing the slot in the DataFrame from which to draw terms.
|
437
846
|
organism: The organism name.
|
438
|
-
|
439
|
-
Returns:
|
440
|
-
Whether the DataFrame is validated.
|
441
|
-
"""
|
442
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
443
|
-
|
444
|
-
# add all validated records to the current instance
|
445
|
-
self._update_registry_all()
|
446
|
-
|
447
|
-
self._validated, self._non_validated = validate_categories_in_df( # type: ignore
|
448
|
-
self._df,
|
449
|
-
fields=self.fields,
|
450
|
-
using_key=self._using_key,
|
451
|
-
sources=self._sources,
|
452
|
-
exclude=self._exclude,
|
453
|
-
**self._kwargs,
|
454
|
-
)
|
455
|
-
return self._validated
|
456
|
-
|
457
|
-
def save_artifact(
|
458
|
-
self,
|
459
|
-
description: str | None = None,
|
460
|
-
key: str | None = None,
|
461
|
-
revises: Artifact | None = None,
|
462
|
-
run: Run | None = None,
|
463
|
-
) -> Artifact:
|
464
|
-
"""Save the validated DataFrame and metadata.
|
465
|
-
|
466
|
-
Args:
|
467
|
-
description: Description of the DataFrame object.
|
468
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
|
469
|
-
Artifacts with the same key form a revision family.
|
470
|
-
revises: Previous version of the artifact. Triggers a revision.
|
471
|
-
run: The run that creates the artifact.
|
472
|
-
|
473
|
-
Returns:
|
474
|
-
A saved artifact record.
|
847
|
+
**kwargs: Additional keyword arguments to pass to create new records
|
475
848
|
"""
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
self.validate()
|
480
|
-
if not self._validated:
|
481
|
-
raise ValidationError("Dataset does not validate. Please curate.")
|
482
|
-
|
483
|
-
# Make sure all labels are saved in the current instance
|
484
|
-
verbosity = settings.verbosity
|
485
|
-
try:
|
486
|
-
settings.verbosity = "warning"
|
487
|
-
self._artifact = save_artifact(
|
488
|
-
self._df,
|
489
|
-
description=description,
|
490
|
-
fields=self.fields,
|
491
|
-
columns_field=self._columns_field,
|
492
|
-
key=key,
|
493
|
-
revises=revises,
|
494
|
-
run=run,
|
495
|
-
**self._kwargs,
|
496
|
-
)
|
497
|
-
finally:
|
498
|
-
settings.verbosity = verbosity
|
499
|
-
|
500
|
-
return self._artifact
|
849
|
+
if len(kwargs) > 0 and key == "all":
|
850
|
+
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
851
|
+
self._update_registry(key, validated_only=False, **kwargs)
|
501
852
|
|
502
853
|
def clean_up_failed_runs(self):
|
503
854
|
"""Clean up previous failed runs that don't save any outputs."""
|
@@ -509,21 +860,14 @@ class DataFrameCurator(BaseCurator):
|
|
509
860
|
).delete()
|
510
861
|
|
511
862
|
|
512
|
-
class
|
513
|
-
"""
|
514
|
-
|
515
|
-
See also :class:`~lamindb.Curator`.
|
516
|
-
|
517
|
-
Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.
|
518
|
-
|
519
|
-
See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.
|
863
|
+
class AnnDataCatManager(CatManager):
|
864
|
+
"""Manage categorical curation.
|
520
865
|
|
521
866
|
Args:
|
522
867
|
data: The AnnData object or an AnnData-like path.
|
523
868
|
var_index: The registry field for mapping the ``.var`` index.
|
524
869
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
525
870
|
obs_columns: The registry field for mapping the ``.obs.columns``.
|
526
|
-
using_key: A reference LaminDB instance.
|
527
871
|
verbosity: The verbosity level.
|
528
872
|
organism: The organism name.
|
529
873
|
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
@@ -538,7 +882,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
538
882
|
... var_index=bt.Gene.ensembl_gene_id,
|
539
883
|
... categoricals={
|
540
884
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
541
|
-
... "donor_id":
|
885
|
+
... "donor_id": ULabel.name
|
542
886
|
... },
|
543
887
|
... organism="human",
|
544
888
|
... )
|
@@ -546,56 +890,48 @@ class AnnDataCurator(DataFrameCurator):
|
|
546
890
|
|
547
891
|
def __init__(
|
548
892
|
self,
|
549
|
-
data: ad.AnnData |
|
893
|
+
data: ad.AnnData | Artifact,
|
550
894
|
var_index: FieldAttr,
|
551
895
|
categoricals: dict[str, FieldAttr] | None = None,
|
552
896
|
obs_columns: FieldAttr = Feature.name,
|
553
|
-
using_key: str | None = None,
|
554
897
|
verbosity: str = "hint",
|
555
898
|
organism: str | None = None,
|
556
899
|
sources: dict[str, Record] | None = None,
|
557
900
|
exclude: dict | None = None,
|
558
901
|
) -> None:
|
559
|
-
from lamindb_setup.core import upath
|
560
|
-
|
561
902
|
if isinstance(var_index, str):
|
562
903
|
raise TypeError("var_index parameter has to be a bionty field")
|
563
904
|
|
564
|
-
from .._artifact import data_is_anndata
|
565
|
-
|
566
905
|
if sources is None:
|
567
906
|
sources = {}
|
568
907
|
if not data_is_anndata(data):
|
569
|
-
raise TypeError(
|
570
|
-
"data has to be an AnnData object or a path to AnnData-like"
|
571
|
-
)
|
572
|
-
if isinstance(data, ad.AnnData):
|
573
|
-
self._adata = data
|
574
|
-
else: # pragma: no cover
|
575
|
-
from lamindb.core.storage._backed_access import backed_access
|
576
|
-
|
577
|
-
self._adata = backed_access(upath.create_path(data))
|
908
|
+
raise TypeError("data has to be an AnnData object")
|
578
909
|
|
579
910
|
if "symbol" in str(var_index):
|
580
911
|
logger.warning(
|
581
912
|
"indexing datasets with gene symbols can be problematic: https://docs.lamin.ai/faq/symbol-mapping"
|
582
913
|
)
|
583
914
|
|
584
|
-
self.
|
915
|
+
self._obs_fields = categoricals or {}
|
585
916
|
self._var_field = var_index
|
586
917
|
super().__init__(
|
587
|
-
|
918
|
+
dataset=data,
|
588
919
|
categoricals=categoricals,
|
920
|
+
sources=sources,
|
921
|
+
organism=organism,
|
922
|
+
exclude=exclude,
|
923
|
+
columns_field=var_index,
|
924
|
+
)
|
925
|
+
self._adata = self._dataset
|
926
|
+
self._obs_df_curator = DataFrameCatManager(
|
927
|
+
df=self._adata.obs,
|
928
|
+
categoricals=self.categoricals,
|
589
929
|
columns=obs_columns,
|
590
|
-
using_key=using_key,
|
591
930
|
verbosity=verbosity,
|
592
|
-
organism=
|
931
|
+
organism=None,
|
593
932
|
sources=sources,
|
594
933
|
exclude=exclude,
|
595
|
-
check_valid_keys=False,
|
596
934
|
)
|
597
|
-
self._obs_fields = categoricals or {}
|
598
|
-
self._check_valid_keys(extra={"var_index"})
|
599
935
|
|
600
936
|
@property
|
601
937
|
def var_index(self) -> FieldAttr:
|
@@ -607,54 +943,53 @@ class AnnDataCurator(DataFrameCurator):
|
|
607
943
|
"""Return the obs fields to validate against."""
|
608
944
|
return self._obs_fields
|
609
945
|
|
610
|
-
def lookup(
|
611
|
-
self, using_key: str | None = None, public: bool = False
|
612
|
-
) -> CurateLookup:
|
946
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
613
947
|
"""Lookup categories.
|
614
948
|
|
615
949
|
Args:
|
616
|
-
|
617
|
-
if "public", the lookup is performed on the public reference.
|
950
|
+
public: If "public", the lookup is performed on the public reference.
|
618
951
|
"""
|
619
952
|
return CurateLookup(
|
620
953
|
categoricals=self._obs_fields,
|
621
954
|
slots={"columns": self._columns_field, "var_index": self._var_field},
|
622
|
-
using_key=using_key or self._using_key,
|
623
955
|
public=public,
|
624
956
|
)
|
625
957
|
|
626
958
|
def _save_from_var_index(
|
627
|
-
self,
|
959
|
+
self,
|
960
|
+
validated_only: bool = True,
|
628
961
|
):
|
629
962
|
"""Save variable records."""
|
630
963
|
update_registry(
|
631
964
|
values=list(self._adata.var.index),
|
632
965
|
field=self.var_index,
|
633
966
|
key="var_index",
|
634
|
-
using_key=self._using_key,
|
635
967
|
validated_only=validated_only,
|
636
|
-
organism=
|
968
|
+
organism=self._organism,
|
637
969
|
source=self._sources.get("var_index"),
|
638
970
|
exclude=self._exclude.get("var_index"),
|
639
971
|
)
|
640
972
|
|
641
|
-
def
|
642
|
-
"""
|
643
|
-
self._save_from_var_index(validated_only=validated_only, **self._kwargs)
|
644
|
-
for name in self._obs_fields.keys():
|
645
|
-
self._update_registry(name, validated_only=validated_only, **self._kwargs)
|
973
|
+
def add_new_from(self, key: str, **kwargs):
|
974
|
+
"""Add validated & new categories.
|
646
975
|
|
647
|
-
|
976
|
+
Args:
|
977
|
+
key: The key referencing the slot in the DataFrame from which to draw terms.
|
978
|
+
organism: The organism name.
|
979
|
+
**kwargs: Additional keyword arguments to pass to create new records
|
980
|
+
"""
|
981
|
+
self._obs_df_curator.add_new_from(key, **kwargs)
|
982
|
+
|
983
|
+
def add_new_from_var_index(self, **kwargs):
|
648
984
|
"""Update variable records.
|
649
985
|
|
650
986
|
Args:
|
651
987
|
organism: The organism name.
|
652
988
|
**kwargs: Additional keyword arguments to pass to create new records.
|
653
989
|
"""
|
654
|
-
self.
|
655
|
-
self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
|
990
|
+
self._save_from_var_index(validated_only=False, **kwargs)
|
656
991
|
|
657
|
-
def validate(self
|
992
|
+
def validate(self) -> bool:
|
658
993
|
"""Validate categories.
|
659
994
|
|
660
995
|
This method also registers the validated records in the current instance.
|
@@ -665,38 +1000,25 @@ class AnnDataCurator(DataFrameCurator):
|
|
665
1000
|
Returns:
|
666
1001
|
Whether the AnnData object is validated.
|
667
1002
|
"""
|
668
|
-
self.
|
669
|
-
if self._using_key is not None and self._using_key != "default":
|
670
|
-
logger.important(
|
671
|
-
f"validating metadata using registries of instance {colors.italic(self._using_key)}"
|
672
|
-
)
|
1003
|
+
self._validate_category_error_messages = "" # reset the error messages
|
673
1004
|
|
674
1005
|
# add all validated records to the current instance
|
675
|
-
self.
|
676
|
-
|
1006
|
+
self._save_from_var_index(validated_only=True)
|
677
1007
|
validated_var, non_validated_var = validate_categories(
|
678
1008
|
self._adata.var.index,
|
679
1009
|
field=self._var_field,
|
680
1010
|
key="var_index",
|
681
|
-
using_key=self._using_key,
|
682
1011
|
source=self._sources.get("var_index"),
|
683
1012
|
hint_print=".add_new_from_var_index()",
|
684
1013
|
exclude=self._exclude.get("var_index"),
|
685
|
-
|
686
|
-
)
|
687
|
-
validated_obs, non_validated_obs = validate_categories_in_df(
|
688
|
-
self._adata.obs,
|
689
|
-
fields=self.categoricals,
|
690
|
-
using_key=self._using_key,
|
691
|
-
sources=self._sources,
|
692
|
-
exclude=self._exclude,
|
693
|
-
**self._kwargs,
|
1014
|
+
organism=self._organism, # type: ignore
|
694
1015
|
)
|
695
|
-
|
1016
|
+
validated_obs = self._obs_df_curator.validate()
|
1017
|
+
self._non_validated = self._obs_df_curator._non_validated # type: ignore
|
696
1018
|
if len(non_validated_var) > 0:
|
697
1019
|
self._non_validated["var_index"] = non_validated_var # type: ignore
|
698
|
-
self.
|
699
|
-
return self.
|
1020
|
+
self._is_validated = validated_var and validated_obs
|
1021
|
+
return self._is_validated
|
700
1022
|
|
701
1023
|
def standardize(self, key: str):
|
702
1024
|
"""Replace synonyms with standardized values.
|
@@ -709,83 +1031,35 @@ class AnnDataCurator(DataFrameCurator):
|
|
709
1031
|
|
710
1032
|
Inplace modification of the dataset.
|
711
1033
|
"""
|
1034
|
+
if self._artifact is not None:
|
1035
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
712
1036
|
if key in self._adata.obs.columns or key == "all":
|
713
1037
|
# standardize obs columns
|
714
|
-
|
1038
|
+
self._obs_df_curator.standardize(key)
|
715
1039
|
# in addition to the obs columns, standardize the var.index
|
716
1040
|
if key == "var_index" or key == "all":
|
717
1041
|
syn_mapper = standardize_categories(
|
718
1042
|
self._adata.var.index,
|
719
1043
|
field=self.var_index,
|
720
|
-
using_key=self._using_key,
|
721
1044
|
source=self._sources.get("var_index"),
|
722
|
-
|
1045
|
+
organism=self._organism,
|
723
1046
|
)
|
724
1047
|
if "var_index" in self._non_validated: # type: ignore
|
725
1048
|
self._adata.var.index = self._replace_synonyms(
|
726
1049
|
"var_index", syn_mapper, self._adata.var.index
|
727
1050
|
)
|
728
1051
|
|
729
|
-
def save_artifact(
|
730
|
-
self,
|
731
|
-
description: str | None = None,
|
732
|
-
key: str | None = None,
|
733
|
-
revises: Artifact | None = None,
|
734
|
-
run: Run | None = None,
|
735
|
-
) -> Artifact:
|
736
|
-
"""Save the validated ``AnnData`` and metadata.
|
737
|
-
|
738
|
-
Args:
|
739
|
-
description: A description of the ``AnnData`` object.
|
740
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
|
741
|
-
Artifacts with the same key form a revision family.
|
742
|
-
revises: Previous version of the artifact. Triggers a revision.
|
743
|
-
run: The run that creates the artifact.
|
744
|
-
|
745
|
-
Returns:
|
746
|
-
A saved artifact record.
|
747
|
-
"""
|
748
|
-
from lamindb.core._settings import settings
|
749
|
-
|
750
|
-
if not self._validated:
|
751
|
-
self.validate()
|
752
|
-
if not self._validated:
|
753
|
-
raise ValidationError("Dataset does not validate. Please curate.")
|
754
|
-
verbosity = settings.verbosity
|
755
|
-
try:
|
756
|
-
settings.verbosity = "warning"
|
757
|
-
self._artifact = save_artifact(
|
758
|
-
self._data,
|
759
|
-
adata=self._adata,
|
760
|
-
description=description,
|
761
|
-
columns_field=self.var_index,
|
762
|
-
fields=self.categoricals,
|
763
|
-
key=key,
|
764
|
-
revises=revises,
|
765
|
-
run=run,
|
766
|
-
**self._kwargs,
|
767
|
-
)
|
768
|
-
finally:
|
769
|
-
settings.verbosity = verbosity
|
770
|
-
return self._artifact
|
771
|
-
|
772
1052
|
|
773
|
-
class
|
1053
|
+
class MuDataCatManager(CatManager):
|
774
1054
|
"""Curation flow for a ``MuData`` object.
|
775
1055
|
|
776
|
-
See also :class:`~lamindb.Curator`.
|
777
|
-
|
778
|
-
Note that if genes or other measurements are removed from the MuData object,
|
779
|
-
the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
|
780
|
-
|
781
1056
|
Args:
|
782
1057
|
mdata: The MuData object to curate.
|
783
1058
|
var_index: The registry field for mapping the ``.var`` index for each modality.
|
784
1059
|
For example:
|
785
|
-
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2":
|
1060
|
+
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
|
786
1061
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
787
1062
|
Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
|
788
|
-
using_key: A reference LaminDB instance.
|
789
1063
|
verbosity: The verbosity level.
|
790
1064
|
organism: The organism name.
|
791
1065
|
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
@@ -799,11 +1073,11 @@ class MuDataCurator:
|
|
799
1073
|
... mdata,
|
800
1074
|
... var_index={
|
801
1075
|
... "rna": bt.Gene.ensembl_gene_id,
|
802
|
-
... "adt":
|
1076
|
+
... "adt": CellMarker.name
|
803
1077
|
... },
|
804
1078
|
... categoricals={
|
805
1079
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
806
|
-
... "donor_id":
|
1080
|
+
... "donor_id": ULabel.name
|
807
1081
|
... },
|
808
1082
|
... organism="human",
|
809
1083
|
... )
|
@@ -811,52 +1085,47 @@ class MuDataCurator:
|
|
811
1085
|
|
812
1086
|
def __init__(
|
813
1087
|
self,
|
814
|
-
mdata: MuData,
|
1088
|
+
mdata: MuData | Artifact,
|
815
1089
|
var_index: dict[str, FieldAttr],
|
816
1090
|
categoricals: dict[str, FieldAttr] | None = None,
|
817
|
-
using_key: str | None = None,
|
818
1091
|
verbosity: str = "hint",
|
819
1092
|
organism: str | None = None,
|
820
1093
|
sources: dict[str, Record] | None = None,
|
821
1094
|
exclude: dict | None = None, # {modality: {field: [values]}}
|
822
1095
|
) -> None:
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
self.
|
1096
|
+
super().__init__(
|
1097
|
+
dataset=mdata,
|
1098
|
+
categoricals={},
|
1099
|
+
sources=sources,
|
1100
|
+
organism=organism,
|
1101
|
+
exclude=exclude,
|
1102
|
+
)
|
1103
|
+
self._columns_field = var_index # this is for consistency with BaseCatManager
|
831
1104
|
self._var_fields = var_index
|
832
1105
|
self._verify_modality(self._var_fields.keys())
|
833
1106
|
self._obs_fields = self._parse_categoricals(categoricals)
|
834
1107
|
self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
|
835
|
-
self._using_key = using_key
|
836
1108
|
self._verbosity = verbosity
|
837
1109
|
self._obs_df_curator = None
|
838
1110
|
if "obs" in self._modalities:
|
839
|
-
self._obs_df_curator =
|
840
|
-
df=
|
1111
|
+
self._obs_df_curator = DataFrameCatManager(
|
1112
|
+
df=self._dataset.obs,
|
841
1113
|
columns=Feature.name,
|
842
1114
|
categoricals=self._obs_fields.get("obs", {}),
|
843
|
-
using_key=using_key,
|
844
1115
|
verbosity=verbosity,
|
845
1116
|
sources=self._sources.get("obs"),
|
846
1117
|
exclude=self._exclude.get("obs"),
|
847
|
-
|
848
|
-
**self._kwargs,
|
1118
|
+
organism=organism,
|
849
1119
|
)
|
850
1120
|
self._mod_adata_curators = {
|
851
|
-
modality:
|
852
|
-
data=
|
1121
|
+
modality: AnnDataCatManager(
|
1122
|
+
data=self._dataset[modality],
|
853
1123
|
var_index=var_index.get(modality),
|
854
1124
|
categoricals=self._obs_fields.get(modality),
|
855
|
-
using_key=using_key,
|
856
1125
|
verbosity=verbosity,
|
857
1126
|
sources=self._sources.get(modality),
|
858
1127
|
exclude=self._exclude.get(modality),
|
859
|
-
|
1128
|
+
organism=organism,
|
860
1129
|
)
|
861
1130
|
for modality in self._modalities
|
862
1131
|
if modality != "obs"
|
@@ -874,7 +1143,7 @@ class MuDataCurator:
|
|
874
1143
|
return self._obs_fields
|
875
1144
|
|
876
1145
|
@property
|
877
|
-
def non_validated(self) -> dict[str, dict[str, list[str]]]:
|
1146
|
+
def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
|
878
1147
|
"""Return the non-validated features and labels."""
|
879
1148
|
if self._non_validated is None:
|
880
1149
|
raise ValidationError("Please run validate() first!")
|
@@ -883,15 +1152,15 @@ class MuDataCurator:
|
|
883
1152
|
def _verify_modality(self, modalities: Iterable[str]):
|
884
1153
|
"""Verify the modality exists."""
|
885
1154
|
for modality in modalities:
|
886
|
-
if modality not in self.
|
1155
|
+
if modality not in self._dataset.mod.keys():
|
887
1156
|
raise ValidationError(f"modality '{modality}' does not exist!")
|
888
1157
|
|
889
1158
|
def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
|
890
1159
|
"""Parse the categorical fields."""
|
891
|
-
prefixes = {f"{k}:" for k in self.
|
1160
|
+
prefixes = {f"{k}:" for k in self._dataset.mod.keys()}
|
892
1161
|
obs_fields: dict[str, dict[str, FieldAttr]] = {}
|
893
1162
|
for k, v in categoricals.items():
|
894
|
-
if k not in self.
|
1163
|
+
if k not in self._dataset.obs.columns:
|
895
1164
|
raise ValidationError(f"column '{k}' does not exist in mdata.obs!")
|
896
1165
|
if any(k.startswith(prefix) for prefix in prefixes):
|
897
1166
|
modality, col = k.split(":")[0], k.split(":")[1]
|
@@ -904,14 +1173,11 @@ class MuDataCurator:
|
|
904
1173
|
obs_fields["obs"][k] = v
|
905
1174
|
return obs_fields
|
906
1175
|
|
907
|
-
def lookup(
|
908
|
-
self, using_key: str | None = None, public: bool = False
|
909
|
-
) -> CurateLookup:
|
1176
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
910
1177
|
"""Lookup categories.
|
911
1178
|
|
912
1179
|
Args:
|
913
|
-
|
914
|
-
if "public", the lookup is performed on the public reference.
|
1180
|
+
public: Perform lookup on public source ontologies.
|
915
1181
|
"""
|
916
1182
|
obs_fields = {}
|
917
1183
|
for mod, fields in self._obs_fields.items():
|
@@ -925,27 +1191,19 @@ class MuDataCurator:
|
|
925
1191
|
slots={
|
926
1192
|
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
927
1193
|
},
|
928
|
-
using_key=using_key or self._using_key,
|
929
1194
|
public=public,
|
930
1195
|
)
|
931
1196
|
|
1197
|
+
@deprecated(new_name="is run by default")
|
932
1198
|
def add_new_from_columns(
|
933
1199
|
self,
|
934
1200
|
modality: str,
|
935
1201
|
column_names: list[str] | None = None,
|
936
|
-
organism: str | None = None,
|
937
1202
|
**kwargs,
|
938
1203
|
):
|
939
|
-
|
940
|
-
warnings.warn(
|
941
|
-
"`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
|
942
|
-
DeprecationWarning,
|
943
|
-
stacklevel=2,
|
944
|
-
)
|
1204
|
+
pass
|
945
1205
|
|
946
|
-
def add_new_from_var_index(
|
947
|
-
self, modality: str, organism: str | None = None, **kwargs
|
948
|
-
):
|
1206
|
+
def add_new_from_var_index(self, modality: str, **kwargs):
|
949
1207
|
"""Update variable records.
|
950
1208
|
|
951
1209
|
Args:
|
@@ -953,25 +1211,19 @@ class MuDataCurator:
|
|
953
1211
|
organism: The organism name.
|
954
1212
|
**kwargs: Additional keyword arguments to pass to create new records.
|
955
1213
|
"""
|
956
|
-
self.
|
957
|
-
self._mod_adata_curators[modality].add_new_from_var_index(
|
958
|
-
**self._kwargs, **kwargs
|
959
|
-
)
|
1214
|
+
self._mod_adata_curators[modality].add_new_from_var_index(**kwargs)
|
960
1215
|
|
961
1216
|
def _update_registry_all(self):
|
962
1217
|
"""Update all registries."""
|
963
1218
|
if self._obs_df_curator is not None:
|
964
|
-
self._obs_df_curator._update_registry_all(
|
965
|
-
validated_only=True, **self._kwargs
|
966
|
-
)
|
1219
|
+
self._obs_df_curator._update_registry_all(validated_only=True)
|
967
1220
|
for _, adata_curator in self._mod_adata_curators.items():
|
968
|
-
adata_curator._update_registry_all(validated_only=True
|
1221
|
+
adata_curator._obs_df_curator._update_registry_all(validated_only=True)
|
969
1222
|
|
970
1223
|
def add_new_from(
|
971
1224
|
self,
|
972
1225
|
key: str,
|
973
1226
|
modality: str | None = None,
|
974
|
-
organism: str | None = None,
|
975
1227
|
**kwargs,
|
976
1228
|
):
|
977
1229
|
"""Add validated & new categories.
|
@@ -984,24 +1236,17 @@ class MuDataCurator:
|
|
984
1236
|
"""
|
985
1237
|
if len(kwargs) > 0 and key == "all":
|
986
1238
|
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
987
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
988
1239
|
modality = modality or "obs"
|
989
1240
|
if modality in self._mod_adata_curators:
|
990
1241
|
adata_curator = self._mod_adata_curators[modality]
|
991
|
-
adata_curator.add_new_from(key=key, **
|
1242
|
+
adata_curator.add_new_from(key=key, **kwargs)
|
992
1243
|
if modality == "obs":
|
993
|
-
self._obs_df_curator.add_new_from(key=key, **
|
1244
|
+
self._obs_df_curator.add_new_from(key=key, **kwargs)
|
994
1245
|
|
995
|
-
def validate(self
|
1246
|
+
def validate(self) -> bool:
|
996
1247
|
"""Validate categories."""
|
997
1248
|
from lamindb.core._settings import settings
|
998
1249
|
|
999
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
1000
|
-
if self._using_key is not None and self._using_key != "default":
|
1001
|
-
logger.important(
|
1002
|
-
f"validating using registries of instance {colors.italic(self._using_key)}"
|
1003
|
-
)
|
1004
|
-
|
1005
1250
|
# add all validated records to the current instance
|
1006
1251
|
verbosity = settings.verbosity
|
1007
1252
|
try:
|
@@ -1015,20 +1260,20 @@ class MuDataCurator:
|
|
1015
1260
|
obs_validated = True
|
1016
1261
|
if "obs" in self._modalities:
|
1017
1262
|
logger.info('validating categoricals in "obs"...')
|
1018
|
-
obs_validated &= self._obs_df_curator.validate(
|
1263
|
+
obs_validated &= self._obs_df_curator.validate()
|
1019
1264
|
self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore
|
1020
1265
|
logger.print("")
|
1021
1266
|
|
1022
1267
|
mods_validated = True
|
1023
1268
|
for modality, adata_curator in self._mod_adata_curators.items():
|
1024
1269
|
logger.info(f'validating categoricals in modality "{modality}"...')
|
1025
|
-
mods_validated &= adata_curator.validate(
|
1270
|
+
mods_validated &= adata_curator.validate()
|
1026
1271
|
if len(adata_curator.non_validated) > 0:
|
1027
1272
|
self._non_validated[modality] = adata_curator.non_validated # type: ignore
|
1028
1273
|
logger.print("")
|
1029
1274
|
|
1030
|
-
self.
|
1031
|
-
return self.
|
1275
|
+
self._is_validated = obs_validated & mods_validated
|
1276
|
+
return self._is_validated
|
1032
1277
|
|
1033
1278
|
def standardize(self, key: str, modality: str | None = None):
|
1034
1279
|
"""Replace synonyms with standardized values.
|
@@ -1039,6 +1284,8 @@ class MuDataCurator:
|
|
1039
1284
|
|
1040
1285
|
Inplace modification of the dataset.
|
1041
1286
|
"""
|
1287
|
+
if self._artifact is not None:
|
1288
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
1042
1289
|
modality = modality or "obs"
|
1043
1290
|
if modality in self._mod_adata_curators:
|
1044
1291
|
adata_curator = self._mod_adata_curators[modality]
|
@@ -1046,47 +1293,6 @@ class MuDataCurator:
|
|
1046
1293
|
if modality == "obs":
|
1047
1294
|
self._obs_df_curator.standardize(key=key)
|
1048
1295
|
|
1049
|
-
def save_artifact(
|
1050
|
-
self,
|
1051
|
-
description: str | None = None,
|
1052
|
-
key: str | None = None,
|
1053
|
-
revises: Artifact | None = None,
|
1054
|
-
run: Run | None = None,
|
1055
|
-
) -> Artifact:
|
1056
|
-
"""Save the validated ``MuData`` and metadata.
|
1057
|
-
|
1058
|
-
Args:
|
1059
|
-
description: A description of the ``MuData`` object.
|
1060
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
1061
|
-
revises: Previous version of the artifact. Triggers a revision.
|
1062
|
-
run: The run that creates the artifact.
|
1063
|
-
|
1064
|
-
Returns:
|
1065
|
-
A saved artifact record.
|
1066
|
-
"""
|
1067
|
-
from lamindb.core._settings import settings
|
1068
|
-
|
1069
|
-
if not self._validated:
|
1070
|
-
self.validate()
|
1071
|
-
if not self._validated:
|
1072
|
-
raise ValidationError("Dataset does not validate. Please curate.")
|
1073
|
-
verbosity = settings.verbosity
|
1074
|
-
try:
|
1075
|
-
settings.verbosity = "warning"
|
1076
|
-
self._artifact = save_artifact(
|
1077
|
-
self._mdata,
|
1078
|
-
description=description,
|
1079
|
-
columns_field=self.var_index,
|
1080
|
-
fields=self.categoricals,
|
1081
|
-
key=key,
|
1082
|
-
revises=revises,
|
1083
|
-
run=run,
|
1084
|
-
**self._kwargs,
|
1085
|
-
)
|
1086
|
-
finally:
|
1087
|
-
settings.verbosity = verbosity
|
1088
|
-
return self._artifact
|
1089
|
-
|
1090
1296
|
|
1091
1297
|
def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
1092
1298
|
if (n := len(nonval_keys)) > 0:
|
@@ -1097,10 +1303,8 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
|
1097
1303
|
)
|
1098
1304
|
|
1099
1305
|
|
1100
|
-
class
|
1101
|
-
"""Curation flow for
|
1102
|
-
|
1103
|
-
See also :class:`~lamindb.Curator`.
|
1306
|
+
class TiledbsomaCatManager(CatManager):
|
1307
|
+
"""Curation flow for `tiledbsoma.Experiment`.
|
1104
1308
|
|
1105
1309
|
Args:
|
1106
1310
|
experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
|
@@ -1123,7 +1327,7 @@ class SOMACurator(BaseCurator):
|
|
1123
1327
|
... var_index={"RNA": ("var_id", bt.Gene.symbol)},
|
1124
1328
|
... categoricals={
|
1125
1329
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
1126
|
-
... "donor_id":
|
1330
|
+
... "donor_id": ULabel.name
|
1127
1331
|
... },
|
1128
1332
|
... organism="human",
|
1129
1333
|
... )
|
@@ -1138,23 +1342,21 @@ class SOMACurator(BaseCurator):
|
|
1138
1342
|
organism: str | None = None,
|
1139
1343
|
sources: dict[str, Record] | None = None,
|
1140
1344
|
exclude: dict[str, str | list[str]] | None = None,
|
1141
|
-
using_key: str | None = None,
|
1142
1345
|
):
|
1143
1346
|
self._obs_fields = categoricals or {}
|
1144
1347
|
self._var_fields = var_index
|
1145
1348
|
self._columns_field = obs_columns
|
1146
1349
|
if isinstance(experiment_uri, Artifact):
|
1147
|
-
self.
|
1350
|
+
self._dataset = experiment_uri.path
|
1148
1351
|
self._artifact = experiment_uri
|
1149
1352
|
else:
|
1150
|
-
self.
|
1353
|
+
self._dataset = UPath(experiment_uri)
|
1151
1354
|
self._artifact = None
|
1152
1355
|
self._organism = organism
|
1153
|
-
self._using_key = using_key
|
1154
1356
|
self._sources = sources or {}
|
1155
1357
|
self._exclude = exclude or {}
|
1156
1358
|
|
1157
|
-
self.
|
1359
|
+
self._is_validated: bool | None = False
|
1158
1360
|
self._non_validated_values: dict[str, list] | None = None
|
1159
1361
|
self._validated_values: dict[str, list] = {}
|
1160
1362
|
# filled by _check_save_keys
|
@@ -1172,7 +1374,7 @@ class SOMACurator(BaseCurator):
|
|
1172
1374
|
def _check_save_keys(self):
|
1173
1375
|
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1174
1376
|
|
1175
|
-
with _open_tiledbsoma(self.
|
1377
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1176
1378
|
experiment_obs = experiment.obs
|
1177
1379
|
self._n_obs = len(experiment_obs)
|
1178
1380
|
self._obs_pa_schema = experiment_obs.schema
|
@@ -1228,7 +1430,6 @@ class SOMACurator(BaseCurator):
|
|
1228
1430
|
values=register_columns,
|
1229
1431
|
field=self._columns_field,
|
1230
1432
|
key="columns",
|
1231
|
-
using_key=self._using_key,
|
1232
1433
|
validated_only=False,
|
1233
1434
|
organism=organism,
|
1234
1435
|
source=self._sources.get("columns"),
|
@@ -1244,7 +1445,6 @@ class SOMACurator(BaseCurator):
|
|
1244
1445
|
values=additional_columns,
|
1245
1446
|
field=self._columns_field,
|
1246
1447
|
key="columns",
|
1247
|
-
using_key=self._using_key,
|
1248
1448
|
validated_only=True,
|
1249
1449
|
organism=organism,
|
1250
1450
|
source=self._sources.get("columns"),
|
@@ -1257,7 +1457,7 @@ class SOMACurator(BaseCurator):
|
|
1257
1457
|
|
1258
1458
|
validated = True
|
1259
1459
|
self._non_validated_values = {}
|
1260
|
-
with _open_tiledbsoma(self.
|
1460
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1261
1461
|
for ms, (key, field) in self._var_fields.items():
|
1262
1462
|
var_ms = experiment.ms[ms].var
|
1263
1463
|
var_ms_key = f"{ms}__{key}"
|
@@ -1274,7 +1474,6 @@ class SOMACurator(BaseCurator):
|
|
1274
1474
|
values=var_ms_values,
|
1275
1475
|
field=field,
|
1276
1476
|
key=var_ms_key,
|
1277
|
-
using_key=self._using_key,
|
1278
1477
|
validated_only=True,
|
1279
1478
|
organism=organism,
|
1280
1479
|
source=self._sources.get(var_ms_key),
|
@@ -1284,7 +1483,6 @@ class SOMACurator(BaseCurator):
|
|
1284
1483
|
values=var_ms_values,
|
1285
1484
|
field=field,
|
1286
1485
|
key=var_ms_key,
|
1287
|
-
using_key=self._using_key,
|
1288
1486
|
organism=organism,
|
1289
1487
|
source=self._sources.get(var_ms_key),
|
1290
1488
|
exclude=self._exclude.get(var_ms_key),
|
@@ -1310,7 +1508,6 @@ class SOMACurator(BaseCurator):
|
|
1310
1508
|
values=values,
|
1311
1509
|
field=field,
|
1312
1510
|
key=key,
|
1313
|
-
using_key=self._using_key,
|
1314
1511
|
validated_only=True,
|
1315
1512
|
organism=organism,
|
1316
1513
|
source=self._sources.get(key),
|
@@ -1320,7 +1517,6 @@ class SOMACurator(BaseCurator):
|
|
1320
1517
|
values=values,
|
1321
1518
|
field=field,
|
1322
1519
|
key=key,
|
1323
|
-
using_key=self._using_key,
|
1324
1520
|
organism=organism,
|
1325
1521
|
source=self._sources.get(key),
|
1326
1522
|
exclude=self._exclude.get(key),
|
@@ -1330,8 +1526,8 @@ class SOMACurator(BaseCurator):
|
|
1330
1526
|
self._non_validated_values[key] = non_val
|
1331
1527
|
else:
|
1332
1528
|
self._validated_values[key] = values
|
1333
|
-
self.
|
1334
|
-
return self.
|
1529
|
+
self._is_validated = validated
|
1530
|
+
return self._is_validated
|
1335
1531
|
|
1336
1532
|
def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
|
1337
1533
|
assert self._non_validated_values is not None # noqa: S101
|
@@ -1346,7 +1542,7 @@ class SOMACurator(BaseCurator):
|
|
1346
1542
|
values = self._non_validated_values.get(key, [])
|
1347
1543
|
return values, field
|
1348
1544
|
|
1349
|
-
def add_new_from(self, key: str) -> None:
|
1545
|
+
def add_new_from(self, key: str, **kwargs) -> None:
|
1350
1546
|
"""Add validated & new categories.
|
1351
1547
|
|
1352
1548
|
Args:
|
@@ -1378,11 +1574,11 @@ class SOMACurator(BaseCurator):
|
|
1378
1574
|
values=values,
|
1379
1575
|
field=field,
|
1380
1576
|
key=k,
|
1381
|
-
using_key=self._using_key,
|
1382
1577
|
validated_only=False,
|
1383
1578
|
organism=organism,
|
1384
1579
|
source=self._sources.get(k),
|
1385
1580
|
exclude=self._exclude.get(k),
|
1581
|
+
**kwargs,
|
1386
1582
|
)
|
1387
1583
|
# update non-validated values list but keep the key there
|
1388
1584
|
# it will be removed by .validate()
|
@@ -1405,19 +1601,15 @@ class SOMACurator(BaseCurator):
|
|
1405
1601
|
"""Return the obs fields to validate against."""
|
1406
1602
|
return self._obs_fields
|
1407
1603
|
|
1408
|
-
def lookup(
|
1409
|
-
self, using_key: str | None = None, public: bool = False
|
1410
|
-
) -> CurateLookup:
|
1604
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
1411
1605
|
"""Lookup categories.
|
1412
1606
|
|
1413
1607
|
Args:
|
1414
|
-
|
1415
|
-
if "public", the lookup is performed on the public reference.
|
1608
|
+
public: If "public", the lookup is performed on the public reference.
|
1416
1609
|
"""
|
1417
1610
|
return CurateLookup(
|
1418
1611
|
categoricals=self._obs_fields,
|
1419
1612
|
slots={"columns": self._columns_field, **self._var_fields_flat},
|
1420
|
-
using_key=using_key or self._using_key,
|
1421
1613
|
public=public,
|
1422
1614
|
)
|
1423
1615
|
|
@@ -1462,7 +1654,6 @@ class SOMACurator(BaseCurator):
|
|
1462
1654
|
syn_mapper = standardize_categories(
|
1463
1655
|
values=values,
|
1464
1656
|
field=field,
|
1465
|
-
using_key=self._using_key,
|
1466
1657
|
source=self._sources.get(k),
|
1467
1658
|
organism=organism,
|
1468
1659
|
)
|
@@ -1471,7 +1662,7 @@ class SOMACurator(BaseCurator):
|
|
1471
1662
|
|
1472
1663
|
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1473
1664
|
|
1474
|
-
with _open_tiledbsoma(self.
|
1665
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1475
1666
|
value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
|
1476
1667
|
table = slot(experiment).read(value_filter=value_filter).concat()
|
1477
1668
|
|
@@ -1484,7 +1675,7 @@ class SOMACurator(BaseCurator):
|
|
1484
1675
|
lambda val: syn_mapper.get(val, val) # noqa
|
1485
1676
|
)
|
1486
1677
|
# write the mapped values
|
1487
|
-
with _open_tiledbsoma(self.
|
1678
|
+
with _open_tiledbsoma(self._dataset, mode="w") as experiment:
|
1488
1679
|
slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
|
1489
1680
|
# update non_validated dict
|
1490
1681
|
non_val_k = [
|
@@ -1502,8 +1693,9 @@ class SOMACurator(BaseCurator):
|
|
1502
1693
|
|
1503
1694
|
def save_artifact(
|
1504
1695
|
self,
|
1505
|
-
|
1696
|
+
*,
|
1506
1697
|
key: str | None = None,
|
1698
|
+
description: str | None = None,
|
1507
1699
|
revises: Artifact | None = None,
|
1508
1700
|
run: Run | None = None,
|
1509
1701
|
) -> Artifact:
|
@@ -1512,7 +1704,7 @@ class SOMACurator(BaseCurator):
|
|
1512
1704
|
Args:
|
1513
1705
|
description: A description of the ``tiledbsoma`` store.
|
1514
1706
|
key: A path-like key to reference artifact in default storage,
|
1515
|
-
e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a
|
1707
|
+
e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
|
1516
1708
|
revises: Previous version of the artifact. Triggers a revision.
|
1517
1709
|
run: The run that creates the artifact.
|
1518
1710
|
|
@@ -1521,14 +1713,14 @@ class SOMACurator(BaseCurator):
|
|
1521
1713
|
"""
|
1522
1714
|
from lamindb.core._data import add_labels
|
1523
1715
|
|
1524
|
-
if not self.
|
1716
|
+
if not self._is_validated:
|
1525
1717
|
self.validate()
|
1526
|
-
if not self.
|
1718
|
+
if not self._is_validated:
|
1527
1719
|
raise ValidationError("Dataset does not validate. Please curate.")
|
1528
1720
|
|
1529
1721
|
if self._artifact is None:
|
1530
1722
|
artifact = Artifact(
|
1531
|
-
self.
|
1723
|
+
self._dataset,
|
1532
1724
|
description=description,
|
1533
1725
|
key=key,
|
1534
1726
|
revises=revises,
|
@@ -1540,7 +1732,7 @@ class SOMACurator(BaseCurator):
|
|
1540
1732
|
else:
|
1541
1733
|
artifact = self._artifact
|
1542
1734
|
|
1543
|
-
|
1735
|
+
feature_sets = {}
|
1544
1736
|
if len(self._obs_fields) > 0:
|
1545
1737
|
organism = check_registry_organism(
|
1546
1738
|
self._columns_field.field.model, self._organism
|
@@ -1550,7 +1742,7 @@ class SOMACurator(BaseCurator):
|
|
1550
1742
|
empty_dict, schema=self._obs_pa_schema
|
1551
1743
|
).to_pandas()
|
1552
1744
|
# in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
|
1553
|
-
|
1745
|
+
feature_sets["obs"] = Schema.from_df(
|
1554
1746
|
df=mock_df,
|
1555
1747
|
field=self._columns_field,
|
1556
1748
|
mute=True,
|
@@ -1561,238 +1753,1370 @@ class SOMACurator(BaseCurator):
|
|
1561
1753
|
organism = check_registry_organism(
|
1562
1754
|
var_field.field.model, self._organism
|
1563
1755
|
).get("organism")
|
1564
|
-
|
1756
|
+
feature_sets[f"{ms}__var"] = Schema.from_values(
|
1565
1757
|
values=self._validated_values[f"{ms}__{var_key}"],
|
1566
1758
|
field=var_field,
|
1567
1759
|
organism=organism,
|
1568
1760
|
raise_validation_error=False,
|
1569
1761
|
)
|
1570
|
-
artifact.
|
1762
|
+
artifact._staged_feature_sets = feature_sets
|
1763
|
+
|
1764
|
+
feature_ref_is_name = _ref_is_name(self._columns_field)
|
1765
|
+
features = Feature.lookup().dict()
|
1766
|
+
for key, field in self._obs_fields.items():
|
1767
|
+
feature = features.get(key)
|
1768
|
+
registry = field.field.model
|
1769
|
+
organism = check_registry_organism(field.field.model, self._organism).get(
|
1770
|
+
"organism"
|
1771
|
+
)
|
1772
|
+
labels = registry.from_values(
|
1773
|
+
values=self._validated_values[key], field=field, organism=organism
|
1774
|
+
)
|
1775
|
+
if len(labels) == 0:
|
1776
|
+
continue
|
1777
|
+
if hasattr(registry, "_name_field"):
|
1778
|
+
label_ref_is_name = field.field.name == registry._name_field
|
1779
|
+
add_labels(
|
1780
|
+
artifact,
|
1781
|
+
records=labels,
|
1782
|
+
feature=feature,
|
1783
|
+
feature_ref_is_name=feature_ref_is_name,
|
1784
|
+
label_ref_is_name=label_ref_is_name,
|
1785
|
+
from_curator=True,
|
1786
|
+
)
|
1787
|
+
|
1788
|
+
return artifact.save()
|
1789
|
+
|
1790
|
+
|
1791
|
+
class SpatialDataCatManager(CatManager):
|
1792
|
+
"""Curation flow for a ``Spatialdata`` object.
|
1793
|
+
|
1794
|
+
See also :class:`~lamindb.Curator`.
|
1795
|
+
|
1796
|
+
Note that if genes or other measurements are removed from the SpatialData object,
|
1797
|
+
the object should be recreated.
|
1798
|
+
|
1799
|
+
In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
|
1800
|
+
|
1801
|
+
Args:
|
1802
|
+
sdata: The SpatialData object to curate.
|
1803
|
+
var_index: A dictionary mapping table keys to the ``.var`` indices.
|
1804
|
+
categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
|
1805
|
+
|
1806
|
+
organism: The organism name.
|
1807
|
+
sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
|
1808
|
+
exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
|
1809
|
+
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1810
|
+
using the exclude parameter ensures they are not validated.
|
1811
|
+
verbosity: The verbosity level of the logger.
|
1812
|
+
sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
|
1813
|
+
|
1814
|
+
Examples:
|
1815
|
+
>>> import bionty as bt
|
1816
|
+
>>> curator = SpatialDataCatManager(
|
1817
|
+
... sdata,
|
1818
|
+
... var_index={
|
1819
|
+
... "table_1": bt.Gene.ensembl_gene_id,
|
1820
|
+
... },
|
1821
|
+
... categoricals={
|
1822
|
+
... "table1":
|
1823
|
+
... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
|
1824
|
+
... "sample":
|
1825
|
+
... {"experimental_factor": bt.ExperimentalFactor.name},
|
1826
|
+
... },
|
1827
|
+
... organism="human",
|
1828
|
+
... )
|
1829
|
+
"""
|
1830
|
+
|
1831
|
+
def __init__(
|
1832
|
+
self,
|
1833
|
+
sdata: Any,
|
1834
|
+
var_index: dict[str, FieldAttr],
|
1835
|
+
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
1836
|
+
verbosity: str = "hint",
|
1837
|
+
organism: str | None = None,
|
1838
|
+
sources: dict[str, dict[str, Record]] | None = None,
|
1839
|
+
exclude: dict[str, dict] | None = None,
|
1840
|
+
*,
|
1841
|
+
sample_metadata_key: str | None = "sample",
|
1842
|
+
) -> None:
|
1843
|
+
super().__init__(
|
1844
|
+
dataset=sdata,
|
1845
|
+
categoricals={},
|
1846
|
+
sources=sources,
|
1847
|
+
organism=organism,
|
1848
|
+
exclude=exclude,
|
1849
|
+
)
|
1850
|
+
if isinstance(sdata, Artifact):
|
1851
|
+
# TODO: load() doesn't yet work
|
1852
|
+
self._sdata = sdata.load()
|
1853
|
+
else:
|
1854
|
+
self._sdata = self._dataset
|
1855
|
+
self._sample_metadata_key = sample_metadata_key
|
1856
|
+
self._var_fields = var_index
|
1857
|
+
self._verify_accessor_exists(self._var_fields.keys())
|
1858
|
+
self._categoricals = categoricals
|
1859
|
+
self._table_keys = set(self._var_fields.keys()) | set(
|
1860
|
+
self._categoricals.keys() - {self._sample_metadata_key}
|
1861
|
+
)
|
1862
|
+
self._verbosity = verbosity
|
1863
|
+
self._sample_df_curator = None
|
1864
|
+
if self._sample_metadata_key is not None:
|
1865
|
+
self._sample_metadata = self._sdata.get_attrs(
|
1866
|
+
key=self._sample_metadata_key, return_as="df", flatten=True
|
1867
|
+
)
|
1868
|
+
self._is_validated = False
|
1869
|
+
|
1870
|
+
# Check validity of keys in categoricals
|
1871
|
+
nonval_keys = []
|
1872
|
+
for accessor, accessor_categoricals in self._categoricals.items():
|
1873
|
+
if (
|
1874
|
+
accessor == self._sample_metadata_key
|
1875
|
+
and self._sample_metadata is not None
|
1876
|
+
):
|
1877
|
+
for key in accessor_categoricals.keys():
|
1878
|
+
if key not in self._sample_metadata.columns:
|
1879
|
+
nonval_keys.append(key)
|
1880
|
+
else:
|
1881
|
+
for key in accessor_categoricals.keys():
|
1882
|
+
if key not in self._sdata[accessor].obs.columns:
|
1883
|
+
nonval_keys.append(key)
|
1884
|
+
|
1885
|
+
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1886
|
+
|
1887
|
+
# check validity of keys in sources and exclude
|
1888
|
+
for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
|
1889
|
+
nonval_keys = []
|
1890
|
+
for accessor, accessor_sources in dct.items():
|
1891
|
+
if (
|
1892
|
+
accessor == self._sample_metadata_key
|
1893
|
+
and self._sample_metadata is not None
|
1894
|
+
):
|
1895
|
+
columns = self._sample_metadata.columns
|
1896
|
+
elif accessor != self._sample_metadata_key:
|
1897
|
+
columns = self._sdata[accessor].obs.columns
|
1898
|
+
else:
|
1899
|
+
continue
|
1900
|
+
for key in accessor_sources:
|
1901
|
+
if key not in columns:
|
1902
|
+
nonval_keys.append(key)
|
1903
|
+
_maybe_curation_keys_not_present(nonval_keys, name)
|
1904
|
+
|
1905
|
+
# Set up sample level metadata and table Curator objects
|
1906
|
+
if (
|
1907
|
+
self._sample_metadata_key is not None
|
1908
|
+
and self._sample_metadata_key in self._categoricals
|
1909
|
+
):
|
1910
|
+
self._sample_df_curator = DataFrameCatManager(
|
1911
|
+
df=self._sample_metadata,
|
1912
|
+
columns=Feature.name,
|
1913
|
+
categoricals=self._categoricals.get(self._sample_metadata_key, {}),
|
1914
|
+
verbosity=verbosity,
|
1915
|
+
sources=self._sources.get(self._sample_metadata_key),
|
1916
|
+
exclude=self._exclude.get(self._sample_metadata_key),
|
1917
|
+
organism=organism,
|
1918
|
+
)
|
1919
|
+
self._table_adata_curators = {
|
1920
|
+
table: AnnDataCatManager(
|
1921
|
+
data=self._sdata[table],
|
1922
|
+
var_index=var_index.get(table),
|
1923
|
+
categoricals=self._categoricals.get(table),
|
1924
|
+
verbosity=verbosity,
|
1925
|
+
sources=self._sources.get(table),
|
1926
|
+
exclude=self._exclude.get(table),
|
1927
|
+
organism=organism,
|
1928
|
+
)
|
1929
|
+
for table in self._table_keys
|
1930
|
+
}
|
1931
|
+
|
1932
|
+
self._non_validated = None
|
1933
|
+
|
1934
|
+
@property
|
1935
|
+
def var_index(self) -> FieldAttr:
|
1936
|
+
"""Return the registry fields to validate variables indices against."""
|
1937
|
+
return self._var_fields
|
1938
|
+
|
1939
|
+
@property
|
1940
|
+
def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
|
1941
|
+
"""Return the categorical keys and fields to validate against."""
|
1942
|
+
return self._categoricals
|
1943
|
+
|
1944
|
+
@property
|
1945
|
+
def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
|
1946
|
+
"""Return the non-validated features and labels."""
|
1947
|
+
if self._non_validated is None:
|
1948
|
+
raise ValidationError("Please run validate() first!")
|
1949
|
+
return self._non_validated
|
1950
|
+
|
1951
|
+
def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
|
1952
|
+
"""Verify that the accessors exist (either a valid table or in attrs)."""
|
1953
|
+
for acc in accessors:
|
1954
|
+
is_present = False
|
1955
|
+
try:
|
1956
|
+
self._sdata.get_attrs(key=acc)
|
1957
|
+
is_present = True
|
1958
|
+
except KeyError:
|
1959
|
+
if acc in self._sdata.tables.keys():
|
1960
|
+
is_present = True
|
1961
|
+
if not is_present:
|
1962
|
+
raise ValidationError(f"Accessor '{acc}' does not exist!")
|
1963
|
+
|
1964
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
1965
|
+
"""Look up categories.
|
1966
|
+
|
1967
|
+
Args:
|
1968
|
+
public: Whether the lookup is performed on the public reference.
|
1969
|
+
"""
|
1970
|
+
cat_values_dict = list(self.categoricals.values())[0]
|
1971
|
+
return CurateLookup(
|
1972
|
+
categoricals=cat_values_dict,
|
1973
|
+
slots={"accessors": cat_values_dict.keys()},
|
1974
|
+
public=public,
|
1975
|
+
)
|
1976
|
+
|
1977
|
+
def _update_registry_all(self) -> None:
|
1978
|
+
"""Saves labels of all features for sample and table metadata."""
|
1979
|
+
if self._sample_df_curator is not None:
|
1980
|
+
self._sample_df_curator._update_registry_all(
|
1981
|
+
validated_only=True,
|
1982
|
+
)
|
1983
|
+
for _, adata_curator in self._table_adata_curators.items():
|
1984
|
+
adata_curator._obs_df_curator._update_registry_all(
|
1985
|
+
validated_only=True,
|
1986
|
+
)
|
1987
|
+
|
1988
|
+
def add_new_from_var_index(self, table: str, **kwargs) -> None:
|
1989
|
+
"""Save new values from ``.var.index`` of table.
|
1990
|
+
|
1991
|
+
Args:
|
1992
|
+
table: The table key.
|
1993
|
+
organism: The organism name.
|
1994
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
1995
|
+
"""
|
1996
|
+
if self._non_validated is None:
|
1997
|
+
raise ValidationError("Run .validate() first.")
|
1998
|
+
self._table_adata_curators[table].add_new_from_var_index(**kwargs)
|
1999
|
+
if table in self.non_validated.keys():
|
2000
|
+
if "var_index" in self._non_validated[table]:
|
2001
|
+
self._non_validated[table].pop("var_index")
|
2002
|
+
|
2003
|
+
if len(self.non_validated[table].values()) == 0:
|
2004
|
+
self.non_validated.pop(table)
|
2005
|
+
|
2006
|
+
def add_new_from(
|
2007
|
+
self,
|
2008
|
+
key: str,
|
2009
|
+
accessor: str | None = None,
|
2010
|
+
**kwargs,
|
2011
|
+
) -> None:
|
2012
|
+
"""Save new values of categorical from sample level metadata or table.
|
2013
|
+
|
2014
|
+
Args:
|
2015
|
+
key: The key referencing the slot in the DataFrame.
|
2016
|
+
accessor: The accessor key such as 'sample' or 'table x'.
|
2017
|
+
organism: The organism name.
|
2018
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
2019
|
+
"""
|
2020
|
+
if self._non_validated is None:
|
2021
|
+
raise ValidationError("Run .validate() first.")
|
2022
|
+
|
2023
|
+
if len(kwargs) > 0 and key == "all":
|
2024
|
+
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
2025
|
+
|
2026
|
+
if accessor not in self.categoricals:
|
2027
|
+
raise ValueError(
|
2028
|
+
f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
|
2029
|
+
)
|
2030
|
+
|
2031
|
+
if accessor in self._table_adata_curators:
|
2032
|
+
adata_curator = self._table_adata_curators[accessor]
|
2033
|
+
adata_curator.add_new_from(key=key, **kwargs)
|
2034
|
+
if accessor == self._sample_metadata_key:
|
2035
|
+
self._sample_df_curator.add_new_from(key=key, **kwargs)
|
2036
|
+
|
2037
|
+
if accessor in self.non_validated.keys():
|
2038
|
+
if len(self.non_validated[accessor].values()) == 0:
|
2039
|
+
self.non_validated.pop(accessor)
|
2040
|
+
|
2041
|
+
def standardize(self, key: str, accessor: str | None = None) -> None:
|
2042
|
+
"""Replace synonyms with canonical values.
|
2043
|
+
|
2044
|
+
Modifies the dataset inplace.
|
2045
|
+
|
2046
|
+
Args:
|
2047
|
+
key: The key referencing the slot in the table or sample metadata.
|
2048
|
+
accessor: The accessor key such as 'sample_key' or 'table_key'.
|
2049
|
+
"""
|
2050
|
+
if len(self.non_validated) == 0:
|
2051
|
+
logger.warning("values are already standardized")
|
2052
|
+
return
|
2053
|
+
if self._artifact is not None:
|
2054
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
2055
|
+
|
2056
|
+
if accessor == self._sample_metadata_key:
|
2057
|
+
if key not in self._sample_metadata.columns:
|
2058
|
+
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2059
|
+
else:
|
2060
|
+
if (
|
2061
|
+
key == "var_index" and self._sdata.tables[accessor].var.index is None
|
2062
|
+
) or (
|
2063
|
+
key != "var_index"
|
2064
|
+
and key not in self._sdata.tables[accessor].obs.columns
|
2065
|
+
):
|
2066
|
+
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2067
|
+
|
2068
|
+
if accessor in self._table_adata_curators.keys():
|
2069
|
+
adata_curator = self._table_adata_curators[accessor]
|
2070
|
+
adata_curator.standardize(key)
|
2071
|
+
if accessor == self._sample_metadata_key:
|
2072
|
+
self._sample_df_curator.standardize(key)
|
2073
|
+
|
2074
|
+
if len(self.non_validated[accessor].values()) == 0:
|
2075
|
+
self.non_validated.pop(accessor)
|
2076
|
+
|
2077
|
+
def validate(self) -> bool:
|
2078
|
+
"""Validate variables and categorical observations.
|
2079
|
+
|
2080
|
+
This method also registers the validated records in the current instance:
|
2081
|
+
- from public sources
|
2082
|
+
|
2083
|
+
Args:
|
2084
|
+
organism: The organism name.
|
2085
|
+
|
2086
|
+
Returns:
|
2087
|
+
Whether the SpatialData object is validated.
|
2088
|
+
"""
|
2089
|
+
from lamindb.core._settings import settings
|
2090
|
+
|
2091
|
+
# add all validated records to the current instance
|
2092
|
+
verbosity = settings.verbosity
|
2093
|
+
try:
|
2094
|
+
settings.verbosity = "error"
|
2095
|
+
self._update_registry_all()
|
2096
|
+
finally:
|
2097
|
+
settings.verbosity = verbosity
|
2098
|
+
|
2099
|
+
self._non_validated = {} # type: ignore
|
2100
|
+
|
2101
|
+
sample_validated = True
|
2102
|
+
if self._sample_df_curator:
|
2103
|
+
logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
|
2104
|
+
sample_validated &= self._sample_df_curator.validate()
|
2105
|
+
if len(self._sample_df_curator.non_validated) > 0:
|
2106
|
+
self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
|
2107
|
+
logger.print("")
|
2108
|
+
|
2109
|
+
mods_validated = True
|
2110
|
+
for table, adata_curator in self._table_adata_curators.items():
|
2111
|
+
logger.info(f"validating categoricals of table '{table}' ...")
|
2112
|
+
mods_validated &= adata_curator.validate()
|
2113
|
+
if len(adata_curator.non_validated) > 0:
|
2114
|
+
self._non_validated[table] = adata_curator.non_validated # type: ignore
|
2115
|
+
logger.print("")
|
2116
|
+
|
2117
|
+
self._is_validated = sample_validated & mods_validated
|
2118
|
+
return self._is_validated
|
2119
|
+
|
2120
|
+
def save_artifact(
|
2121
|
+
self,
|
2122
|
+
*,
|
2123
|
+
key: str | None = None,
|
2124
|
+
description: str | None = None,
|
2125
|
+
revises: Artifact | None = None,
|
2126
|
+
run: Run | None = None,
|
2127
|
+
) -> Artifact:
|
2128
|
+
if not self._is_validated:
|
2129
|
+
self.validate()
|
2130
|
+
if not self._is_validated:
|
2131
|
+
raise ValidationError("Dataset does not validate. Please curate.")
|
2132
|
+
|
2133
|
+
verbosity = settings.verbosity
|
2134
|
+
try:
|
2135
|
+
settings.verbosity = "warning"
|
2136
|
+
|
2137
|
+
if self._artifact is None:
|
2138
|
+
# Write the SpatialData object to a random path in tmp directory
|
2139
|
+
# The Artifact constructor will move it to the cache
|
2140
|
+
write_path = (
|
2141
|
+
f"{settings.cache_dir}/{random.randint(10**7, 10**8 - 1)}.zarr"
|
2142
|
+
)
|
2143
|
+
self._sdata.write(write_path)
|
2144
|
+
|
2145
|
+
# Create the Artifact and associate Artifact metadata
|
2146
|
+
self._artifact = Artifact(
|
2147
|
+
write_path,
|
2148
|
+
description=description,
|
2149
|
+
key=key,
|
2150
|
+
revises=revises,
|
2151
|
+
run=run,
|
2152
|
+
)
|
2153
|
+
# According to Tim it is not easy to calculate the number of observations.
|
2154
|
+
# We would have to write custom code to iterate over labels (which might not even exist at that point)
|
2155
|
+
self._artifact.otype = "spatialdata"
|
2156
|
+
self._artifact.save()
|
2157
|
+
|
2158
|
+
# Link schemas
|
2159
|
+
feature_kwargs = check_registry_organism(
|
2160
|
+
(list(self._var_fields.values())[0].field.model),
|
2161
|
+
self._organism,
|
2162
|
+
)
|
2163
|
+
|
2164
|
+
def _add_set_from_spatialdata(
|
2165
|
+
host: Artifact | Collection | Run,
|
2166
|
+
var_fields: dict[str, FieldAttr],
|
2167
|
+
obs_fields: dict[str, FieldAttr] = None,
|
2168
|
+
mute: bool = False,
|
2169
|
+
organism: str | Record | None = None,
|
2170
|
+
):
|
2171
|
+
"""Add Schemas from SpatialData."""
|
2172
|
+
if obs_fields is None:
|
2173
|
+
obs_fields = {}
|
2174
|
+
assert host.otype == "spatialdata" # noqa: S101
|
2175
|
+
|
2176
|
+
feature_sets = {}
|
2177
|
+
|
2178
|
+
# sample features
|
2179
|
+
sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
|
2180
|
+
if len(sample_features) > 0:
|
2181
|
+
feature_sets[self._sample_metadata_key] = Schema(
|
2182
|
+
features=sample_features
|
2183
|
+
)
|
2184
|
+
|
2185
|
+
# table features
|
2186
|
+
for table, field in var_fields.items():
|
2187
|
+
table_fs = parse_staged_feature_sets_from_anndata(
|
2188
|
+
self._sdata[table],
|
2189
|
+
var_field=field,
|
2190
|
+
obs_field=obs_fields.get(table, Feature.name),
|
2191
|
+
mute=mute,
|
2192
|
+
organism=organism,
|
2193
|
+
)
|
2194
|
+
for k, v in table_fs.items():
|
2195
|
+
feature_sets[f"['{table}'].{k}"] = v
|
2196
|
+
|
2197
|
+
def _unify_staged_feature_sets_by_hash(
|
2198
|
+
feature_sets: MutableMapping[str, Schema],
|
2199
|
+
):
|
2200
|
+
unique_values: dict[str, Any] = {}
|
2201
|
+
|
2202
|
+
for key, value in feature_sets.items():
|
2203
|
+
value_hash = (
|
2204
|
+
value.hash
|
2205
|
+
) # Assuming each value has a .hash attribute
|
2206
|
+
if value_hash in unique_values:
|
2207
|
+
feature_sets[key] = unique_values[value_hash]
|
2208
|
+
else:
|
2209
|
+
unique_values[value_hash] = value
|
2210
|
+
|
2211
|
+
return feature_sets
|
2212
|
+
|
2213
|
+
# link feature sets
|
2214
|
+
host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
|
2215
|
+
feature_sets
|
2216
|
+
)
|
2217
|
+
host.save()
|
2218
|
+
|
2219
|
+
_add_set_from_spatialdata(
|
2220
|
+
self._artifact, var_fields=self._var_fields, **feature_kwargs
|
2221
|
+
)
|
2222
|
+
|
2223
|
+
# Link labels
|
2224
|
+
def _add_labels_from_spatialdata(
|
2225
|
+
data,
|
2226
|
+
artifact: Artifact,
|
2227
|
+
fields: dict[str, FieldAttr],
|
2228
|
+
feature_ref_is_name: bool | None = None,
|
2229
|
+
):
|
2230
|
+
"""Add Labels from SpatialData."""
|
2231
|
+
features = Feature.lookup().dict()
|
2232
|
+
for key, field in fields.items():
|
2233
|
+
feature = features.get(key)
|
2234
|
+
registry = field.field.model
|
2235
|
+
filter_kwargs = check_registry_organism(registry, self._organism)
|
2236
|
+
filter_kwargs_current = get_current_filter_kwargs(
|
2237
|
+
registry, filter_kwargs
|
2238
|
+
)
|
2239
|
+
df = data if isinstance(data, pd.DataFrame) else data.obs
|
2240
|
+
labels = registry.from_values(
|
2241
|
+
df[key],
|
2242
|
+
field=field,
|
2243
|
+
**filter_kwargs_current,
|
2244
|
+
)
|
2245
|
+
if len(labels) == 0:
|
2246
|
+
continue
|
2247
|
+
|
2248
|
+
label_ref_is_name = None
|
2249
|
+
if hasattr(registry, "_name_field"):
|
2250
|
+
label_ref_is_name = field.field.name == registry._name_field
|
2251
|
+
add_labels(
|
2252
|
+
artifact,
|
2253
|
+
records=labels,
|
2254
|
+
feature=feature,
|
2255
|
+
feature_ref_is_name=feature_ref_is_name,
|
2256
|
+
label_ref_is_name=label_ref_is_name,
|
2257
|
+
from_curator=True,
|
2258
|
+
)
|
2259
|
+
|
2260
|
+
for accessor, accessor_fields in self._categoricals.items():
|
2261
|
+
column_field = self._var_fields.get(accessor)
|
2262
|
+
if accessor == self._sample_metadata_key:
|
2263
|
+
_add_labels_from_spatialdata(
|
2264
|
+
self._sample_metadata,
|
2265
|
+
self._artifact,
|
2266
|
+
accessor_fields,
|
2267
|
+
feature_ref_is_name=(
|
2268
|
+
None if column_field is None else _ref_is_name(column_field)
|
2269
|
+
),
|
2270
|
+
)
|
2271
|
+
else:
|
2272
|
+
_add_labels_from_spatialdata(
|
2273
|
+
self._sdata.tables[accessor],
|
2274
|
+
self._artifact,
|
2275
|
+
accessor_fields,
|
2276
|
+
feature_ref_is_name=(
|
2277
|
+
None if column_field is None else _ref_is_name(column_field)
|
2278
|
+
),
|
2279
|
+
)
|
2280
|
+
|
2281
|
+
finally:
|
2282
|
+
settings.verbosity = verbosity
|
2283
|
+
|
2284
|
+
slug = ln_setup.settings.instance.slug
|
2285
|
+
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
2286
|
+
logger.important(
|
2287
|
+
f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
|
2288
|
+
)
|
2289
|
+
|
2290
|
+
return self._artifact
|
2291
|
+
|
2292
|
+
|
2293
|
+
def _restrict_obs_fields(
|
2294
|
+
obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
|
2295
|
+
) -> dict[str, str]:
|
2296
|
+
"""Restrict the obs fields to name return only available obs fields.
|
2297
|
+
|
2298
|
+
To simplify the curation, we only validate against either name or ontology_id.
|
2299
|
+
If both are available, we validate against ontology_id.
|
2300
|
+
If none are available, we validate against name.
|
2301
|
+
"""
|
2302
|
+
obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
|
2303
|
+
for name, field in obs_fields.items():
|
2304
|
+
if name.endswith("_ontology_term_id"):
|
2305
|
+
continue
|
2306
|
+
# if both the ontology id and the name are present, only validate on the ontology_id
|
2307
|
+
if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
|
2308
|
+
obs_fields_unique.pop(name)
|
2309
|
+
# if the neither name nor ontology id are present, validate on the name
|
2310
|
+
# this will raise error downstream, we just use name to be more readable
|
2311
|
+
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
2312
|
+
obs_fields_unique[name] = field
|
2313
|
+
|
2314
|
+
# Only retain obs_fields_unique that have keys in adata.obs.columns
|
2315
|
+
available_obs_fields = {
|
2316
|
+
k: v for k, v in obs_fields_unique.items() if k in obs.columns
|
2317
|
+
}
|
2318
|
+
|
2319
|
+
return available_obs_fields
|
2320
|
+
|
2321
|
+
|
2322
|
+
def _add_defaults_to_obs(
|
2323
|
+
obs: pd.DataFrame,
|
2324
|
+
defaults: dict[str, str],
|
2325
|
+
) -> None:
|
2326
|
+
"""Add default columns and values to obs DataFrame."""
|
2327
|
+
added_defaults: dict = {}
|
2328
|
+
for name, default in defaults.items():
|
2329
|
+
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
2330
|
+
obs[name] = default
|
2331
|
+
added_defaults[name] = default
|
2332
|
+
logger.important(
|
2333
|
+
f"added default value '{default}' to the adata.obs['{name}']"
|
2334
|
+
)
|
2335
|
+
|
2336
|
+
|
2337
|
+
class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
2338
|
+
"""Annotation flow of AnnData based on CELLxGENE schema."""
|
2339
|
+
|
2340
|
+
_controls_were_created: bool | None = None
|
2341
|
+
|
2342
|
+
def __init__(
|
2343
|
+
self,
|
2344
|
+
adata: ad.AnnData | UPathStr,
|
2345
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
2346
|
+
organism: Literal["human", "mouse"] = "human",
|
2347
|
+
*,
|
2348
|
+
defaults: dict[str, str] = None,
|
2349
|
+
extra_sources: dict[str, Record] = None,
|
2350
|
+
schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
|
2351
|
+
verbosity: str = "hint",
|
2352
|
+
) -> None:
|
2353
|
+
"""CELLxGENE schema curator.
|
2354
|
+
|
2355
|
+
Args:
|
2356
|
+
adata: Path to or AnnData object to curate against the CELLxGENE schema.
|
2357
|
+
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
2358
|
+
The CELLxGENE Curator maps against the required CELLxGENE fields by default.
|
2359
|
+
organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
|
2360
|
+
defaults: Default values that are set if columns or column values are missing.
|
2361
|
+
extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
|
2362
|
+
These extra sources are joined with the CELLxGENE fixed sources.
|
2363
|
+
Use this parameter when subclassing.
|
2364
|
+
exclude: A dictionary mapping column names to values to exclude.
|
2365
|
+
schema_version: The CELLxGENE schema version to curate against.
|
2366
|
+
verbosity: The verbosity level.
|
2367
|
+
|
2368
|
+
"""
|
2369
|
+
import bionty as bt
|
2370
|
+
|
2371
|
+
CellxGeneAnnDataCatManager._init_categoricals_additional_values()
|
2372
|
+
|
2373
|
+
var_index: FieldAttr = bt.Gene.ensembl_gene_id
|
2374
|
+
|
2375
|
+
if categoricals is None:
|
2376
|
+
categoricals = CellxGeneAnnDataCatManager._get_categoricals()
|
2377
|
+
|
2378
|
+
self.organism = organism
|
2379
|
+
|
2380
|
+
VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
|
2381
|
+
if schema_version not in VALID_SCHEMA_VERSIONS:
|
2382
|
+
valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
|
2383
|
+
raise ValueError(
|
2384
|
+
f"Invalid schema_version: {schema_version}. "
|
2385
|
+
f"Valid versions are: {valid_versions}"
|
2386
|
+
)
|
2387
|
+
self.schema_version = schema_version
|
2388
|
+
self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
|
2389
|
+
with resources.path(
|
2390
|
+
"lamindb.curators._cellxgene_schemas", "schema_versions.yml"
|
2391
|
+
) as schema_versions_path:
|
2392
|
+
self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
|
2393
|
+
self.schema_version
|
2394
|
+
]
|
2395
|
+
|
2396
|
+
# Fetch AnnData obs to be able to set defaults and get sources
|
2397
|
+
if isinstance(adata, ad.AnnData):
|
2398
|
+
self._adata_obs = adata.obs
|
2399
|
+
else:
|
2400
|
+
self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
|
2401
|
+
|
2402
|
+
# Add defaults first to ensure that we fetch valid sources
|
2403
|
+
if defaults:
|
2404
|
+
_add_defaults_to_obs(self._adata_obs, defaults)
|
2405
|
+
|
2406
|
+
self.sources = self._create_sources(self._adata_obs)
|
2407
|
+
self.sources = {
|
2408
|
+
entity: source
|
2409
|
+
for entity, source in self.sources.items()
|
2410
|
+
if source is not None
|
2411
|
+
}
|
2412
|
+
|
2413
|
+
# These sources are not a part of the cellxgene schema but rather passed through.
|
2414
|
+
# This is useful when other Curators extend the CELLxGENE curator
|
2415
|
+
if extra_sources:
|
2416
|
+
self.sources = self.sources | extra_sources
|
2417
|
+
|
2418
|
+
# Exclude default values from validation because they are not available in the pinned sources
|
2419
|
+
exclude_keys = {
|
2420
|
+
entity: default
|
2421
|
+
for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
|
2422
|
+
if entity in self._adata_obs.columns # type: ignore
|
2423
|
+
}
|
2424
|
+
|
2425
|
+
super().__init__(
|
2426
|
+
data=adata,
|
2427
|
+
var_index=var_index,
|
2428
|
+
categoricals=_restrict_obs_fields(self._adata_obs, categoricals),
|
2429
|
+
verbosity=verbosity,
|
2430
|
+
organism=organism,
|
2431
|
+
sources=self.sources,
|
2432
|
+
exclude=exclude_keys,
|
2433
|
+
)
|
2434
|
+
|
2435
|
+
@classmethod
|
2436
|
+
def _init_categoricals_additional_values(cls) -> None:
|
2437
|
+
import bionty as bt
|
2438
|
+
|
2439
|
+
import lamindb as ln
|
2440
|
+
|
2441
|
+
# Note: if you add another control below, be mindful to change the if condition that
|
2442
|
+
# triggers whether creating these records is re-considered
|
2443
|
+
if cls._controls_were_created is None:
|
2444
|
+
cls._controls_were_created = (
|
2445
|
+
ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
|
2446
|
+
is not None
|
2447
|
+
)
|
2448
|
+
if not cls._controls_were_created:
|
2449
|
+
logger.important("Creating control labels in the CellxGene schema.")
|
2450
|
+
bt.CellType(
|
2451
|
+
ontology_id="unknown",
|
2452
|
+
name="unknown",
|
2453
|
+
description="From CellxGene schema.",
|
2454
|
+
).save()
|
2455
|
+
pato = bt.Source.filter(name="pato", version="2024-03-28").one()
|
2456
|
+
normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
|
2457
|
+
bt.Disease(
|
2458
|
+
uid=normal.uid,
|
2459
|
+
name=normal.name,
|
2460
|
+
ontology_id=normal.ontology_id,
|
2461
|
+
description=normal.description,
|
2462
|
+
source=normal.source,
|
2463
|
+
).save()
|
2464
|
+
bt.Ethnicity(
|
2465
|
+
ontology_id="na", name="na", description="From CellxGene schema."
|
2466
|
+
).save()
|
2467
|
+
bt.Ethnicity(
|
2468
|
+
ontology_id="unknown",
|
2469
|
+
name="unknown",
|
2470
|
+
description="From CellxGene schema.",
|
2471
|
+
).save()
|
2472
|
+
bt.DevelopmentalStage(
|
2473
|
+
ontology_id="unknown",
|
2474
|
+
name="unknown",
|
2475
|
+
description="From CellxGene schema.",
|
2476
|
+
).save()
|
2477
|
+
bt.Phenotype(
|
2478
|
+
ontology_id="unknown",
|
2479
|
+
name="unknown",
|
2480
|
+
description="From CellxGene schema.",
|
2481
|
+
).save()
|
2482
|
+
|
2483
|
+
tissue_type = ln.ULabel(
|
2484
|
+
name="TissueType",
|
2485
|
+
is_type=True,
|
2486
|
+
description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
|
2487
|
+
).save()
|
2488
|
+
ln.ULabel(
|
2489
|
+
name="tissue", type=tissue_type, description="From CellxGene schema."
|
2490
|
+
).save()
|
2491
|
+
ln.ULabel(
|
2492
|
+
name="organoid", type=tissue_type, description="From CellxGene schema."
|
2493
|
+
).save()
|
2494
|
+
ln.ULabel(
|
2495
|
+
name="cell culture",
|
2496
|
+
type=tissue_type,
|
2497
|
+
description="From CellxGene schema.",
|
2498
|
+
).save()
|
2499
|
+
|
2500
|
+
suspension_type = ln.ULabel(
|
2501
|
+
name="SuspensionType",
|
2502
|
+
is_type=True,
|
2503
|
+
description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
|
2504
|
+
).save()
|
2505
|
+
ln.ULabel(
|
2506
|
+
name="cell", type=suspension_type, description="From CellxGene schema."
|
2507
|
+
).save()
|
2508
|
+
ln.ULabel(
|
2509
|
+
name="nucleus",
|
2510
|
+
type=suspension_type,
|
2511
|
+
description="From CellxGene schema.",
|
2512
|
+
).save()
|
2513
|
+
ln.ULabel(name="na", type=suspension_type).save()
|
2514
|
+
|
2515
|
+
@classmethod
|
2516
|
+
def _get_categoricals(cls) -> dict[str, FieldAttr]:
|
2517
|
+
import bionty as bt
|
2518
|
+
|
2519
|
+
return {
|
2520
|
+
"assay": bt.ExperimentalFactor.name,
|
2521
|
+
"assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
|
2522
|
+
"cell_type": bt.CellType.name,
|
2523
|
+
"cell_type_ontology_term_id": bt.CellType.ontology_id,
|
2524
|
+
"development_stage": bt.DevelopmentalStage.name,
|
2525
|
+
"development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
|
2526
|
+
"disease": bt.Disease.name,
|
2527
|
+
"disease_ontology_term_id": bt.Disease.ontology_id,
|
2528
|
+
# "donor_id": "str", via pandera
|
2529
|
+
"self_reported_ethnicity": bt.Ethnicity.name,
|
2530
|
+
"self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
|
2531
|
+
"sex": bt.Phenotype.name,
|
2532
|
+
"sex_ontology_term_id": bt.Phenotype.ontology_id,
|
2533
|
+
"suspension_type": ULabel.name,
|
2534
|
+
"tissue": bt.Tissue.name,
|
2535
|
+
"tissue_ontology_term_id": bt.Tissue.ontology_id,
|
2536
|
+
"tissue_type": ULabel.name,
|
2537
|
+
"organism": bt.Organism.name,
|
2538
|
+
"organism_ontology_term_id": bt.Organism.ontology_id,
|
2539
|
+
}
|
2540
|
+
|
2541
|
+
@classmethod
|
2542
|
+
def _get_categoricals_defaults(cls) -> dict[str, str]:
|
2543
|
+
return {
|
2544
|
+
"cell_type": "unknown",
|
2545
|
+
"development_stage": "unknown",
|
2546
|
+
"disease": "normal",
|
2547
|
+
"donor_id": "unknown",
|
2548
|
+
"self_reported_ethnicity": "unknown",
|
2549
|
+
"sex": "unknown",
|
2550
|
+
"suspension_type": "cell",
|
2551
|
+
"tissue_type": "tissue",
|
2552
|
+
}
|
2553
|
+
|
2554
|
+
@property
|
2555
|
+
def pinned_ontologies(self) -> pd.DataFrame:
|
2556
|
+
return self._pinned_ontologies
|
2557
|
+
|
2558
|
+
@property
|
2559
|
+
def adata(self) -> AnnData:
|
2560
|
+
return self._adata
|
2561
|
+
|
2562
|
+
def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
|
2563
|
+
"""Creates a sources dictionary that can be passed to AnnDataCatManager."""
|
2564
|
+
import bionty as bt
|
2565
|
+
|
2566
|
+
# fmt: off
|
2567
|
+
def _fetch_bionty_source(
|
2568
|
+
entity: str, organism: str, source: str
|
2569
|
+
) -> bt.Source | None:
|
2570
|
+
"""Fetch the Bionty source of the pinned ontology.
|
2571
|
+
|
2572
|
+
Returns None if the source does not exist.
|
2573
|
+
"""
|
2574
|
+
version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
|
2575
|
+
(self._pinned_ontologies["organism"] == organism) &
|
2576
|
+
(self._pinned_ontologies["source"] == source), "version"].iloc[0]
|
2577
|
+
return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
|
2578
|
+
|
2579
|
+
entity_mapping = {
|
2580
|
+
"var_index": ("Gene", self.organism, "ensembl"),
|
2581
|
+
"cell_type": ("CellType", "all", "cl"),
|
2582
|
+
"assay": ("ExperimentalFactor", "all", "efo"),
|
2583
|
+
"self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
|
2584
|
+
"development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
|
2585
|
+
"disease": ("Disease", "all", "mondo"),
|
2586
|
+
# "organism": ("Organism", "vertebrates", "ensembl"),
|
2587
|
+
"sex": ("Phenotype", "all", "pato"),
|
2588
|
+
"tissue": ("Tissue", "all", "uberon"),
|
2589
|
+
}
|
2590
|
+
# fmt: on
|
2591
|
+
|
2592
|
+
# Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
|
2593
|
+
entity_to_sources = {
|
2594
|
+
entity: _fetch_bionty_source(*params)
|
2595
|
+
for entity, params in entity_mapping.items()
|
2596
|
+
if entity in obs.columns
|
2597
|
+
or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
|
2598
|
+
or entity == "var_index"
|
2599
|
+
}
|
2600
|
+
|
2601
|
+
return entity_to_sources
|
2602
|
+
|
2603
|
+
def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
|
2604
|
+
"""Converts a column that stores a name into a column that stores the ontology id.
|
2605
|
+
|
2606
|
+
cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
|
2607
|
+
"""
|
2608
|
+
field_name = field.field.name
|
2609
|
+
assert field_name == "name" # noqa: S101
|
2610
|
+
cols = ["name", "ontology_id"]
|
2611
|
+
registry = field.field.model
|
2612
|
+
|
2613
|
+
if hasattr(registry, "ontology_id"):
|
2614
|
+
validated_records = registry.filter(**{f"{field_name}__in": values})
|
2615
|
+
mapper = (
|
2616
|
+
pd.DataFrame(validated_records.values_list(*cols))
|
2617
|
+
.set_index(0)
|
2618
|
+
.to_dict()[1]
|
2619
|
+
)
|
2620
|
+
return values.map(mapper)
|
2621
|
+
|
2622
|
+
def validate(self) -> bool: # type: ignore
|
2623
|
+
"""Validates the AnnData object against most cellxgene requirements."""
|
2624
|
+
# Verify that all required obs columns are present
|
2625
|
+
missing_obs_fields = [
|
2626
|
+
name
|
2627
|
+
for name in CellxGeneAnnDataCatManager._get_categoricals_defaults().keys()
|
2628
|
+
if name not in self._adata.obs.columns
|
2629
|
+
and f"{name}_ontology_term_id" not in self._adata.obs.columns
|
2630
|
+
]
|
2631
|
+
if len(missing_obs_fields) > 0:
|
2632
|
+
missing_obs_fields_str = ", ".join(list(missing_obs_fields))
|
2633
|
+
logger.error(f"missing required obs columns {missing_obs_fields_str}")
|
2634
|
+
logger.info(
|
2635
|
+
"consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
|
2636
|
+
"to automatically add these columns with default values."
|
2637
|
+
)
|
2638
|
+
return False
|
2639
|
+
|
2640
|
+
# Verify that no cellxgene reserved names are present
|
2641
|
+
reserved_names = {
|
2642
|
+
"ethnicity",
|
2643
|
+
"ethnicity_ontology_term_id",
|
2644
|
+
"X_normalization",
|
2645
|
+
"default_field",
|
2646
|
+
"layer_descriptions",
|
2647
|
+
"tags",
|
2648
|
+
"versions",
|
2649
|
+
"contributors",
|
2650
|
+
"preprint_doi",
|
2651
|
+
"project_description",
|
2652
|
+
"project_links",
|
2653
|
+
"project_name",
|
2654
|
+
"publication_doi",
|
2655
|
+
}
|
2656
|
+
matched_columns = [
|
2657
|
+
column for column in self._adata.obs.columns if column in reserved_names
|
2658
|
+
]
|
2659
|
+
if len(matched_columns) > 0:
|
2660
|
+
raise ValueError(
|
2661
|
+
f"AnnData object must not contain obs columns {matched_columns} which are"
|
2662
|
+
" reserved from previous schema versions."
|
2663
|
+
)
|
1571
2664
|
|
1572
|
-
|
1573
|
-
|
1574
|
-
|
1575
|
-
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
2665
|
+
return super().validate()
|
2666
|
+
|
2667
|
+
def to_cellxgene_anndata(
|
2668
|
+
self, is_primary_data: bool, title: str | None = None
|
2669
|
+
) -> ad.AnnData:
|
2670
|
+
"""Converts the AnnData object to the cellxgene-schema input format.
|
2671
|
+
|
2672
|
+
cellxgene expects the obs fields to be {entity}_ontology_id fields and has many further requirements which are
|
2673
|
+
documented here: https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema.
|
2674
|
+
This function checks for most but not all requirements of the CELLxGENE schema.
|
2675
|
+
If you want to ensure that it fully adheres to the CELLxGENE schema, run `cellxgene-schema` on the AnnData object.
|
2676
|
+
|
2677
|
+
Args:
|
2678
|
+
is_primary_data: Whether the measured data is primary data or not.
|
2679
|
+
title: Title of the AnnData object. Commonly the name of the publication.
|
2680
|
+
|
2681
|
+
Returns:
|
2682
|
+
An AnnData object which adheres to the cellxgene-schema.
|
2683
|
+
"""
|
2684
|
+
# Create a copy since we modify the AnnData object extensively
|
2685
|
+
adata_cxg = self._adata.copy()
|
2686
|
+
|
2687
|
+
# cellxgene requires an embedding
|
2688
|
+
embedding_pattern = r"^[a-zA-Z][a-zA-Z0-9_.-]*$"
|
2689
|
+
exclude_key = "spatial"
|
2690
|
+
matching_keys = [
|
2691
|
+
key
|
2692
|
+
for key in adata_cxg.obsm.keys()
|
2693
|
+
if re.match(embedding_pattern, key) and key != exclude_key
|
2694
|
+
]
|
2695
|
+
if len(matching_keys) == 0:
|
2696
|
+
raise ValueError(
|
2697
|
+
"Unable to find an embedding key. Please calculate an embedding."
|
1579
2698
|
)
|
1580
|
-
|
1581
|
-
|
2699
|
+
|
2700
|
+
# convert name column to ontology_term_id column
|
2701
|
+
for column in adata_cxg.obs.columns:
|
2702
|
+
if column in self.categoricals and not column.endswith("_ontology_term_id"):
|
2703
|
+
mapped_column = self._convert_name_to_ontology_id(
|
2704
|
+
adata_cxg.obs[column], field=self.categoricals.get(column)
|
2705
|
+
)
|
2706
|
+
if mapped_column is not None:
|
2707
|
+
adata_cxg.obs[f"{column}_ontology_term_id"] = mapped_column
|
2708
|
+
|
2709
|
+
# drop the name columns for ontologies. cellxgene does not allow them.
|
2710
|
+
drop_columns = [
|
2711
|
+
i
|
2712
|
+
for i in adata_cxg.obs.columns
|
2713
|
+
if f"{i}_ontology_term_id" in adata_cxg.obs.columns
|
2714
|
+
]
|
2715
|
+
adata_cxg.obs.drop(columns=drop_columns, inplace=True)
|
2716
|
+
|
2717
|
+
# Add cellxgene metadata to AnnData object
|
2718
|
+
if "is_primary_data" not in adata_cxg.obs.columns:
|
2719
|
+
adata_cxg.obs["is_primary_data"] = is_primary_data
|
2720
|
+
if "feature_is_filtered" not in adata_cxg.var.columns:
|
2721
|
+
logger.warn(
|
2722
|
+
"column 'feature_is_filtered' not present in var. Setting to default"
|
2723
|
+
" value of False."
|
1582
2724
|
)
|
1583
|
-
|
1584
|
-
|
1585
|
-
|
1586
|
-
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1590
|
-
|
1591
|
-
|
1592
|
-
|
1593
|
-
|
2725
|
+
adata_cxg.var["feature_is_filtered"] = False
|
2726
|
+
if title is None:
|
2727
|
+
raise ValueError("please pass a title!")
|
2728
|
+
else:
|
2729
|
+
adata_cxg.uns["title"] = title
|
2730
|
+
adata_cxg.uns["cxg_lamin_schema_reference"] = self.schema_reference
|
2731
|
+
adata_cxg.uns["cxg_lamin_schema_version"] = self.schema_version
|
2732
|
+
|
2733
|
+
return adata_cxg
|
2734
|
+
|
2735
|
+
|
2736
|
+
class ValueUnit:
|
2737
|
+
"""Base class for handling value-unit combinations."""
|
2738
|
+
|
2739
|
+
@staticmethod
|
2740
|
+
def parse_value_unit(value: str, is_dose: bool = True) -> tuple[str, str] | None:
|
2741
|
+
"""Parse a string containing a value and unit into a tuple."""
|
2742
|
+
if not isinstance(value, str) or not value.strip():
|
2743
|
+
return None
|
2744
|
+
|
2745
|
+
value = str(value).strip()
|
2746
|
+
match = re.match(r"^(\d*\.?\d{0,1})\s*([a-zA-ZμµΜ]+)$", value)
|
2747
|
+
|
2748
|
+
if not match:
|
2749
|
+
raise ValueError(
|
2750
|
+
f"Invalid format: {value}. Expected format: number with max 1 decimal place + unit"
|
2751
|
+
)
|
2752
|
+
|
2753
|
+
number, unit = match.groups()
|
2754
|
+
formatted_number = f"{float(number):.1f}"
|
2755
|
+
|
2756
|
+
if is_dose:
|
2757
|
+
standardized_unit = DoseHandler.standardize_unit(unit)
|
2758
|
+
if not DoseHandler.validate_unit(standardized_unit):
|
2759
|
+
raise ValueError(
|
2760
|
+
f"Invalid dose unit: {unit}. Must be convertible to one of: nM, μM, mM, M"
|
2761
|
+
)
|
2762
|
+
else:
|
2763
|
+
standardized_unit = TimeHandler.standardize_unit(unit)
|
2764
|
+
if not TimeHandler.validate_unit(standardized_unit):
|
2765
|
+
raise ValueError(
|
2766
|
+
f"Invalid time unit: {unit}. Must be convertible to one of: h, m, s, d, y"
|
1594
2767
|
)
|
1595
2768
|
|
1596
|
-
return
|
2769
|
+
return formatted_number, standardized_unit
|
1597
2770
|
|
1598
2771
|
|
1599
|
-
class
|
1600
|
-
"""
|
2772
|
+
class DoseHandler:
|
2773
|
+
"""Handler for dose-related operations."""
|
1601
2774
|
|
1602
|
-
|
2775
|
+
VALID_UNITS = {"nM", "μM", "µM", "mM", "M"}
|
2776
|
+
UNIT_MAP = {
|
2777
|
+
"nm": "nM",
|
2778
|
+
"NM": "nM",
|
2779
|
+
"um": "μM",
|
2780
|
+
"UM": "μM",
|
2781
|
+
"μm": "μM",
|
2782
|
+
"μM": "μM",
|
2783
|
+
"µm": "μM",
|
2784
|
+
"µM": "μM",
|
2785
|
+
"mm": "mM",
|
2786
|
+
"MM": "mM",
|
2787
|
+
"m": "M",
|
2788
|
+
"M": "M",
|
2789
|
+
}
|
1603
2790
|
|
1604
|
-
|
2791
|
+
@classmethod
|
2792
|
+
def validate_unit(cls, unit: str) -> bool:
|
2793
|
+
"""Validate if the dose unit is acceptable."""
|
2794
|
+
return unit in cls.VALID_UNITS
|
1605
2795
|
|
1606
|
-
|
1607
|
-
|
1608
|
-
|
1609
|
-
|
1610
|
-
|
1611
|
-
|
1612
|
-
|
1613
|
-
|
1614
|
-
|
2796
|
+
@classmethod
|
2797
|
+
def standardize_unit(cls, unit: str) -> str:
|
2798
|
+
"""Standardize dose unit to standard formats."""
|
2799
|
+
return cls.UNIT_MAP.get(unit, unit)
|
2800
|
+
|
2801
|
+
@classmethod
|
2802
|
+
def validate_values(cls, values: pd.Series) -> list:
|
2803
|
+
"""Validate pert_dose values with strict case checking."""
|
2804
|
+
errors = []
|
1615
2805
|
|
1616
|
-
|
2806
|
+
for idx, value in values.items():
|
2807
|
+
if pd.isna(value):
|
2808
|
+
continue
|
1617
2809
|
|
1618
|
-
|
2810
|
+
if isinstance(value, (int, float)):
|
2811
|
+
errors.append(
|
2812
|
+
f"Row {idx} - Missing unit for dose: {value}. Must include a unit (nM, μM, mM, M)"
|
2813
|
+
)
|
2814
|
+
continue
|
2815
|
+
|
2816
|
+
try:
|
2817
|
+
ValueUnit.parse_value_unit(value, is_dose=True)
|
2818
|
+
except ValueError as e:
|
2819
|
+
errors.append(f"Row {idx} - {str(e)}")
|
2820
|
+
|
2821
|
+
return errors
|
1619
2822
|
|
1620
|
-
|
1621
|
-
|
1622
|
-
"""
|
2823
|
+
|
2824
|
+
class TimeHandler:
|
2825
|
+
"""Handler for time-related operations."""
|
2826
|
+
|
2827
|
+
VALID_UNITS = {"h", "m", "s", "d", "y"}
|
1623
2828
|
|
1624
2829
|
@classmethod
|
1625
|
-
|
1626
|
-
|
1627
|
-
cls
|
1628
|
-
df: pd.DataFrame,
|
1629
|
-
categoricals: dict[str, FieldAttr] | None = None,
|
1630
|
-
columns: FieldAttr = Feature.name,
|
1631
|
-
using_key: str | None = None,
|
1632
|
-
verbosity: str = "hint",
|
1633
|
-
organism: str | None = None,
|
1634
|
-
) -> DataFrameCurator:
|
1635
|
-
"""{}""" # noqa: D415
|
1636
|
-
return DataFrameCurator(
|
1637
|
-
df=df,
|
1638
|
-
categoricals=categoricals,
|
1639
|
-
columns=columns,
|
1640
|
-
using_key=using_key,
|
1641
|
-
verbosity=verbosity,
|
1642
|
-
organism=organism,
|
1643
|
-
)
|
2830
|
+
def validate_unit(cls, unit: str) -> bool:
|
2831
|
+
"""Validate if the time unit is acceptable."""
|
2832
|
+
return unit == unit.lower() and unit in cls.VALID_UNITS
|
1644
2833
|
|
1645
2834
|
@classmethod
|
1646
|
-
|
1647
|
-
|
1648
|
-
|
1649
|
-
|
1650
|
-
|
1651
|
-
|
1652
|
-
|
1653
|
-
|
1654
|
-
|
1655
|
-
organism: str | None = None,
|
1656
|
-
sources: dict[str, Record] | None = None,
|
1657
|
-
) -> AnnDataCurator:
|
1658
|
-
"""{}""" # noqa: D415
|
1659
|
-
return AnnDataCurator(
|
1660
|
-
data=data,
|
1661
|
-
var_index=var_index,
|
1662
|
-
categoricals=categoricals,
|
1663
|
-
obs_columns=obs_columns,
|
1664
|
-
using_key=using_key,
|
1665
|
-
verbosity=verbosity,
|
1666
|
-
organism=organism,
|
1667
|
-
sources=sources,
|
1668
|
-
)
|
2835
|
+
def standardize_unit(cls, unit: str) -> str:
|
2836
|
+
"""Standardize time unit to standard formats."""
|
2837
|
+
if unit.startswith("hr"):
|
2838
|
+
return "h"
|
2839
|
+
elif unit.startswith("min"):
|
2840
|
+
return "m"
|
2841
|
+
elif unit.startswith("sec"):
|
2842
|
+
return "s"
|
2843
|
+
return unit[0].lower()
|
1669
2844
|
|
1670
2845
|
@classmethod
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
2846
|
+
def validate_values(cls, values: pd.Series) -> list:
|
2847
|
+
"""Validate pert_time values."""
|
2848
|
+
errors = []
|
2849
|
+
|
2850
|
+
for idx, value in values.items():
|
2851
|
+
if pd.isna(value):
|
2852
|
+
continue
|
2853
|
+
|
2854
|
+
if isinstance(value, (int, float)):
|
2855
|
+
errors.append(
|
2856
|
+
f"Row {idx} - Missing unit for time: {value}. Must include a unit (h, m, s, d, y)"
|
2857
|
+
)
|
2858
|
+
continue
|
2859
|
+
|
2860
|
+
try:
|
2861
|
+
ValueUnit.parse_value_unit(value, is_dose=False)
|
2862
|
+
except ValueError as e:
|
2863
|
+
errors.append(f"Row {idx} - {str(e)}")
|
2864
|
+
|
2865
|
+
return errors
|
2866
|
+
|
2867
|
+
|
2868
|
+
class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
|
2869
|
+
"""Curator flow for Perturbation data."""
|
2870
|
+
|
2871
|
+
PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
|
2872
|
+
|
2873
|
+
def __init__(
|
2874
|
+
self,
|
2875
|
+
adata: ad.AnnData,
|
2876
|
+
organism: Literal["human", "mouse"] = "human",
|
2877
|
+
pert_dose: bool = True,
|
2878
|
+
pert_time: bool = True,
|
2879
|
+
*,
|
1678
2880
|
verbosity: str = "hint",
|
1679
|
-
|
1680
|
-
)
|
1681
|
-
"""
|
1682
|
-
|
1683
|
-
|
1684
|
-
|
1685
|
-
|
1686
|
-
|
2881
|
+
cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
|
2882
|
+
):
|
2883
|
+
"""Initialize the curator with configuration and validation settings."""
|
2884
|
+
import bionty as bt
|
2885
|
+
|
2886
|
+
self._pert_time = pert_time
|
2887
|
+
self._pert_dose = pert_dose
|
2888
|
+
|
2889
|
+
self._validate_initial_data(adata)
|
2890
|
+
self._setup_configuration(adata)
|
2891
|
+
|
2892
|
+
self._setup_sources(adata)
|
2893
|
+
self._setup_compound_source()
|
2894
|
+
|
2895
|
+
super().__init__(
|
2896
|
+
adata=adata,
|
2897
|
+
categoricals=self.PT_CATEGORICALS,
|
2898
|
+
defaults=self.PT_DEFAULT_VALUES,
|
1687
2899
|
verbosity=verbosity,
|
1688
2900
|
organism=organism,
|
2901
|
+
extra_sources=self.PT_SOURCES,
|
2902
|
+
schema_version=cxg_schema_version,
|
1689
2903
|
)
|
1690
2904
|
|
1691
|
-
|
1692
|
-
|
1693
|
-
|
1694
|
-
|
1695
|
-
|
1696
|
-
|
1697
|
-
|
1698
|
-
|
1699
|
-
|
1700
|
-
|
1701
|
-
|
1702
|
-
exclude: dict[str, str | list[str]] | None = None,
|
1703
|
-
) -> SOMACurator:
|
1704
|
-
"""{}""" # noqa: D415
|
1705
|
-
return SOMACurator(
|
1706
|
-
experiment_uri=experiment_uri,
|
1707
|
-
var_index=var_index,
|
1708
|
-
categoricals=categoricals,
|
1709
|
-
obs_columns=obs_columns,
|
1710
|
-
using_key=using_key,
|
1711
|
-
organism=organism,
|
1712
|
-
sources=sources,
|
1713
|
-
exclude=exclude,
|
2905
|
+
def _setup_configuration(self, adata: ad.AnnData):
|
2906
|
+
"""Set up default configuration values."""
|
2907
|
+
import bionty as bt
|
2908
|
+
import wetlab as wl
|
2909
|
+
|
2910
|
+
self.PT_DEFAULT_VALUES = (
|
2911
|
+
CellxGeneAnnDataCatManager._get_categoricals_defaults()
|
2912
|
+
| {
|
2913
|
+
"cell_line": "unknown",
|
2914
|
+
"pert_target": "unknown",
|
2915
|
+
}
|
1714
2916
|
)
|
1715
2917
|
|
1716
|
-
|
1717
|
-
|
1718
|
-
|
1719
|
-
|
1720
|
-
|
1721
|
-
|
1722
|
-
|
1723
|
-
|
1724
|
-
|
1725
|
-
|
1726
|
-
|
1727
|
-
|
1728
|
-
|
1729
|
-
|
1730
|
-
|
2918
|
+
self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
|
2919
|
+
k: v
|
2920
|
+
for k, v in {
|
2921
|
+
"cell_line": bt.CellLine.name,
|
2922
|
+
"pert_target": wl.PerturbationTarget.name,
|
2923
|
+
"pert_genetic": wl.GeneticPerturbation.name,
|
2924
|
+
"pert_compound": wl.Compound.name,
|
2925
|
+
"pert_biologic": wl.Biologic.name,
|
2926
|
+
"pert_physical": wl.EnvironmentalPerturbation.name,
|
2927
|
+
}.items()
|
2928
|
+
if k in adata.obs.columns
|
2929
|
+
}
|
2930
|
+
# if "donor_id" in self.PT_CATEGORICALS:
|
2931
|
+
# self.PT_CATEGORICALS["donor_id"] = Donor.name
|
2932
|
+
|
2933
|
+
def _setup_sources(self, adata: ad.AnnData):
|
2934
|
+
"""Set up data sources."""
|
2935
|
+
self.PT_SOURCES = {}
|
2936
|
+
# if "cell_line" in adata.obs.columns:
|
2937
|
+
# self.PT_SOURCES["cell_line"] = (
|
2938
|
+
# bt.Source.filter(name="depmap").first()
|
2939
|
+
# )
|
2940
|
+
if "pert_compound" in adata.obs.columns:
|
2941
|
+
import bionty as bt
|
2942
|
+
|
2943
|
+
self.PT_SOURCES["pert_compound"] = bt.Source.filter(
|
2944
|
+
entity="wetlab.Compound", name="chebi"
|
2945
|
+
).first()
|
2946
|
+
|
2947
|
+
def _validate_initial_data(self, adata: ad.AnnData):
|
2948
|
+
"""Validate the initial data structure."""
|
2949
|
+
self._validate_required_columns(adata)
|
2950
|
+
self._validate_perturbation_types(adata)
|
2951
|
+
|
2952
|
+
def _validate_required_columns(self, adata: ad.AnnData):
|
2953
|
+
"""Validate required columns are present."""
|
2954
|
+
if "pert_target" not in adata.obs.columns:
|
2955
|
+
if (
|
2956
|
+
"pert_name" not in adata.obs.columns
|
2957
|
+
or "pert_type" not in adata.obs.columns
|
2958
|
+
):
|
2959
|
+
raise ValidationError(
|
2960
|
+
"either 'pert_target' or both 'pert_name' and 'pert_type' must be present"
|
2961
|
+
)
|
2962
|
+
else:
|
2963
|
+
if "pert_name" not in adata.obs.columns:
|
2964
|
+
logger.warning(
|
2965
|
+
"no 'pert' column found in adata.obs, will only curate 'pert_target'"
|
2966
|
+
)
|
2967
|
+
elif "pert_type" not in adata.obs.columns:
|
2968
|
+
raise ValidationError("both 'pert' and 'pert_type' must be present")
|
2969
|
+
|
2970
|
+
def _validate_perturbation_types(self, adata: ad.AnnData):
|
2971
|
+
"""Validate perturbation types."""
|
2972
|
+
if "pert_type" in adata.obs.columns:
|
2973
|
+
data_pert_types = set(adata.obs["pert_type"].unique())
|
2974
|
+
invalid_pert_types = data_pert_types - self.PERT_COLUMNS
|
2975
|
+
if invalid_pert_types:
|
2976
|
+
raise ValidationError(
|
2977
|
+
f"invalid pert_type found: {invalid_pert_types}!\n"
|
2978
|
+
f" → allowed values: {self.PERT_COLUMNS}"
|
2979
|
+
)
|
2980
|
+
self._process_perturbation_types(adata, data_pert_types)
|
2981
|
+
|
2982
|
+
def _process_perturbation_types(self, adata: ad.AnnData, pert_types: set):
|
2983
|
+
"""Process and map perturbation types."""
|
2984
|
+
for pert_type in pert_types:
|
2985
|
+
col_name = "pert_" + pert_type
|
2986
|
+
adata.obs[col_name] = adata.obs["pert_name"].where(
|
2987
|
+
adata.obs["pert_type"] == pert_type, None
|
2988
|
+
)
|
2989
|
+
if adata.obs[col_name].dtype.name == "category":
|
2990
|
+
adata.obs[col_name].cat.remove_unused_categories()
|
2991
|
+
logger.important(f"mapped 'pert_name' to '{col_name}'")
|
1731
2992
|
|
1732
|
-
|
2993
|
+
def _setup_compound_source(self):
|
2994
|
+
"""Set up the compound source with muted logging."""
|
2995
|
+
import bionty as bt
|
2996
|
+
import wetlab as wl
|
2997
|
+
|
2998
|
+
with logger.mute():
|
2999
|
+
chebi_source = bt.Source.filter(
|
3000
|
+
entity="wetlab.Compound", name="chebi"
|
3001
|
+
).first()
|
3002
|
+
if not chebi_source:
|
3003
|
+
wl.Compound.add_source(
|
3004
|
+
bt.Source.filter(entity="Drug", name="chebi").first()
|
3005
|
+
)
|
1733
3006
|
|
1734
|
-
|
1735
|
-
the object
|
3007
|
+
def validate(self) -> bool: # type: ignore
|
3008
|
+
"""Validate the AnnData object."""
|
3009
|
+
validated = super().validate()
|
3010
|
+
|
3011
|
+
if self._pert_dose:
|
3012
|
+
validated &= self._validate_dose_column()
|
3013
|
+
if self._pert_time:
|
3014
|
+
validated &= self._validate_time_column()
|
3015
|
+
|
3016
|
+
self._is_validated = validated
|
3017
|
+
|
3018
|
+
# sort columns
|
3019
|
+
first_columns = [
|
3020
|
+
"pert_target",
|
3021
|
+
"pert_genetic",
|
3022
|
+
"pert_compound",
|
3023
|
+
"pert_biologic",
|
3024
|
+
"pert_physical",
|
3025
|
+
"pert_dose",
|
3026
|
+
"pert_time",
|
3027
|
+
"organism",
|
3028
|
+
"cell_line",
|
3029
|
+
"cell_type",
|
3030
|
+
"disease",
|
3031
|
+
"tissue_type",
|
3032
|
+
"tissue",
|
3033
|
+
"assay",
|
3034
|
+
"suspension_type",
|
3035
|
+
"donor_id",
|
3036
|
+
"sex",
|
3037
|
+
"self_reported_ethnicity",
|
3038
|
+
"development_stage",
|
3039
|
+
"pert_name",
|
3040
|
+
"pert_type",
|
3041
|
+
]
|
3042
|
+
sorted_columns = [
|
3043
|
+
col for col in first_columns if col in self._adata.obs.columns
|
3044
|
+
] + [col for col in self._adata.obs.columns if col not in first_columns]
|
3045
|
+
# must assign to self._df to ensure .standardize works correctly
|
3046
|
+
self._obs_df = self._adata.obs[sorted_columns]
|
3047
|
+
self._adata.obs = self._obs_df
|
3048
|
+
return validated
|
3049
|
+
|
3050
|
+
def standardize(self, key: str) -> pd.DataFrame:
|
3051
|
+
"""Standardize the AnnData object."""
|
3052
|
+
super().standardize(key)
|
3053
|
+
self._adata.obs = self._obs_df
|
3054
|
+
|
3055
|
+
def _validate_dose_column(self) -> bool:
|
3056
|
+
"""Validate the dose column."""
|
3057
|
+
if not Feature.filter(name="pert_dose").exists():
|
3058
|
+
Feature(name="pert_dose", dtype="str").save() # type: ignore
|
3059
|
+
|
3060
|
+
dose_errors = DoseHandler.validate_values(self._adata.obs["pert_dose"])
|
3061
|
+
if dose_errors:
|
3062
|
+
self._log_validation_errors("pert_dose", dose_errors)
|
3063
|
+
return False
|
3064
|
+
return True
|
3065
|
+
|
3066
|
+
def _validate_time_column(self) -> bool:
|
3067
|
+
"""Validate the time column."""
|
3068
|
+
if not Feature.filter(name="pert_time").exists():
|
3069
|
+
Feature(name="pert_time", dtype="str").save() # type: ignore
|
3070
|
+
|
3071
|
+
time_errors = TimeHandler.validate_values(self._adata.obs["pert_time"])
|
3072
|
+
if time_errors:
|
3073
|
+
self._log_validation_errors("pert_time", time_errors)
|
3074
|
+
return False
|
3075
|
+
return True
|
3076
|
+
|
3077
|
+
def _log_validation_errors(self, column: str, errors: list):
|
3078
|
+
"""Log validation errors with formatting."""
|
3079
|
+
errors_print = "\n ".join(errors)
|
3080
|
+
logger.warning(
|
3081
|
+
f"invalid {column} values found!\n {errors_print}\n"
|
3082
|
+
f" → run {colors.cyan('standardize_dose_time()')}"
|
3083
|
+
)
|
1736
3084
|
|
1737
|
-
|
3085
|
+
def standardize_dose_time(self) -> pd.DataFrame:
|
3086
|
+
"""Standardize dose and time values."""
|
3087
|
+
standardized_df = self._adata.obs.copy()
|
1738
3088
|
|
1739
|
-
|
1740
|
-
|
1741
|
-
|
1742
|
-
|
1743
|
-
using_key: A reference LaminDB instance.
|
1744
|
-
organism: The organism name.
|
1745
|
-
sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
|
1746
|
-
exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
|
1747
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1748
|
-
using the exclude parameter ensures they are not validated.
|
1749
|
-
verbosity: The verbosity level of the logger.
|
1750
|
-
sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
|
1751
|
-
|
1752
|
-
Examples:
|
1753
|
-
>>> import lamindb as ln
|
1754
|
-
>>> import bionty as bt
|
1755
|
-
>>> curator = ln.Curator.from_spatialdata(
|
1756
|
-
... sdata,
|
1757
|
-
... var_index={
|
1758
|
-
... "table_1": bt.Gene.ensembl_gene_id,
|
1759
|
-
... },
|
1760
|
-
... categoricals={
|
1761
|
-
... "table1":
|
1762
|
-
... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name},
|
1763
|
-
... "sample":
|
1764
|
-
... {"experimental_factor": bt.ExperimentalFactor.name},
|
1765
|
-
... },
|
1766
|
-
... organism="human",
|
1767
|
-
... )
|
1768
|
-
"""
|
1769
|
-
try:
|
1770
|
-
import spatialdata
|
1771
|
-
except ImportError as e:
|
1772
|
-
raise ImportError(
|
1773
|
-
"Please install spatialdata: pip install spatialdata"
|
1774
|
-
) from e
|
3089
|
+
if "pert_dose" in self._adata.obs.columns:
|
3090
|
+
standardized_df = self._standardize_column(
|
3091
|
+
standardized_df, "pert_dose", is_dose=True
|
3092
|
+
)
|
1775
3093
|
|
1776
|
-
|
3094
|
+
if "pert_time" in self._adata.obs.columns:
|
3095
|
+
standardized_df = self._standardize_column(
|
3096
|
+
standardized_df, "pert_time", is_dose=False
|
3097
|
+
)
|
1777
3098
|
|
1778
|
-
|
1779
|
-
|
1780
|
-
|
1781
|
-
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
|
1787
|
-
|
1788
|
-
|
3099
|
+
self._adata.obs = standardized_df
|
3100
|
+
return standardized_df
|
3101
|
+
|
3102
|
+
def _standardize_column(
|
3103
|
+
self, df: pd.DataFrame, column: str, is_dose: bool
|
3104
|
+
) -> pd.DataFrame:
|
3105
|
+
"""Standardize values in a specific column."""
|
3106
|
+
for idx, value in self._adata.obs[column].items():
|
3107
|
+
if pd.isna(value) or (
|
3108
|
+
isinstance(value, str) and (not value.strip() or value.lower() == "nan")
|
3109
|
+
):
|
3110
|
+
df.at[idx, column] = None
|
3111
|
+
continue
|
1789
3112
|
|
3113
|
+
try:
|
3114
|
+
num, unit = ValueUnit.parse_value_unit(value, is_dose=is_dose)
|
3115
|
+
df.at[idx, column] = f"{num}{unit}"
|
3116
|
+
except ValueError:
|
3117
|
+
continue
|
1790
3118
|
|
1791
|
-
|
1792
|
-
"""Get a registry instance using a specific instance."""
|
1793
|
-
if using_key is not None and using_key != "default":
|
1794
|
-
return registry.using(using_key)
|
1795
|
-
return registry
|
3119
|
+
return df
|
1796
3120
|
|
1797
3121
|
|
1798
3122
|
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
|
@@ -1859,10 +3183,7 @@ def check_registry_organism(registry: Record, organism: str | None = None) -> di
|
|
1859
3183
|
import bionty as bt
|
1860
3184
|
|
1861
3185
|
if organism is None and bt.settings.organism is None:
|
1862
|
-
|
1863
|
-
f"{registry.__name__} registry requires an organism!\n"
|
1864
|
-
" → please pass an organism name via organism="
|
1865
|
-
)
|
3186
|
+
return {}
|
1866
3187
|
return {"organism": organism or bt.settings.organism.name}
|
1867
3188
|
return {}
|
1868
3189
|
|
@@ -1871,11 +3192,11 @@ def validate_categories(
|
|
1871
3192
|
values: Iterable[str],
|
1872
3193
|
field: FieldAttr,
|
1873
3194
|
key: str,
|
1874
|
-
using_key: str | None = None,
|
1875
3195
|
organism: str | None = None,
|
1876
3196
|
source: Record | None = None,
|
1877
3197
|
exclude: str | list | None = None,
|
1878
3198
|
hint_print: str | None = None,
|
3199
|
+
curator: CatManager | None = None,
|
1879
3200
|
) -> tuple[bool, list]:
|
1880
3201
|
"""Validate ontology terms in a pandas series using LaminDB registries.
|
1881
3202
|
|
@@ -1883,7 +3204,6 @@ def validate_categories(
|
|
1883
3204
|
values: The values to validate.
|
1884
3205
|
field: The field attribute.
|
1885
3206
|
key: The key referencing the slot in the DataFrame.
|
1886
|
-
using_key: A reference LaminDB instance.
|
1887
3207
|
organism: The organism name.
|
1888
3208
|
source: The source record.
|
1889
3209
|
exclude: Exclude specific values from validation.
|
@@ -1918,22 +3238,8 @@ def validate_categories(
|
|
1918
3238
|
non_validated = inspect_result.non_validated
|
1919
3239
|
syn_mapper = inspect_result.synonyms_mapper
|
1920
3240
|
|
1921
|
-
# inspect the non-validated values from the using_key instance
|
1922
|
-
values_validated = []
|
1923
|
-
if using_key is not None and using_key != "default" and non_validated:
|
1924
|
-
registry_using = get_registry_instance(registry, using_key)
|
1925
|
-
inspect_result = inspect_instance(
|
1926
|
-
values=non_validated,
|
1927
|
-
field=field,
|
1928
|
-
registry=registry_using,
|
1929
|
-
exclude=exclude,
|
1930
|
-
**kwargs,
|
1931
|
-
)
|
1932
|
-
non_validated = inspect_result.non_validated
|
1933
|
-
values_validated += inspect_result.validated
|
1934
|
-
syn_mapper.update(inspect_result.synonyms_mapper)
|
1935
|
-
|
1936
3241
|
# inspect the non-validated values from public (bionty only)
|
3242
|
+
values_validated = []
|
1937
3243
|
if hasattr(registry, "public"):
|
1938
3244
|
verbosity = settings.verbosity
|
1939
3245
|
try:
|
@@ -1969,12 +3275,16 @@ def validate_categories(
|
|
1969
3275
|
warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
|
1970
3276
|
if n_non_validated > len(syn_mapper):
|
1971
3277
|
if syn_mapper:
|
1972
|
-
warning_message += " for remaining terms:\n"
|
3278
|
+
warning_message += "\n for remaining terms:\n"
|
1973
3279
|
warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
|
1974
3280
|
|
1975
3281
|
if logger.indent == "":
|
1976
3282
|
_log_mapping_info()
|
1977
3283
|
logger.warning(warning_message)
|
3284
|
+
if curator is not None:
|
3285
|
+
curator._validate_category_error_messages = strip_ansi_codes(
|
3286
|
+
warning_message
|
3287
|
+
)
|
1978
3288
|
logger.indent = ""
|
1979
3289
|
return False, non_validated
|
1980
3290
|
|
@@ -1982,7 +3292,6 @@ def validate_categories(
|
|
1982
3292
|
def standardize_categories(
|
1983
3293
|
values: Iterable[str],
|
1984
3294
|
field: FieldAttr,
|
1985
|
-
using_key: str | None = None,
|
1986
3295
|
organism: str | None = None,
|
1987
3296
|
source: Record | None = None,
|
1988
3297
|
) -> dict:
|
@@ -1999,30 +3308,15 @@ def standardize_categories(
|
|
1999
3308
|
mute=True,
|
2000
3309
|
return_mapper=True,
|
2001
3310
|
)
|
2002
|
-
|
2003
|
-
if len(values) > len(syn_mapper): # type: ignore
|
2004
|
-
# standardize values using the using_key instance
|
2005
|
-
if using_key is not None and using_key != "default":
|
2006
|
-
registry_using = get_registry_instance(registry, using_key)
|
2007
|
-
syn_mapper.update(
|
2008
|
-
registry_using.standardize(
|
2009
|
-
[v for v in values if v not in syn_mapper],
|
2010
|
-
field=field.field.name,
|
2011
|
-
organism=organism,
|
2012
|
-
source=source,
|
2013
|
-
mute=True,
|
2014
|
-
return_mapper=True,
|
2015
|
-
)
|
2016
|
-
)
|
2017
3311
|
return syn_mapper
|
2018
3312
|
|
2019
3313
|
|
2020
3314
|
def validate_categories_in_df(
|
2021
3315
|
df: pd.DataFrame,
|
2022
3316
|
fields: dict[str, FieldAttr],
|
2023
|
-
using_key: str | None = None,
|
2024
3317
|
sources: dict[str, Record] = None,
|
2025
3318
|
exclude: dict | None = None,
|
3319
|
+
curator: CatManager | None = None,
|
2026
3320
|
**kwargs,
|
2027
3321
|
) -> tuple[bool, dict]:
|
2028
3322
|
"""Validate categories in DataFrame columns using LaminDB registries."""
|
@@ -2038,9 +3332,9 @@ def validate_categories_in_df(
|
|
2038
3332
|
df[key],
|
2039
3333
|
field=field,
|
2040
3334
|
key=key,
|
2041
|
-
using_key=using_key,
|
2042
3335
|
source=sources.get(key),
|
2043
3336
|
exclude=exclude.get(key) if exclude else None,
|
3337
|
+
curator=curator,
|
2044
3338
|
**kwargs,
|
2045
3339
|
)
|
2046
3340
|
validated &= is_val
|
@@ -2055,80 +3349,72 @@ def save_artifact(
|
|
2055
3349
|
columns_field: FieldAttr | dict[str, FieldAttr],
|
2056
3350
|
description: str | None = None,
|
2057
3351
|
organism: str | None = None,
|
2058
|
-
adata: ad.AnnData | None = None,
|
2059
3352
|
key: str | None = None,
|
3353
|
+
artifact: Artifact | None = None,
|
2060
3354
|
revises: Artifact | None = None,
|
2061
3355
|
run: Run | None = None,
|
3356
|
+
schema: Schema | None = None,
|
2062
3357
|
) -> Artifact:
|
2063
3358
|
"""Save all metadata with an Artifact.
|
2064
3359
|
|
2065
3360
|
Args:
|
2066
|
-
data: The DataFrame
|
3361
|
+
data: The DataFrame/AnnData/MuData object to save.
|
2067
3362
|
fields: A dictionary mapping obs_column to registry_field.
|
2068
3363
|
columns_field: The registry field to validate variables index against.
|
2069
3364
|
description: A description of the artifact.
|
2070
3365
|
organism: The organism name.
|
2071
|
-
adata: The AnnData object to save and get n_observations, must be provided if data is a path.
|
2072
3366
|
type: The artifact type.
|
2073
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a
|
3367
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
|
3368
|
+
artifact: A already registered artifact. Passing this will not save a new artifact from data.
|
2074
3369
|
revises: Previous version of the artifact. Triggers a revision.
|
2075
3370
|
run: The run that creates the artifact.
|
2076
3371
|
|
2077
3372
|
Returns:
|
2078
3373
|
The saved Artifact.
|
2079
3374
|
"""
|
2080
|
-
from .._artifact import data_is_anndata
|
3375
|
+
from .._artifact import data_is_anndata, data_is_mudata
|
2081
3376
|
from ..core._data import add_labels
|
2082
3377
|
|
2083
|
-
artifact = None
|
2084
|
-
if data_is_anndata(data):
|
2085
|
-
assert adata is not None # noqa: S101
|
2086
|
-
artifact = Artifact.from_anndata(
|
2087
|
-
data, description=description, key=key, revises=revises, run=run
|
2088
|
-
)
|
2089
|
-
artifact.n_observations = adata.shape[0]
|
2090
|
-
data = adata
|
2091
|
-
|
2092
|
-
elif isinstance(data, pd.DataFrame):
|
2093
|
-
artifact = Artifact.from_df(
|
2094
|
-
data, description=description, key=key, revises=revises, run=run
|
2095
|
-
)
|
2096
|
-
else:
|
2097
|
-
try:
|
2098
|
-
from mudata import MuData
|
2099
|
-
|
2100
|
-
if isinstance(data, MuData):
|
2101
|
-
artifact = Artifact.from_mudata(
|
2102
|
-
data,
|
2103
|
-
description=description,
|
2104
|
-
key=key,
|
2105
|
-
revises=revises,
|
2106
|
-
run=run,
|
2107
|
-
)
|
2108
|
-
artifact.n_observations = data.n_obs
|
2109
|
-
except ImportError:
|
2110
|
-
pass
|
2111
3378
|
if artifact is None:
|
2112
|
-
|
3379
|
+
if data_is_anndata(data):
|
3380
|
+
artifact = Artifact.from_anndata(
|
3381
|
+
data, description=description, key=key, revises=revises, run=run
|
3382
|
+
)
|
3383
|
+
elif isinstance(data, pd.DataFrame):
|
3384
|
+
artifact = Artifact.from_df(
|
3385
|
+
data, description=description, key=key, revises=revises, run=run
|
3386
|
+
)
|
3387
|
+
elif data_is_mudata(data):
|
3388
|
+
artifact = Artifact.from_mudata(
|
3389
|
+
data,
|
3390
|
+
description=description,
|
3391
|
+
key=key,
|
3392
|
+
revises=revises,
|
3393
|
+
run=run,
|
3394
|
+
)
|
3395
|
+
artifact.schema = schema
|
2113
3396
|
artifact.save()
|
2114
3397
|
|
2115
|
-
|
2116
|
-
(
|
2117
|
-
|
2118
|
-
|
2119
|
-
|
2120
|
-
|
2121
|
-
|
2122
|
-
|
3398
|
+
if organism is not None:
|
3399
|
+
feature_kwargs = check_registry_organism(
|
3400
|
+
(
|
3401
|
+
list(columns_field.values())[0].field.model
|
3402
|
+
if isinstance(columns_field, dict)
|
3403
|
+
else columns_field.field.model
|
3404
|
+
),
|
3405
|
+
organism,
|
3406
|
+
)
|
3407
|
+
else:
|
3408
|
+
feature_kwargs = {}
|
2123
3409
|
|
2124
3410
|
if artifact.otype == "DataFrame":
|
2125
|
-
artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
|
3411
|
+
artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
|
2126
3412
|
elif artifact.otype == "AnnData":
|
2127
|
-
artifact.features._add_set_from_anndata(
|
3413
|
+
artifact.features._add_set_from_anndata( # type: ignore
|
2128
3414
|
var_field=columns_field, **feature_kwargs
|
2129
3415
|
)
|
2130
3416
|
elif artifact.otype == "MuData":
|
2131
|
-
artifact.features._add_set_from_mudata(
|
3417
|
+
artifact.features._add_set_from_mudata( # type: ignore
|
2132
3418
|
var_fields=columns_field, **feature_kwargs
|
2133
3419
|
)
|
2134
3420
|
else:
|
@@ -2148,7 +3434,7 @@ def save_artifact(
|
|
2148
3434
|
filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
|
2149
3435
|
df = data if isinstance(data, pd.DataFrame) else data.obs
|
2150
3436
|
# multi-value columns are separated by "|"
|
2151
|
-
if df[key].str.contains("|").any():
|
3437
|
+
if not df[key].isna().all() and df[key].str.contains("|").any():
|
2152
3438
|
values = df[key].str.split("|").explode().unique()
|
2153
3439
|
else:
|
2154
3440
|
values = df[key].unique()
|
@@ -2202,7 +3488,7 @@ def save_artifact(
|
|
2202
3488
|
)
|
2203
3489
|
|
2204
3490
|
slug = ln_setup.settings.instance.slug
|
2205
|
-
if ln_setup.settings.instance.is_remote: #
|
3491
|
+
if ln_setup.settings.instance.is_remote: # pdagma: no cover
|
2206
3492
|
logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
|
2207
3493
|
return artifact
|
2208
3494
|
|
@@ -2224,7 +3510,6 @@ def update_registry(
|
|
2224
3510
|
values: list[str],
|
2225
3511
|
field: FieldAttr,
|
2226
3512
|
key: str,
|
2227
|
-
using_key: str | None = None,
|
2228
3513
|
validated_only: bool = True,
|
2229
3514
|
df: pd.DataFrame | None = None,
|
2230
3515
|
organism: str | None = None,
|
@@ -2233,13 +3518,12 @@ def update_registry(
|
|
2233
3518
|
exclude: str | list | None = None,
|
2234
3519
|
**kwargs,
|
2235
3520
|
) -> None:
|
2236
|
-
"""Save features or labels records in the default instance
|
3521
|
+
"""Save features or labels records in the default instance..
|
2237
3522
|
|
2238
3523
|
Args:
|
2239
3524
|
values: A list of values to be saved as labels.
|
2240
3525
|
field: The FieldAttr object representing the field for which labels are being saved.
|
2241
3526
|
key: The name of the feature to save.
|
2242
|
-
using_key: The name of the instance from which to transfer labels (if applicable).
|
2243
3527
|
validated_only: If True, only save validated labels.
|
2244
3528
|
df: A DataFrame to save labels from.
|
2245
3529
|
organism: The organism name.
|
@@ -2290,22 +3574,10 @@ def update_registry(
|
|
2290
3574
|
i for i in values if i not in existing_and_public_labels
|
2291
3575
|
]
|
2292
3576
|
|
2293
|
-
# inspect and save validated records the using_key instance
|
2294
|
-
(
|
2295
|
-
labels_saved[f"from {using_key}"],
|
2296
|
-
non_validated_labels,
|
2297
|
-
) = update_registry_from_using_instance(
|
2298
|
-
non_validated_labels,
|
2299
|
-
field=field,
|
2300
|
-
using_key=using_key,
|
2301
|
-
exclude=exclude,
|
2302
|
-
**filter_kwargs,
|
2303
|
-
)
|
2304
|
-
|
2305
3577
|
# save non-validated/new records
|
2306
3578
|
labels_saved["new"] = non_validated_labels
|
2307
3579
|
if not validated_only:
|
2308
|
-
non_validated_records = []
|
3580
|
+
non_validated_records: RecordList[Any] = [] # type: ignore
|
2309
3581
|
if df is not None and registry == Feature:
|
2310
3582
|
nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
|
2311
3583
|
non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
|
@@ -2379,48 +3651,6 @@ def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
|
|
2379
3651
|
is_feature.children.add(*all_records)
|
2380
3652
|
|
2381
3653
|
|
2382
|
-
def update_registry_from_using_instance(
|
2383
|
-
values: list[str],
|
2384
|
-
field: FieldAttr,
|
2385
|
-
using_key: str | None = None,
|
2386
|
-
exclude: str | list | None = None,
|
2387
|
-
**kwargs,
|
2388
|
-
) -> tuple[list[str], list[str]]:
|
2389
|
-
"""Save features or labels records from the using_key instance.
|
2390
|
-
|
2391
|
-
Args:
|
2392
|
-
values: A list of values to be saved as labels.
|
2393
|
-
field: The FieldAttr object representing the field for which labels are being saved.
|
2394
|
-
using_key: The name of the instance from which to transfer labels (if applicable).
|
2395
|
-
kwargs: Additional keyword arguments to pass to the registry model.
|
2396
|
-
|
2397
|
-
Returns:
|
2398
|
-
A tuple containing the list of saved labels and the list of non-saved labels.
|
2399
|
-
"""
|
2400
|
-
labels_saved = []
|
2401
|
-
not_saved = values
|
2402
|
-
|
2403
|
-
if using_key is not None and using_key != "default":
|
2404
|
-
registry_using = get_registry_instance(field.field.model, using_key)
|
2405
|
-
|
2406
|
-
inspect_result_using = inspect_instance(
|
2407
|
-
values=values,
|
2408
|
-
field=field,
|
2409
|
-
registry=registry_using,
|
2410
|
-
exclude=exclude,
|
2411
|
-
**kwargs,
|
2412
|
-
)
|
2413
|
-
labels_using = registry_using.filter(
|
2414
|
-
**{f"{field.field.name}__in": inspect_result_using.validated}
|
2415
|
-
).all()
|
2416
|
-
for label_using in labels_using:
|
2417
|
-
label_using.save()
|
2418
|
-
labels_saved.append(getattr(label_using, field.field.name))
|
2419
|
-
not_saved = inspect_result_using.non_validated
|
2420
|
-
|
2421
|
-
return labels_saved, not_saved
|
2422
|
-
|
2423
|
-
|
2424
3654
|
def _save_organism(name: str):
|
2425
3655
|
"""Save an organism record."""
|
2426
3656
|
import bionty as bt
|
@@ -2445,4 +3675,121 @@ def _ref_is_name(field: FieldAttr) -> bool | None:
|
|
2445
3675
|
return field.field.name == name_field
|
2446
3676
|
|
2447
3677
|
|
2448
|
-
|
3678
|
+
# backward compat constructors ------------------
|
3679
|
+
|
3680
|
+
|
3681
|
+
@classmethod # type: ignore
|
3682
|
+
def from_df(
|
3683
|
+
cls,
|
3684
|
+
df: pd.DataFrame,
|
3685
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
3686
|
+
columns: FieldAttr = Feature.name,
|
3687
|
+
verbosity: str = "hint",
|
3688
|
+
organism: str | None = None,
|
3689
|
+
) -> DataFrameCatManager:
|
3690
|
+
return DataFrameCatManager(
|
3691
|
+
df=df,
|
3692
|
+
categoricals=categoricals,
|
3693
|
+
columns=columns,
|
3694
|
+
verbosity=verbosity,
|
3695
|
+
organism=organism,
|
3696
|
+
)
|
3697
|
+
|
3698
|
+
|
3699
|
+
@classmethod # type: ignore
|
3700
|
+
def from_anndata(
|
3701
|
+
cls,
|
3702
|
+
data: ad.AnnData | UPathStr,
|
3703
|
+
var_index: FieldAttr,
|
3704
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
3705
|
+
obs_columns: FieldAttr = Feature.name,
|
3706
|
+
verbosity: str = "hint",
|
3707
|
+
organism: str | None = None,
|
3708
|
+
sources: dict[str, Record] | None = None,
|
3709
|
+
) -> AnnDataCatManager:
|
3710
|
+
return AnnDataCatManager(
|
3711
|
+
data=data,
|
3712
|
+
var_index=var_index,
|
3713
|
+
categoricals=categoricals,
|
3714
|
+
obs_columns=obs_columns,
|
3715
|
+
verbosity=verbosity,
|
3716
|
+
organism=organism,
|
3717
|
+
sources=sources,
|
3718
|
+
)
|
3719
|
+
|
3720
|
+
|
3721
|
+
@classmethod # type: ignore
|
3722
|
+
def from_mudata(
|
3723
|
+
cls,
|
3724
|
+
mdata: MuData,
|
3725
|
+
var_index: dict[str, dict[str, FieldAttr]],
|
3726
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
3727
|
+
verbosity: str = "hint",
|
3728
|
+
organism: str | None = None,
|
3729
|
+
) -> MuDataCatManager:
|
3730
|
+
return MuDataCatManager(
|
3731
|
+
mdata=mdata,
|
3732
|
+
var_index=var_index,
|
3733
|
+
categoricals=categoricals,
|
3734
|
+
verbosity=verbosity,
|
3735
|
+
organism=organism,
|
3736
|
+
)
|
3737
|
+
|
3738
|
+
|
3739
|
+
@classmethod # type: ignore
|
3740
|
+
def from_tiledbsoma(
|
3741
|
+
cls,
|
3742
|
+
experiment_uri: UPathStr,
|
3743
|
+
var_index: dict[str, tuple[str, FieldAttr]],
|
3744
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
3745
|
+
obs_columns: FieldAttr = Feature.name,
|
3746
|
+
organism: str | None = None,
|
3747
|
+
sources: dict[str, Record] | None = None,
|
3748
|
+
exclude: dict[str, str | list[str]] | None = None,
|
3749
|
+
) -> TiledbsomaCatManager:
|
3750
|
+
return TiledbsomaCatManager(
|
3751
|
+
experiment_uri=experiment_uri,
|
3752
|
+
var_index=var_index,
|
3753
|
+
categoricals=categoricals,
|
3754
|
+
obs_columns=obs_columns,
|
3755
|
+
organism=organism,
|
3756
|
+
sources=sources,
|
3757
|
+
exclude=exclude,
|
3758
|
+
)
|
3759
|
+
|
3760
|
+
|
3761
|
+
@classmethod # type: ignore
|
3762
|
+
def from_spatialdata(
|
3763
|
+
cls,
|
3764
|
+
sdata,
|
3765
|
+
var_index: dict[str, FieldAttr],
|
3766
|
+
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
3767
|
+
organism: str | None = None,
|
3768
|
+
sources: dict[str, dict[str, Record]] | None = None,
|
3769
|
+
exclude: dict[str, dict] | None = None,
|
3770
|
+
verbosity: str = "hint",
|
3771
|
+
*,
|
3772
|
+
sample_metadata_key: str = "sample",
|
3773
|
+
):
|
3774
|
+
try:
|
3775
|
+
import spatialdata
|
3776
|
+
except ImportError as e:
|
3777
|
+
raise ImportError("Please install spatialdata: pip install spatialdata") from e
|
3778
|
+
|
3779
|
+
return SpatialDataCatManager(
|
3780
|
+
sdata=sdata,
|
3781
|
+
var_index=var_index,
|
3782
|
+
categoricals=categoricals,
|
3783
|
+
verbosity=verbosity,
|
3784
|
+
organism=organism,
|
3785
|
+
sources=sources,
|
3786
|
+
exclude=exclude,
|
3787
|
+
sample_metadata_key=sample_metadata_key,
|
3788
|
+
)
|
3789
|
+
|
3790
|
+
|
3791
|
+
CatManager.from_df = from_df # type: ignore
|
3792
|
+
CatManager.from_anndata = from_anndata # type: ignore
|
3793
|
+
CatManager.from_mudata = from_mudata # type: ignore
|
3794
|
+
CatManager.from_spatialdata = from_spatialdata # type: ignore
|
3795
|
+
CatManager.from_tiledbsoma = from_tiledbsoma # type: ignore
|