lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +14 -5
- lamindb/_artifact.py +174 -57
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +85 -51
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +222 -81
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +59 -17
- lamindb/_record.py +171 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +33 -10
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +106 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +39 -36
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +54 -44
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +20 -7
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +66 -20
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +7 -13
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +41 -0
- lamindb/core/storage/_backed_access.py +2 -2
- lamindb/core/storage/_pyarrow_dataset.py +25 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +41 -22
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2168 -833
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +423 -156
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
- lamindb-1.1.0.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.4.dist-info/RECORD +0 -102
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/curators/__init__.py
CHANGED
@@ -1,21 +1,52 @@
|
|
1
|
+
"""Curators.
|
2
|
+
|
3
|
+
.. autosummary::
|
4
|
+
:toctree: .
|
5
|
+
|
6
|
+
Curator
|
7
|
+
DataFrameCurator
|
8
|
+
AnnDataCurator
|
9
|
+
|
10
|
+
"""
|
11
|
+
|
1
12
|
from __future__ import annotations
|
2
13
|
|
3
14
|
import copy
|
4
|
-
import
|
15
|
+
import random
|
16
|
+
import re
|
17
|
+
from importlib import resources
|
5
18
|
from itertools import chain
|
6
|
-
from typing import TYPE_CHECKING
|
19
|
+
from typing import TYPE_CHECKING, Any, Literal
|
7
20
|
|
8
21
|
import anndata as ad
|
9
22
|
import lamindb_setup as ln_setup
|
10
23
|
import pandas as pd
|
24
|
+
import pandera
|
11
25
|
import pyarrow as pa
|
12
26
|
from lamin_utils import colors, logger
|
27
|
+
from lamindb_setup.core import deprecated, upath
|
13
28
|
from lamindb_setup.core._docs import doc_args
|
14
29
|
from lamindb_setup.core.upath import UPath
|
15
30
|
|
31
|
+
from lamindb.core.storage._backed_access import backed_access
|
32
|
+
|
33
|
+
from ._cellxgene_schemas import _read_schema_versions
|
34
|
+
|
35
|
+
if TYPE_CHECKING:
|
36
|
+
from anndata import AnnData
|
37
|
+
from lamindb_setup.core.types import UPathStr
|
38
|
+
|
39
|
+
from lamindb.base.types import FieldAttr
|
40
|
+
from lamindb.models import Record
|
41
|
+
from lamindb._feature import parse_dtype, parse_dtype_single_cat
|
16
42
|
from lamindb.base.types import FieldAttr # noqa
|
43
|
+
from lamindb.core._data import add_labels
|
44
|
+
from lamindb.core._feature_manager import parse_staged_feature_sets_from_anndata
|
45
|
+
from lamindb.core._settings import settings
|
17
46
|
from lamindb.models import (
|
18
47
|
Artifact,
|
48
|
+
CanCurate,
|
49
|
+
Collection,
|
19
50
|
Feature,
|
20
51
|
Record,
|
21
52
|
Run,
|
@@ -23,15 +54,25 @@ from lamindb.models import (
|
|
23
54
|
ULabel,
|
24
55
|
)
|
25
56
|
|
57
|
+
from .._artifact import data_is_anndata
|
26
58
|
from .._from_values import _format_values
|
27
|
-
from ..
|
59
|
+
from ..errors import InvalidArgument, ValidationError
|
28
60
|
|
29
61
|
if TYPE_CHECKING:
|
30
|
-
from collections.abc import Iterable
|
62
|
+
from collections.abc import Iterable, MutableMapping
|
31
63
|
from typing import Any
|
32
64
|
|
33
65
|
from lamindb_setup.core.types import UPathStr
|
34
66
|
from mudata import MuData
|
67
|
+
from spatialdata import SpatialData
|
68
|
+
|
69
|
+
from lamindb._query_set import RecordList
|
70
|
+
|
71
|
+
|
72
|
+
def strip_ansi_codes(text):
|
73
|
+
# This pattern matches ANSI escape sequences
|
74
|
+
ansi_pattern = re.compile(r"\x1b\[[0-9;]*m")
|
75
|
+
return ansi_pattern.sub("", text)
|
35
76
|
|
36
77
|
|
37
78
|
class CurateLookup:
|
@@ -40,8 +81,6 @@ class CurateLookup:
|
|
40
81
|
Args:
|
41
82
|
categoricals: A dictionary of categorical fields to lookup.
|
42
83
|
slots: A dictionary of slot fields to lookup.
|
43
|
-
using_key: The key of the instance to lookup from. Defaults to the
|
44
|
-
current instance if not specified.
|
45
84
|
public: Whether to lookup from the public instance. Defaults to False.
|
46
85
|
|
47
86
|
Example:
|
@@ -55,48 +94,43 @@ class CurateLookup:
|
|
55
94
|
self,
|
56
95
|
categoricals: dict[str, FieldAttr],
|
57
96
|
slots: dict[str, FieldAttr] = None,
|
58
|
-
using_key: str | None = None,
|
59
97
|
public: bool = False,
|
60
98
|
) -> None:
|
61
99
|
slots = slots or {}
|
62
|
-
self.
|
63
|
-
self._using_key = None if using_key == "default" else using_key
|
64
|
-
self._using_key_name = self._using_key or ln_setup.settings.instance.slug
|
100
|
+
self._categoricals = {**categoricals, **slots}
|
65
101
|
self._public = public
|
66
|
-
debug_message = f"Lookup objects from {colors.italic(self._using_key_name)}"
|
67
|
-
logger.debug(debug_message)
|
68
102
|
|
69
103
|
def __getattr__(self, name):
|
70
|
-
if name in self.
|
71
|
-
registry = self.
|
104
|
+
if name in self._categoricals:
|
105
|
+
registry = self._categoricals[name].field.model
|
72
106
|
if self._public and hasattr(registry, "public"):
|
73
107
|
return registry.public().lookup()
|
74
108
|
else:
|
75
|
-
return
|
109
|
+
return registry.lookup()
|
76
110
|
raise AttributeError(
|
77
111
|
f'"{self.__class__.__name__}" object has no attribute "{name}"'
|
78
112
|
)
|
79
113
|
|
80
114
|
def __getitem__(self, name):
|
81
|
-
if name in self.
|
82
|
-
registry = self.
|
115
|
+
if name in self._categoricals:
|
116
|
+
registry = self._categoricals[name].field.model
|
83
117
|
if self._public and hasattr(registry, "public"):
|
84
118
|
return registry.public().lookup()
|
85
119
|
else:
|
86
|
-
return
|
120
|
+
return registry.lookup()
|
87
121
|
raise AttributeError(
|
88
122
|
f'"{self.__class__.__name__}" object has no attribute "{name}"'
|
89
123
|
)
|
90
124
|
|
91
125
|
def __repr__(self) -> str:
|
92
|
-
if len(self.
|
126
|
+
if len(self._categoricals) > 0:
|
93
127
|
getattr_keys = "\n ".join(
|
94
|
-
[f".{key}" for key in self.
|
128
|
+
[f".{key}" for key in self._categoricals if key.isidentifier()]
|
95
129
|
)
|
96
130
|
getitem_keys = "\n ".join(
|
97
|
-
[str([key]) for key in self.
|
131
|
+
[str([key]) for key in self._categoricals if not key.isidentifier()]
|
98
132
|
)
|
99
|
-
ref = "public" if self._public else
|
133
|
+
ref = "public" if self._public else "registries"
|
100
134
|
return (
|
101
135
|
f"Lookup objects from the {colors.italic(ref)}:\n "
|
102
136
|
f"{colors.green(getattr_keys)}\n "
|
@@ -105,21 +139,422 @@ class CurateLookup:
|
|
105
139
|
" → categories.alveolar_type_1_fibroblast_cell\n\n"
|
106
140
|
"To look up public ontologies, use .lookup(public=True)"
|
107
141
|
)
|
108
|
-
else: #
|
142
|
+
else: # pdagma: no cover
|
109
143
|
return colors.warning("No fields are found!")
|
110
144
|
|
111
145
|
|
112
|
-
|
113
|
-
|
146
|
+
CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
|
147
|
+
|
148
|
+
|
149
|
+
VALIDATE_DOCSTRING = """Validate dataset.
|
150
|
+
|
151
|
+
Raises:
|
152
|
+
lamindb.errors.ValidationError: If validation fails.
|
153
|
+
"""
|
154
|
+
|
155
|
+
SAVE_ARTIFACT_DOCSTRING = """Save an annotated artifact.
|
156
|
+
|
157
|
+
Args:
|
158
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
|
159
|
+
description: A description.
|
160
|
+
revises: Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
|
161
|
+
run: The run that creates the artifact.
|
162
|
+
|
163
|
+
Returns:
|
164
|
+
A saved artifact record.
|
165
|
+
"""
|
166
|
+
|
167
|
+
|
168
|
+
class Curator:
|
169
|
+
"""Dataset curator.
|
170
|
+
|
171
|
+
A `Curator` object makes it easy to validate, standardize & annotate datasets.
|
172
|
+
|
173
|
+
See:
|
174
|
+
- :class:`~lamindb.curators.DataFrameCurator`
|
175
|
+
- :class:`~lamindb.curators.AnnDataCurator`
|
176
|
+
"""
|
177
|
+
|
178
|
+
def __init__(self, dataset: Any, schema: Schema | None = None):
|
179
|
+
self._artifact: Artifact = None # pass the dataset as an artifact
|
180
|
+
self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
|
181
|
+
if isinstance(self._dataset, Artifact):
|
182
|
+
self._artifact = self._dataset
|
183
|
+
if self._artifact.otype in {"DataFrame", "AnnData"}:
|
184
|
+
self._dataset = self._dataset.load()
|
185
|
+
self._schema: Schema | None = schema
|
186
|
+
self._is_validated: bool = False
|
187
|
+
self._cat_manager: CatManager = None # is None for CatManager curators
|
188
|
+
|
189
|
+
@doc_args(VALIDATE_DOCSTRING)
|
190
|
+
def validate(self) -> bool | str:
|
191
|
+
"""{}""" # noqa: D415
|
192
|
+
pass # pdagma: no cover
|
193
|
+
|
194
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
195
|
+
def save_artifact(
|
196
|
+
self,
|
197
|
+
*,
|
198
|
+
key: str | None = None,
|
199
|
+
description: str | None = None,
|
200
|
+
revises: Artifact | None = None,
|
201
|
+
run: Run | None = None,
|
202
|
+
) -> Artifact:
|
203
|
+
"""{}""" # noqa: D415
|
204
|
+
# Note that this docstring has to be consistent with the Artifact()
|
205
|
+
# constructor signature
|
206
|
+
pass
|
207
|
+
|
208
|
+
|
209
|
+
class DataFrameCurator(Curator):
|
210
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
211
|
+
"""Curator for a DataFrame object.
|
212
|
+
|
213
|
+
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
214
|
+
|
215
|
+
Args:
|
216
|
+
dataset: The DataFrame-like object to validate & annotate.
|
217
|
+
schema: A `Schema` object that defines the validation constraints.
|
218
|
+
|
219
|
+
Example::
|
220
|
+
|
221
|
+
import lamindb as ln
|
222
|
+
import bionty as bt
|
223
|
+
|
224
|
+
# define valid labels
|
225
|
+
cell_medium = ln.ULabel(name="CellMedium", is_type=True).save()
|
226
|
+
ln.ULabel(name="DMSO", type=cell_medium).save()
|
227
|
+
ln.ULabel(name="IFNG", type=cell_medium).save()
|
228
|
+
bt.CellType.from_source(name="B cell").save()
|
229
|
+
bt.CellType.from_source(name="T cell").save()
|
230
|
+
|
231
|
+
# define schema
|
232
|
+
schema = ln.Schema(
|
233
|
+
name="small_dataset1_obs_level_metadata",
|
234
|
+
features=[
|
235
|
+
ln.Feature(name="cell_medium", dtype="cat[ULabel[CellMedium]]").save(),
|
236
|
+
ln.Feature(name="sample_note", dtype=str).save(),
|
237
|
+
ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
|
238
|
+
ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
|
239
|
+
],
|
240
|
+
).save()
|
241
|
+
|
242
|
+
# curate a DataFrame
|
243
|
+
df = datasets.small_dataset1(otype="DataFrame")
|
244
|
+
curator = ln.curators.DataFrameCurator(df, schema)
|
245
|
+
artifact = curator.save_artifact(key="example_datasets/dataset1.parquet")
|
246
|
+
assert artifact.schema == schema
|
247
|
+
"""
|
248
|
+
|
249
|
+
def __init__(
|
250
|
+
self,
|
251
|
+
dataset: pd.DataFrame | Artifact,
|
252
|
+
schema: Schema,
|
253
|
+
) -> None:
|
254
|
+
super().__init__(dataset=dataset, schema=schema)
|
255
|
+
if schema.n > 0:
|
256
|
+
# populate features
|
257
|
+
pandera_columns = {}
|
258
|
+
categoricals = {}
|
259
|
+
for feature in schema.features.all():
|
260
|
+
pandera_dtype = (
|
261
|
+
feature.dtype if not feature.dtype.startswith("cat") else "category"
|
262
|
+
)
|
263
|
+
pandera_columns[feature.name] = pandera.Column(
|
264
|
+
pandera_dtype, nullable=feature.nullable
|
265
|
+
)
|
266
|
+
if feature.dtype.startswith("cat"):
|
267
|
+
categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
|
268
|
+
self._pandera_schema = pandera.DataFrameSchema(
|
269
|
+
pandera_columns, coerce=schema.coerce_dtype
|
270
|
+
)
|
271
|
+
# now deal with detailed validation of categoricals
|
272
|
+
self._cat_manager = DataFrameCatManager(
|
273
|
+
self._dataset,
|
274
|
+
categoricals=categoricals,
|
275
|
+
)
|
276
|
+
else:
|
277
|
+
assert schema.itype is not None # noqa: S101
|
278
|
+
|
279
|
+
@property
|
280
|
+
@doc_args(CAT_MANAGER_DOCSTRING)
|
281
|
+
def cat(self) -> CatManager:
|
282
|
+
"""{}""" # noqa: D415
|
283
|
+
return self._cat_manager
|
284
|
+
|
285
|
+
def standardize(self) -> None:
|
286
|
+
"""Standardize the dataset.
|
287
|
+
|
288
|
+
- Adds missing columns if a default value for a feature is defined.
|
289
|
+
- Fills missing values with the default value if a default value for a feature is defined.
|
290
|
+
"""
|
291
|
+
for feature in self._schema.members:
|
292
|
+
if feature.name not in self._dataset.columns:
|
293
|
+
if feature.default_value is not None:
|
294
|
+
self._dataset[feature.name] = feature.default_value
|
295
|
+
else:
|
296
|
+
raise ValidationError(
|
297
|
+
f"Missing column {feature.name} cannot be added because no default value is defined for this feature"
|
298
|
+
)
|
299
|
+
else:
|
300
|
+
if feature.default_value is not None:
|
301
|
+
if isinstance(
|
302
|
+
self._dataset[feature.name].dtype, pd.CategoricalDtype
|
303
|
+
):
|
304
|
+
if (
|
305
|
+
feature.default_value
|
306
|
+
not in self._dataset[feature.name].cat.categories
|
307
|
+
):
|
308
|
+
self._dataset[feature.name] = self._dataset[
|
309
|
+
feature.name
|
310
|
+
].cat.add_categories(feature.default_value)
|
311
|
+
self._dataset[feature.name] = self._dataset[feature.name].fillna(
|
312
|
+
feature.default_value
|
313
|
+
)
|
314
|
+
|
315
|
+
@doc_args(VALIDATE_DOCSTRING)
|
316
|
+
def validate(self) -> None:
|
317
|
+
"""{}""" # noqa: D415
|
318
|
+
if self._schema.n > 0:
|
319
|
+
self._cat_manager.validate()
|
320
|
+
try:
|
321
|
+
self._pandera_schema.validate(self._dataset)
|
322
|
+
if self._cat_manager._is_validated:
|
323
|
+
self._is_validated = True
|
324
|
+
else:
|
325
|
+
self._is_validated = False
|
326
|
+
raise ValidationError(
|
327
|
+
self._cat_manager._validate_category_error_messages
|
328
|
+
)
|
329
|
+
except pandera.errors.SchemaError as err:
|
330
|
+
self._is_validated = False
|
331
|
+
# .exconly() doesn't exist on SchemaError
|
332
|
+
raise ValidationError(str(err)) from err
|
333
|
+
else:
|
334
|
+
result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
|
335
|
+
registry: CanCurate = result["registry"]
|
336
|
+
inspector = registry.inspect(
|
337
|
+
self._dataset.columns,
|
338
|
+
result["field"],
|
339
|
+
mute=True,
|
340
|
+
)
|
341
|
+
if len(inspector.non_validated) > 0:
|
342
|
+
# also check public ontology
|
343
|
+
if hasattr(registry, "public"):
|
344
|
+
registry.from_values(
|
345
|
+
inspector.non_validated, result["field"], mute=True
|
346
|
+
).save()
|
347
|
+
inspector = registry.inspect(
|
348
|
+
inspector.non_validated, result["field"], mute=True
|
349
|
+
)
|
350
|
+
if len(inspector.non_validated) > 0:
|
351
|
+
self._is_validated = False
|
352
|
+
raise ValidationError(
|
353
|
+
f"Invalid identifiers for {self._schema.itype}: {inspector.non_validated}"
|
354
|
+
)
|
355
|
+
|
356
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
357
|
+
def save_artifact(
|
358
|
+
self,
|
359
|
+
*,
|
360
|
+
key: str | None = None,
|
361
|
+
description: str | None = None,
|
362
|
+
revises: Artifact | None = None,
|
363
|
+
run: Run | None = None,
|
364
|
+
):
|
365
|
+
"""{}""" # noqa: D415
|
366
|
+
if not self._is_validated:
|
367
|
+
self.validate() # raises ValidationError if doesn't validate
|
368
|
+
result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
|
369
|
+
return save_artifact( # type: ignore
|
370
|
+
self._dataset,
|
371
|
+
description=description,
|
372
|
+
fields=self._cat_manager.categoricals,
|
373
|
+
columns_field=result["field"],
|
374
|
+
key=key,
|
375
|
+
artifact=self._artifact,
|
376
|
+
revises=revises,
|
377
|
+
run=run,
|
378
|
+
schema=self._schema,
|
379
|
+
)
|
380
|
+
|
381
|
+
|
382
|
+
class AnnDataCurator(Curator):
|
383
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
384
|
+
"""Curator for a DataFrame object.
|
385
|
+
|
386
|
+
See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
|
387
|
+
|
388
|
+
Args:
|
389
|
+
dataset: The AnnData-like object to validate & annotate.
|
390
|
+
schema: A `Schema` object that defines the validation constraints.
|
391
|
+
|
392
|
+
Example::
|
393
|
+
|
394
|
+
import lamindb as ln
|
395
|
+
import bionty as bt
|
396
|
+
|
397
|
+
# define valid labels
|
398
|
+
cell_medium = ln.ULabel(name="CellMedium", is_type=True).save()
|
399
|
+
ln.ULabel(name="DMSO", type=cell_medium).save()
|
400
|
+
ln.ULabel(name="IFNG", type=cell_medium).save()
|
401
|
+
bt.CellType.from_source(name="B cell").save()
|
402
|
+
bt.CellType.from_source(name="T cell").save()
|
403
|
+
|
404
|
+
# define obs schema
|
405
|
+
obs_schema = ln.Schema(
|
406
|
+
name="small_dataset1_obs_level_metadata",
|
407
|
+
features=[
|
408
|
+
ln.Feature(name="cell_medium", dtype="cat[ULabel[CellMedium]]").save(),
|
409
|
+
ln.Feature(name="sample_note", dtype=str).save(),
|
410
|
+
ln.Feature(name="cell_type_by_expert", dtype=bt.CellType").save(),
|
411
|
+
ln.Feature(name="cell_type_by_model", dtype=bt.CellType").save(),
|
412
|
+
],
|
413
|
+
).save()
|
414
|
+
|
415
|
+
# define var schema
|
416
|
+
var_schema = ln.Schema(
|
417
|
+
name="scRNA_seq_var_schema",
|
418
|
+
itype=bt.Gene.ensembl_gene_id,
|
419
|
+
dtype="num",
|
420
|
+
).save()
|
421
|
+
|
422
|
+
# define composite schema
|
423
|
+
anndata_schema = ln.Schema(
|
424
|
+
name="small_dataset1_anndata_schema",
|
425
|
+
otype="AnnData",
|
426
|
+
components={"obs": obs_schema, "var": var_schema},
|
427
|
+
).save()
|
428
|
+
|
429
|
+
# curate an AnnData
|
430
|
+
adata = datasets.small_dataset1(otype="AnnData")
|
431
|
+
curator = ln.curators.AnnDataCurator(adata, anndata_schema)
|
432
|
+
artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
|
433
|
+
assert artifact.schema == anndata_schema
|
434
|
+
"""
|
435
|
+
|
436
|
+
def __init__(
|
437
|
+
self,
|
438
|
+
dataset: AnnData | Artifact,
|
439
|
+
schema: Schema,
|
440
|
+
) -> None:
|
441
|
+
super().__init__(dataset=dataset, schema=schema)
|
442
|
+
if not data_is_anndata(self._dataset):
|
443
|
+
raise InvalidArgument("dataset must be AnnData-like.")
|
444
|
+
if schema.otype != "AnnData":
|
445
|
+
raise InvalidArgument("Schema otype must be 'AnnData'.")
|
446
|
+
self._obs_curator = DataFrameCurator(
|
447
|
+
self._dataset.obs, schema._get_component("obs")
|
448
|
+
)
|
449
|
+
self._var_curator = DataFrameCurator(
|
450
|
+
self._dataset.var.T, schema._get_component("var")
|
451
|
+
)
|
452
|
+
|
453
|
+
@doc_args(VALIDATE_DOCSTRING)
|
454
|
+
def validate(self) -> None:
|
455
|
+
"""{}""" # noqa: D415
|
456
|
+
self._obs_curator.validate()
|
457
|
+
self._var_curator.validate()
|
458
|
+
self._is_validated = True
|
459
|
+
|
460
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
461
|
+
def save_artifact(self, *, key=None, description=None, revises=None, run=None):
|
462
|
+
"""{}""" # noqa: D415
|
463
|
+
if not self._is_validated:
|
464
|
+
self.validate() # raises ValidationError if doesn't validate
|
465
|
+
result = parse_dtype_single_cat(self._var_curator._schema.itype, is_itype=True)
|
466
|
+
return save_artifact( # type: ignore
|
467
|
+
self._dataset,
|
468
|
+
description=description,
|
469
|
+
fields=self._obs_curator._cat_manager.categoricals,
|
470
|
+
columns_field=result["field"],
|
471
|
+
key=key,
|
472
|
+
artifact=self._artifact,
|
473
|
+
revises=revises,
|
474
|
+
run=run,
|
475
|
+
schema=self._schema,
|
476
|
+
)
|
477
|
+
|
478
|
+
|
479
|
+
class CatManager:
|
480
|
+
"""Manage valid categoricals by updating registries.
|
481
|
+
|
482
|
+
A `CatManager` object makes it easy to validate, standardize & annotate datasets.
|
483
|
+
|
484
|
+
Example:
|
485
|
+
|
486
|
+
>>> cat_manager = ln.CatManager(
|
487
|
+
>>> dataset,
|
488
|
+
>>> # define validation criteria as mappings
|
489
|
+
>>> columns=Feature.name, # map column names
|
490
|
+
>>> categoricals={"perturbation": ULabel.name}, # map categories
|
491
|
+
>>> )
|
492
|
+
>>> cat_manager.validate() # validate the dataframe
|
493
|
+
>>> artifact = cat_manager.save_artifact(description="my RNA-seq")
|
494
|
+
>>> artifact.describe() # see annotations
|
495
|
+
|
496
|
+
`cat_manager.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
|
497
|
+
|
498
|
+
If you find non-validated values, you have several options:
|
499
|
+
|
500
|
+
- new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCatManager.add_new_from`
|
501
|
+
- non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCatManager.non_validated` and addressed manually
|
502
|
+
"""
|
503
|
+
|
504
|
+
def __init__(
|
505
|
+
self, *, dataset, categoricals, sources, organism, exclude, columns_field=None
|
506
|
+
):
|
507
|
+
# the below is shared with Curator
|
508
|
+
self._artifact: Artifact = None # pass the dataset as an artifact
|
509
|
+
self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
|
510
|
+
if isinstance(self._dataset, Artifact):
|
511
|
+
self._artifact = self._dataset
|
512
|
+
if self._artifact.otype in {"DataFrame", "AnnData"}:
|
513
|
+
self._dataset = self._dataset.load()
|
514
|
+
self._is_validated: bool = False
|
515
|
+
# shared until here
|
516
|
+
self._categoricals = categoricals or {}
|
517
|
+
self._non_validated = None
|
518
|
+
self._organism = organism
|
519
|
+
self._sources = sources or {}
|
520
|
+
self._exclude = exclude or {}
|
521
|
+
self._columns_field = columns_field
|
522
|
+
self._validate_category_error_messages: str = ""
|
523
|
+
|
524
|
+
@property
|
525
|
+
def non_validated(self) -> dict[str, list[str]]:
|
526
|
+
"""Return the non-validated features and labels."""
|
527
|
+
if self._non_validated is None:
|
528
|
+
raise ValidationError("Please run validate() first!")
|
529
|
+
return self._non_validated
|
114
530
|
|
115
|
-
|
116
|
-
|
117
|
-
|
531
|
+
@property
|
532
|
+
def categoricals(self) -> dict:
|
533
|
+
"""Return the columns fields to validate against."""
|
534
|
+
return self._categoricals
|
118
535
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
536
|
+
def _replace_synonyms(
|
537
|
+
self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
|
538
|
+
):
|
539
|
+
# replace the values in df
|
540
|
+
std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
|
541
|
+
# remove the standardized values from self.non_validated
|
542
|
+
non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
|
543
|
+
if len(non_validated) == 0:
|
544
|
+
self._non_validated.pop(key, None) # type: ignore
|
545
|
+
else:
|
546
|
+
self._non_validated[key] = non_validated # type: ignore
|
547
|
+
# logging
|
548
|
+
n = len(syn_mapper)
|
549
|
+
if n > 0:
|
550
|
+
syn_mapper_print = _format_values(
|
551
|
+
[f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
|
552
|
+
)
|
553
|
+
s = "s" if n > 1 else ""
|
554
|
+
logger.success(
|
555
|
+
f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
|
556
|
+
)
|
557
|
+
return std_values
|
123
558
|
|
124
559
|
def validate(self) -> bool:
|
125
560
|
"""Validate dataset.
|
@@ -127,9 +562,9 @@ class BaseCurator:
|
|
127
562
|
This method also registers the validated records in the current instance.
|
128
563
|
|
129
564
|
Returns:
|
130
|
-
|
565
|
+
The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
|
131
566
|
"""
|
132
|
-
pass
|
567
|
+
pass
|
133
568
|
|
134
569
|
def standardize(self, key: str) -> None:
|
135
570
|
"""Replace synonyms with standardized values.
|
@@ -142,30 +577,48 @@ class BaseCurator:
|
|
142
577
|
Returns:
|
143
578
|
None
|
144
579
|
"""
|
145
|
-
pass #
|
580
|
+
pass # pdagma: no cover
|
146
581
|
|
582
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
147
583
|
def save_artifact(
|
148
584
|
self,
|
149
|
-
|
585
|
+
*,
|
150
586
|
key: str | None = None,
|
587
|
+
description: str | None = None,
|
151
588
|
revises: Artifact | None = None,
|
152
589
|
run: Run | None = None,
|
153
590
|
) -> Artifact:
|
154
|
-
"""
|
591
|
+
"""{}""" # noqa: D415
|
592
|
+
from lamindb.core._settings import settings
|
155
593
|
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
run: The run that creates the artifact.
|
594
|
+
if not self._is_validated:
|
595
|
+
self.validate() # returns True or False
|
596
|
+
if not self._is_validated: # need to raise error manually
|
597
|
+
raise ValidationError("Dataset does not validate. Please curate.")
|
161
598
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
599
|
+
# Make sure all labels are saved in the current instance
|
600
|
+
verbosity = settings.verbosity
|
601
|
+
try:
|
602
|
+
settings.verbosity = "warning"
|
603
|
+
self._artifact = save_artifact( # type: ignore
|
604
|
+
self._dataset,
|
605
|
+
description=description,
|
606
|
+
fields=self.categoricals,
|
607
|
+
columns_field=self._columns_field,
|
608
|
+
key=key,
|
609
|
+
artifact=self._artifact,
|
610
|
+
revises=revises,
|
611
|
+
run=run,
|
612
|
+
schema=None,
|
613
|
+
organism=self._organism,
|
614
|
+
)
|
615
|
+
finally:
|
616
|
+
settings.verbosity = verbosity
|
617
|
+
|
618
|
+
return self._artifact
|
166
619
|
|
167
620
|
|
168
|
-
class
|
621
|
+
class DataFrameCatManager(CatManager):
|
169
622
|
"""Curation flow for a DataFrame object.
|
170
623
|
|
171
624
|
See also :class:`~lamindb.Curator`.
|
@@ -174,7 +627,6 @@ class DataFrameCurator(BaseCurator):
|
|
174
627
|
df: The DataFrame object to curate.
|
175
628
|
columns: The field attribute for the feature column.
|
176
629
|
categoricals: A dictionary mapping column names to registry_field.
|
177
|
-
using_key: The reference instance containing registries to validate against.
|
178
630
|
verbosity: The verbosity level.
|
179
631
|
organism: The organism name.
|
180
632
|
sources: A dictionary mapping column names to Source records.
|
@@ -191,165 +643,103 @@ class DataFrameCurator(BaseCurator):
|
|
191
643
|
... df,
|
192
644
|
... categoricals={
|
193
645
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
194
|
-
... "donor_id":
|
646
|
+
... "donor_id": ULabel.name
|
195
647
|
... }
|
196
648
|
... )
|
197
649
|
"""
|
198
650
|
|
199
651
|
def __init__(
|
200
652
|
self,
|
201
|
-
df: pd.DataFrame,
|
653
|
+
df: pd.DataFrame | Artifact,
|
202
654
|
columns: FieldAttr = Feature.name,
|
203
655
|
categoricals: dict[str, FieldAttr] | None = None,
|
204
|
-
using_key: str | None = None,
|
205
656
|
verbosity: str = "hint",
|
206
657
|
organism: str | None = None,
|
207
658
|
sources: dict[str, Record] | None = None,
|
208
659
|
exclude: dict | None = None,
|
209
|
-
check_valid_keys: bool = True,
|
210
660
|
) -> None:
|
211
661
|
from lamindb.core._settings import settings
|
212
662
|
|
213
663
|
if organism is not None and not isinstance(organism, str):
|
214
664
|
raise ValueError("organism must be a string such as 'human' or 'mouse'!")
|
215
665
|
|
216
|
-
self._df = df
|
217
|
-
self._fields = categoricals or {}
|
218
|
-
self._columns_field = columns
|
219
|
-
self._using_key = using_key
|
220
|
-
# TODO: change verbosity back
|
221
666
|
settings.verbosity = verbosity
|
222
|
-
self._artifact = None
|
223
|
-
self._collection = None
|
224
|
-
self._validated = False
|
225
|
-
self._kwargs = {"organism": organism} if organism else {}
|
226
|
-
self._sources = sources or {}
|
227
|
-
self._exclude = exclude or {}
|
228
667
|
self._non_validated = None
|
229
|
-
|
230
|
-
|
668
|
+
super().__init__(
|
669
|
+
dataset=df,
|
670
|
+
columns_field=columns,
|
671
|
+
organism=organism,
|
672
|
+
categoricals=categoricals,
|
673
|
+
sources=sources,
|
674
|
+
exclude=exclude,
|
675
|
+
)
|
231
676
|
self._save_columns()
|
232
677
|
|
233
|
-
|
234
|
-
def non_validated(self) -> dict[str, list[str]]:
|
235
|
-
"""Return the non-validated features and labels."""
|
236
|
-
if self._non_validated is None:
|
237
|
-
raise ValidationError("Please run validate() first!")
|
238
|
-
return self._non_validated
|
239
|
-
|
240
|
-
@property
|
241
|
-
def fields(self) -> dict:
|
242
|
-
"""Return the columns fields to validate against."""
|
243
|
-
return self._fields
|
244
|
-
|
245
|
-
def lookup(
|
246
|
-
self, using_key: str | None = None, public: bool = False
|
247
|
-
) -> CurateLookup:
|
678
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
248
679
|
"""Lookup categories.
|
249
680
|
|
250
681
|
Args:
|
251
|
-
|
252
|
-
if "public", the lookup is performed on the public reference.
|
682
|
+
public: If "public", the lookup is performed on the public reference.
|
253
683
|
"""
|
254
684
|
return CurateLookup(
|
255
|
-
categoricals=self.
|
685
|
+
categoricals=self._categoricals,
|
256
686
|
slots={"columns": self._columns_field},
|
257
|
-
using_key=using_key or self._using_key,
|
258
687
|
public=public,
|
259
688
|
)
|
260
689
|
|
261
|
-
def _check_valid_keys(self, extra: set | None = None) -> None:
|
262
|
-
extra = extra or set()
|
263
|
-
for name, d in {
|
264
|
-
"categoricals": self._fields,
|
265
|
-
"sources": self._sources,
|
266
|
-
"exclude": self._exclude,
|
267
|
-
}.items():
|
268
|
-
if not isinstance(d, dict):
|
269
|
-
raise TypeError(f"{name} must be a dictionary!")
|
270
|
-
valid_keys = set(self._df.columns) | {"columns"} | extra
|
271
|
-
nonval_keys = [key for key in d.keys() if key not in valid_keys]
|
272
|
-
n = len(nonval_keys)
|
273
|
-
s = "s" if n > 1 else ""
|
274
|
-
are = "are" if n > 1 else "is"
|
275
|
-
if len(nonval_keys) > 0:
|
276
|
-
raise ValidationError(
|
277
|
-
f"key{s} passed to {name} {are} not present in columns: {colors.yellow(_format_values(nonval_keys))}"
|
278
|
-
)
|
279
|
-
|
280
690
|
def _save_columns(self, validated_only: bool = True) -> None:
|
281
691
|
"""Save column name records."""
|
282
692
|
# Always save features specified as the fields keys
|
283
693
|
update_registry(
|
284
|
-
values=list(self.
|
694
|
+
values=list(self.categoricals.keys()),
|
285
695
|
field=self._columns_field,
|
286
696
|
key="columns",
|
287
|
-
using_key=self._using_key,
|
288
697
|
validated_only=False,
|
289
698
|
source=self._sources.get("columns"),
|
290
699
|
exclude=self._exclude.get("columns"),
|
291
|
-
**self._kwargs, # type: ignore
|
292
700
|
)
|
293
701
|
|
294
702
|
# Save the rest of the columns based on validated_only
|
295
|
-
additional_columns = set(self.
|
703
|
+
additional_columns = set(self._dataset.columns) - set(self.categoricals.keys())
|
296
704
|
if additional_columns:
|
297
705
|
update_registry(
|
298
706
|
values=list(additional_columns),
|
299
707
|
field=self._columns_field,
|
300
708
|
key="columns",
|
301
|
-
using_key=self._using_key,
|
302
709
|
validated_only=validated_only,
|
303
|
-
df=self.
|
710
|
+
df=self._dataset, # Get the Feature type from df
|
304
711
|
source=self._sources.get("columns"),
|
305
712
|
exclude=self._exclude.get("columns"),
|
306
|
-
**self._kwargs, # type: ignore
|
307
713
|
)
|
308
714
|
|
309
|
-
|
310
|
-
|
715
|
+
@deprecated(new_name="is run by default")
|
716
|
+
def add_new_from_columns(self, organism: str | None = None, **kwargs):
|
717
|
+
pass
|
718
|
+
|
719
|
+
def validate(self) -> bool:
|
720
|
+
"""Validate variables and categorical observations.
|
721
|
+
|
722
|
+
This method also registers the validated records in the current instance:
|
723
|
+
- from public sources
|
311
724
|
|
312
725
|
Args:
|
313
|
-
key: The key referencing the slot in the DataFrame from which to draw terms.
|
314
726
|
organism: The organism name.
|
315
|
-
|
727
|
+
|
728
|
+
Returns:
|
729
|
+
Whether the DataFrame is validated.
|
316
730
|
"""
|
317
|
-
|
318
|
-
|
319
|
-
self.
|
320
|
-
self.
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
stacklevel=2,
|
731
|
+
# add all validated records to the current instance
|
732
|
+
self._update_registry_all()
|
733
|
+
self._validate_category_error_messages = "" # reset the error messages
|
734
|
+
self._is_validated, self._non_validated = validate_categories_in_df( # type: ignore
|
735
|
+
self._dataset,
|
736
|
+
fields=self.categoricals,
|
737
|
+
sources=self._sources,
|
738
|
+
exclude=self._exclude,
|
739
|
+
curator=self,
|
740
|
+
organism=self._organism,
|
328
741
|
)
|
329
|
-
|
330
|
-
|
331
|
-
def _replace_synonyms(
|
332
|
-
self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
|
333
|
-
):
|
334
|
-
# replace the values in df
|
335
|
-
std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
|
336
|
-
# remove the standardized values from self.non_validated
|
337
|
-
non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
|
338
|
-
if len(non_validated) == 0:
|
339
|
-
self._non_validated.pop(key, None) # type: ignore
|
340
|
-
else:
|
341
|
-
self._non_validated[key] = non_validated # type: ignore
|
342
|
-
# logging
|
343
|
-
n = len(syn_mapper)
|
344
|
-
if n > 0:
|
345
|
-
syn_mapper_print = _format_values(
|
346
|
-
[f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
|
347
|
-
)
|
348
|
-
s = "s" if n > 1 else ""
|
349
|
-
logger.success(
|
350
|
-
f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
|
351
|
-
)
|
352
|
-
return std_values
|
742
|
+
return self._is_validated
|
353
743
|
|
354
744
|
def standardize(self, key: str) -> None:
|
355
745
|
"""Replace synonyms with standardized values.
|
@@ -359,6 +749,8 @@ class DataFrameCurator(BaseCurator):
|
|
359
749
|
Args:
|
360
750
|
key: The key referencing the column in the DataFrame to standardize.
|
361
751
|
"""
|
752
|
+
if self._artifact is not None:
|
753
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
362
754
|
# list is needed to avoid RuntimeError: dictionary changed size during iteration
|
363
755
|
avail_keys = list(self.non_validated.keys())
|
364
756
|
if len(avail_keys) == 0:
|
@@ -367,137 +759,74 @@ class DataFrameCurator(BaseCurator):
|
|
367
759
|
|
368
760
|
if key == "all":
|
369
761
|
for k in avail_keys:
|
370
|
-
if k in self.
|
762
|
+
if k in self._categoricals: # needed to exclude var_index
|
371
763
|
syn_mapper = standardize_categories(
|
372
764
|
self.non_validated[k],
|
373
|
-
field=self.
|
374
|
-
using_key=self._using_key,
|
765
|
+
field=self._categoricals[k],
|
375
766
|
source=self._sources.get(k),
|
376
|
-
**self._kwargs,
|
377
767
|
)
|
378
|
-
self.
|
768
|
+
self._dataset[k] = self._replace_synonyms(
|
769
|
+
k, syn_mapper, self._dataset[k]
|
770
|
+
)
|
379
771
|
else:
|
380
772
|
if key not in avail_keys:
|
381
|
-
if key in self.
|
773
|
+
if key in self._categoricals:
|
382
774
|
logger.info(f"No unstandardized values found for {key!r}")
|
383
775
|
else:
|
384
776
|
raise KeyError(
|
385
777
|
f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
|
386
778
|
)
|
387
779
|
else:
|
388
|
-
if key in self.
|
780
|
+
if key in self._categoricals: # needed to exclude var_index
|
389
781
|
syn_mapper = standardize_categories(
|
390
782
|
self.non_validated[key],
|
391
|
-
field=self.
|
392
|
-
using_key=self._using_key,
|
783
|
+
field=self._categoricals[key],
|
393
784
|
source=self._sources.get(key),
|
394
|
-
|
785
|
+
organism=self._organism,
|
395
786
|
)
|
396
|
-
self.
|
397
|
-
key, syn_mapper, self.
|
787
|
+
self._dataset[key] = self._replace_synonyms(
|
788
|
+
key, syn_mapper, self._dataset[key]
|
398
789
|
)
|
399
790
|
|
791
|
+
def _update_registry_all(self, validated_only: bool = True, **kwargs):
|
792
|
+
"""Save labels for all features."""
|
793
|
+
for name in self.categoricals.keys():
|
794
|
+
self._update_registry(name, validated_only=validated_only, **kwargs)
|
795
|
+
|
400
796
|
def _update_registry(
|
401
797
|
self, categorical: str, validated_only: bool = True, **kwargs
|
402
798
|
) -> None:
|
403
799
|
if categorical == "all":
|
404
800
|
self._update_registry_all(validated_only=validated_only, **kwargs)
|
405
801
|
else:
|
406
|
-
if categorical not in self.
|
802
|
+
if categorical not in self.categoricals:
|
407
803
|
raise ValidationError(
|
408
804
|
f"Feature {categorical} is not part of the fields!"
|
409
805
|
)
|
410
806
|
update_registry(
|
411
|
-
values=_flatten_unique(self.
|
412
|
-
field=self.
|
807
|
+
values=_flatten_unique(self._dataset[categorical]),
|
808
|
+
field=self.categoricals[categorical],
|
413
809
|
key=categorical,
|
414
|
-
using_key=self._using_key,
|
415
810
|
validated_only=validated_only,
|
416
811
|
source=self._sources.get(categorical),
|
417
812
|
exclude=self._exclude.get(categorical),
|
418
|
-
|
813
|
+
organism=self._organism,
|
419
814
|
)
|
420
815
|
# adding new records removes them from non_validated
|
421
816
|
if not validated_only and self._non_validated:
|
422
817
|
self._non_validated.pop(categorical, None) # type: ignore
|
423
818
|
|
424
|
-
def
|
425
|
-
"""
|
426
|
-
for name in self.fields.keys():
|
427
|
-
self._update_registry(name, validated_only=validated_only, **kwargs)
|
428
|
-
|
429
|
-
def validate(self, organism: str | None = None) -> bool:
|
430
|
-
"""Validate variables and categorical observations.
|
431
|
-
|
432
|
-
This method also registers the validated records in the current instance:
|
433
|
-
- from public sources
|
434
|
-
- from the using_key instance
|
819
|
+
def add_new_from(self, key: str, **kwargs):
|
820
|
+
"""Add validated & new categories.
|
435
821
|
|
436
822
|
Args:
|
823
|
+
key: The key referencing the slot in the DataFrame from which to draw terms.
|
437
824
|
organism: The organism name.
|
438
|
-
|
439
|
-
Returns:
|
440
|
-
Whether the DataFrame is validated.
|
441
|
-
"""
|
442
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
443
|
-
|
444
|
-
# add all validated records to the current instance
|
445
|
-
self._update_registry_all()
|
446
|
-
|
447
|
-
self._validated, self._non_validated = validate_categories_in_df( # type: ignore
|
448
|
-
self._df,
|
449
|
-
fields=self.fields,
|
450
|
-
using_key=self._using_key,
|
451
|
-
sources=self._sources,
|
452
|
-
exclude=self._exclude,
|
453
|
-
**self._kwargs,
|
454
|
-
)
|
455
|
-
return self._validated
|
456
|
-
|
457
|
-
def save_artifact(
|
458
|
-
self,
|
459
|
-
description: str | None = None,
|
460
|
-
key: str | None = None,
|
461
|
-
revises: Artifact | None = None,
|
462
|
-
run: Run | None = None,
|
463
|
-
) -> Artifact:
|
464
|
-
"""Save the validated DataFrame and metadata.
|
465
|
-
|
466
|
-
Args:
|
467
|
-
description: Description of the DataFrame object.
|
468
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
|
469
|
-
Artifacts with the same key form a revision family.
|
470
|
-
revises: Previous version of the artifact. Triggers a revision.
|
471
|
-
run: The run that creates the artifact.
|
472
|
-
|
473
|
-
Returns:
|
474
|
-
A saved artifact record.
|
825
|
+
**kwargs: Additional keyword arguments to pass to create new records
|
475
826
|
"""
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
self.validate()
|
480
|
-
if not self._validated:
|
481
|
-
raise ValidationError("Dataset does not validate. Please curate.")
|
482
|
-
|
483
|
-
# Make sure all labels are saved in the current instance
|
484
|
-
verbosity = settings.verbosity
|
485
|
-
try:
|
486
|
-
settings.verbosity = "warning"
|
487
|
-
self._artifact = save_artifact(
|
488
|
-
self._df,
|
489
|
-
description=description,
|
490
|
-
fields=self.fields,
|
491
|
-
columns_field=self._columns_field,
|
492
|
-
key=key,
|
493
|
-
revises=revises,
|
494
|
-
run=run,
|
495
|
-
**self._kwargs,
|
496
|
-
)
|
497
|
-
finally:
|
498
|
-
settings.verbosity = verbosity
|
499
|
-
|
500
|
-
return self._artifact
|
827
|
+
if len(kwargs) > 0 and key == "all":
|
828
|
+
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
829
|
+
self._update_registry(key, validated_only=False, **kwargs)
|
501
830
|
|
502
831
|
def clean_up_failed_runs(self):
|
503
832
|
"""Clean up previous failed runs that don't save any outputs."""
|
@@ -509,21 +838,14 @@ class DataFrameCurator(BaseCurator):
|
|
509
838
|
).delete()
|
510
839
|
|
511
840
|
|
512
|
-
class
|
513
|
-
"""
|
514
|
-
|
515
|
-
See also :class:`~lamindb.Curator`.
|
516
|
-
|
517
|
-
Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.
|
518
|
-
|
519
|
-
See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.
|
841
|
+
class AnnDataCatManager(CatManager):
|
842
|
+
"""Manage categorical curation.
|
520
843
|
|
521
844
|
Args:
|
522
845
|
data: The AnnData object or an AnnData-like path.
|
523
846
|
var_index: The registry field for mapping the ``.var`` index.
|
524
847
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
525
848
|
obs_columns: The registry field for mapping the ``.obs.columns``.
|
526
|
-
using_key: A reference LaminDB instance.
|
527
849
|
verbosity: The verbosity level.
|
528
850
|
organism: The organism name.
|
529
851
|
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
@@ -538,7 +860,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
538
860
|
... var_index=bt.Gene.ensembl_gene_id,
|
539
861
|
... categoricals={
|
540
862
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
541
|
-
... "donor_id":
|
863
|
+
... "donor_id": ULabel.name
|
542
864
|
... },
|
543
865
|
... organism="human",
|
544
866
|
... )
|
@@ -546,56 +868,48 @@ class AnnDataCurator(DataFrameCurator):
|
|
546
868
|
|
547
869
|
def __init__(
|
548
870
|
self,
|
549
|
-
data: ad.AnnData |
|
871
|
+
data: ad.AnnData | Artifact,
|
550
872
|
var_index: FieldAttr,
|
551
873
|
categoricals: dict[str, FieldAttr] | None = None,
|
552
874
|
obs_columns: FieldAttr = Feature.name,
|
553
|
-
using_key: str | None = None,
|
554
875
|
verbosity: str = "hint",
|
555
876
|
organism: str | None = None,
|
556
877
|
sources: dict[str, Record] | None = None,
|
557
878
|
exclude: dict | None = None,
|
558
879
|
) -> None:
|
559
|
-
from lamindb_setup.core import upath
|
560
|
-
|
561
880
|
if isinstance(var_index, str):
|
562
881
|
raise TypeError("var_index parameter has to be a bionty field")
|
563
882
|
|
564
|
-
from .._artifact import data_is_anndata
|
565
|
-
|
566
883
|
if sources is None:
|
567
884
|
sources = {}
|
568
885
|
if not data_is_anndata(data):
|
569
|
-
raise TypeError(
|
570
|
-
"data has to be an AnnData object or a path to AnnData-like"
|
571
|
-
)
|
572
|
-
if isinstance(data, ad.AnnData):
|
573
|
-
self._adata = data
|
574
|
-
else: # pragma: no cover
|
575
|
-
from lamindb.core.storage._backed_access import backed_access
|
576
|
-
|
577
|
-
self._adata = backed_access(upath.create_path(data))
|
886
|
+
raise TypeError("data has to be an AnnData object")
|
578
887
|
|
579
888
|
if "symbol" in str(var_index):
|
580
889
|
logger.warning(
|
581
890
|
"indexing datasets with gene symbols can be problematic: https://docs.lamin.ai/faq/symbol-mapping"
|
582
891
|
)
|
583
892
|
|
584
|
-
self.
|
893
|
+
self._obs_fields = categoricals or {}
|
585
894
|
self._var_field = var_index
|
586
895
|
super().__init__(
|
587
|
-
|
896
|
+
dataset=data,
|
588
897
|
categoricals=categoricals,
|
898
|
+
sources=sources,
|
899
|
+
organism=organism,
|
900
|
+
exclude=exclude,
|
901
|
+
columns_field=var_index,
|
902
|
+
)
|
903
|
+
self._adata = self._dataset
|
904
|
+
self._obs_df_curator = DataFrameCatManager(
|
905
|
+
df=self._adata.obs,
|
906
|
+
categoricals=self.categoricals,
|
589
907
|
columns=obs_columns,
|
590
|
-
using_key=using_key,
|
591
908
|
verbosity=verbosity,
|
592
|
-
organism=
|
909
|
+
organism=None,
|
593
910
|
sources=sources,
|
594
911
|
exclude=exclude,
|
595
|
-
check_valid_keys=False,
|
596
912
|
)
|
597
|
-
self._obs_fields = categoricals or {}
|
598
|
-
self._check_valid_keys(extra={"var_index"})
|
599
913
|
|
600
914
|
@property
|
601
915
|
def var_index(self) -> FieldAttr:
|
@@ -607,54 +921,53 @@ class AnnDataCurator(DataFrameCurator):
|
|
607
921
|
"""Return the obs fields to validate against."""
|
608
922
|
return self._obs_fields
|
609
923
|
|
610
|
-
def lookup(
|
611
|
-
self, using_key: str | None = None, public: bool = False
|
612
|
-
) -> CurateLookup:
|
924
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
613
925
|
"""Lookup categories.
|
614
926
|
|
615
927
|
Args:
|
616
|
-
|
617
|
-
if "public", the lookup is performed on the public reference.
|
928
|
+
public: If "public", the lookup is performed on the public reference.
|
618
929
|
"""
|
619
930
|
return CurateLookup(
|
620
931
|
categoricals=self._obs_fields,
|
621
932
|
slots={"columns": self._columns_field, "var_index": self._var_field},
|
622
|
-
using_key=using_key or self._using_key,
|
623
933
|
public=public,
|
624
934
|
)
|
625
935
|
|
626
936
|
def _save_from_var_index(
|
627
|
-
self,
|
937
|
+
self,
|
938
|
+
validated_only: bool = True,
|
628
939
|
):
|
629
940
|
"""Save variable records."""
|
630
941
|
update_registry(
|
631
942
|
values=list(self._adata.var.index),
|
632
943
|
field=self.var_index,
|
633
944
|
key="var_index",
|
634
|
-
using_key=self._using_key,
|
635
945
|
validated_only=validated_only,
|
636
|
-
organism=
|
946
|
+
organism=self._organism,
|
637
947
|
source=self._sources.get("var_index"),
|
638
948
|
exclude=self._exclude.get("var_index"),
|
639
949
|
)
|
640
950
|
|
641
|
-
def
|
642
|
-
"""
|
643
|
-
self._save_from_var_index(validated_only=validated_only, **self._kwargs)
|
644
|
-
for name in self._obs_fields.keys():
|
645
|
-
self._update_registry(name, validated_only=validated_only, **self._kwargs)
|
951
|
+
def add_new_from(self, key: str, **kwargs):
|
952
|
+
"""Add validated & new categories.
|
646
953
|
|
647
|
-
|
954
|
+
Args:
|
955
|
+
key: The key referencing the slot in the DataFrame from which to draw terms.
|
956
|
+
organism: The organism name.
|
957
|
+
**kwargs: Additional keyword arguments to pass to create new records
|
958
|
+
"""
|
959
|
+
self._obs_df_curator.add_new_from(key, **kwargs)
|
960
|
+
|
961
|
+
def add_new_from_var_index(self, **kwargs):
|
648
962
|
"""Update variable records.
|
649
963
|
|
650
964
|
Args:
|
651
965
|
organism: The organism name.
|
652
966
|
**kwargs: Additional keyword arguments to pass to create new records.
|
653
967
|
"""
|
654
|
-
self.
|
655
|
-
self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
|
968
|
+
self._save_from_var_index(validated_only=False, **kwargs)
|
656
969
|
|
657
|
-
def validate(self
|
970
|
+
def validate(self) -> bool:
|
658
971
|
"""Validate categories.
|
659
972
|
|
660
973
|
This method also registers the validated records in the current instance.
|
@@ -665,38 +978,25 @@ class AnnDataCurator(DataFrameCurator):
|
|
665
978
|
Returns:
|
666
979
|
Whether the AnnData object is validated.
|
667
980
|
"""
|
668
|
-
self.
|
669
|
-
if self._using_key is not None and self._using_key != "default":
|
670
|
-
logger.important(
|
671
|
-
f"validating metadata using registries of instance {colors.italic(self._using_key)}"
|
672
|
-
)
|
981
|
+
self._validate_category_error_messages = "" # reset the error messages
|
673
982
|
|
674
983
|
# add all validated records to the current instance
|
675
|
-
self.
|
676
|
-
|
984
|
+
self._save_from_var_index(validated_only=True)
|
677
985
|
validated_var, non_validated_var = validate_categories(
|
678
986
|
self._adata.var.index,
|
679
987
|
field=self._var_field,
|
680
988
|
key="var_index",
|
681
|
-
using_key=self._using_key,
|
682
989
|
source=self._sources.get("var_index"),
|
683
990
|
hint_print=".add_new_from_var_index()",
|
684
991
|
exclude=self._exclude.get("var_index"),
|
685
|
-
|
686
|
-
)
|
687
|
-
validated_obs, non_validated_obs = validate_categories_in_df(
|
688
|
-
self._adata.obs,
|
689
|
-
fields=self.categoricals,
|
690
|
-
using_key=self._using_key,
|
691
|
-
sources=self._sources,
|
692
|
-
exclude=self._exclude,
|
693
|
-
**self._kwargs,
|
992
|
+
organism=self._organism, # type: ignore
|
694
993
|
)
|
695
|
-
|
994
|
+
validated_obs = self._obs_df_curator.validate()
|
995
|
+
self._non_validated = self._obs_df_curator._non_validated # type: ignore
|
696
996
|
if len(non_validated_var) > 0:
|
697
997
|
self._non_validated["var_index"] = non_validated_var # type: ignore
|
698
|
-
self.
|
699
|
-
return self.
|
998
|
+
self._is_validated = validated_var and validated_obs
|
999
|
+
return self._is_validated
|
700
1000
|
|
701
1001
|
def standardize(self, key: str):
|
702
1002
|
"""Replace synonyms with standardized values.
|
@@ -709,68 +1009,26 @@ class AnnDataCurator(DataFrameCurator):
|
|
709
1009
|
|
710
1010
|
Inplace modification of the dataset.
|
711
1011
|
"""
|
1012
|
+
if self._artifact is not None:
|
1013
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
712
1014
|
if key in self._adata.obs.columns or key == "all":
|
713
1015
|
# standardize obs columns
|
714
|
-
|
1016
|
+
self._obs_df_curator.standardize(key)
|
715
1017
|
# in addition to the obs columns, standardize the var.index
|
716
1018
|
if key == "var_index" or key == "all":
|
717
1019
|
syn_mapper = standardize_categories(
|
718
1020
|
self._adata.var.index,
|
719
1021
|
field=self.var_index,
|
720
|
-
using_key=self._using_key,
|
721
1022
|
source=self._sources.get("var_index"),
|
722
|
-
|
1023
|
+
organism=self._organism,
|
723
1024
|
)
|
724
1025
|
if "var_index" in self._non_validated: # type: ignore
|
725
1026
|
self._adata.var.index = self._replace_synonyms(
|
726
1027
|
"var_index", syn_mapper, self._adata.var.index
|
727
1028
|
)
|
728
1029
|
|
729
|
-
def save_artifact(
|
730
|
-
self,
|
731
|
-
description: str | None = None,
|
732
|
-
key: str | None = None,
|
733
|
-
revises: Artifact | None = None,
|
734
|
-
run: Run | None = None,
|
735
|
-
) -> Artifact:
|
736
|
-
"""Save the validated ``AnnData`` and metadata.
|
737
|
-
|
738
|
-
Args:
|
739
|
-
description: A description of the ``AnnData`` object.
|
740
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
|
741
|
-
Artifacts with the same key form a revision family.
|
742
|
-
revises: Previous version of the artifact. Triggers a revision.
|
743
|
-
run: The run that creates the artifact.
|
744
|
-
|
745
|
-
Returns:
|
746
|
-
A saved artifact record.
|
747
|
-
"""
|
748
|
-
from lamindb.core._settings import settings
|
749
|
-
|
750
|
-
if not self._validated:
|
751
|
-
self.validate()
|
752
|
-
if not self._validated:
|
753
|
-
raise ValidationError("Dataset does not validate. Please curate.")
|
754
|
-
verbosity = settings.verbosity
|
755
|
-
try:
|
756
|
-
settings.verbosity = "warning"
|
757
|
-
self._artifact = save_artifact(
|
758
|
-
self._data,
|
759
|
-
adata=self._adata,
|
760
|
-
description=description,
|
761
|
-
columns_field=self.var_index,
|
762
|
-
fields=self.categoricals,
|
763
|
-
key=key,
|
764
|
-
revises=revises,
|
765
|
-
run=run,
|
766
|
-
**self._kwargs,
|
767
|
-
)
|
768
|
-
finally:
|
769
|
-
settings.verbosity = verbosity
|
770
|
-
return self._artifact
|
771
|
-
|
772
1030
|
|
773
|
-
class
|
1031
|
+
class MuDataCatManager(CatManager):
|
774
1032
|
"""Curation flow for a ``MuData`` object.
|
775
1033
|
|
776
1034
|
See also :class:`~lamindb.Curator`.
|
@@ -782,10 +1040,9 @@ class MuDataCurator:
|
|
782
1040
|
mdata: The MuData object to curate.
|
783
1041
|
var_index: The registry field for mapping the ``.var`` index for each modality.
|
784
1042
|
For example:
|
785
|
-
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2":
|
1043
|
+
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
|
786
1044
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
787
1045
|
Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
|
788
|
-
using_key: A reference LaminDB instance.
|
789
1046
|
verbosity: The verbosity level.
|
790
1047
|
organism: The organism name.
|
791
1048
|
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
@@ -799,11 +1056,11 @@ class MuDataCurator:
|
|
799
1056
|
... mdata,
|
800
1057
|
... var_index={
|
801
1058
|
... "rna": bt.Gene.ensembl_gene_id,
|
802
|
-
... "adt":
|
1059
|
+
... "adt": CellMarker.name
|
803
1060
|
... },
|
804
1061
|
... categoricals={
|
805
1062
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
806
|
-
... "donor_id":
|
1063
|
+
... "donor_id": ULabel.name
|
807
1064
|
... },
|
808
1065
|
... organism="human",
|
809
1066
|
... )
|
@@ -811,52 +1068,47 @@ class MuDataCurator:
|
|
811
1068
|
|
812
1069
|
def __init__(
|
813
1070
|
self,
|
814
|
-
mdata: MuData,
|
1071
|
+
mdata: MuData | Artifact,
|
815
1072
|
var_index: dict[str, FieldAttr],
|
816
1073
|
categoricals: dict[str, FieldAttr] | None = None,
|
817
|
-
using_key: str | None = None,
|
818
1074
|
verbosity: str = "hint",
|
819
1075
|
organism: str | None = None,
|
820
1076
|
sources: dict[str, Record] | None = None,
|
821
1077
|
exclude: dict | None = None, # {modality: {field: [values]}}
|
822
1078
|
) -> None:
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
self.
|
1079
|
+
super().__init__(
|
1080
|
+
dataset=mdata,
|
1081
|
+
categoricals={},
|
1082
|
+
sources=sources,
|
1083
|
+
organism=organism,
|
1084
|
+
exclude=exclude,
|
1085
|
+
)
|
1086
|
+
self._columns_field = var_index # this is for consistency with BaseCatManager
|
831
1087
|
self._var_fields = var_index
|
832
1088
|
self._verify_modality(self._var_fields.keys())
|
833
1089
|
self._obs_fields = self._parse_categoricals(categoricals)
|
834
1090
|
self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
|
835
|
-
self._using_key = using_key
|
836
1091
|
self._verbosity = verbosity
|
837
1092
|
self._obs_df_curator = None
|
838
1093
|
if "obs" in self._modalities:
|
839
|
-
self._obs_df_curator =
|
840
|
-
df=
|
1094
|
+
self._obs_df_curator = DataFrameCatManager(
|
1095
|
+
df=self._dataset.obs,
|
841
1096
|
columns=Feature.name,
|
842
1097
|
categoricals=self._obs_fields.get("obs", {}),
|
843
|
-
using_key=using_key,
|
844
1098
|
verbosity=verbosity,
|
845
1099
|
sources=self._sources.get("obs"),
|
846
1100
|
exclude=self._exclude.get("obs"),
|
847
|
-
|
848
|
-
**self._kwargs,
|
1101
|
+
organism=organism,
|
849
1102
|
)
|
850
1103
|
self._mod_adata_curators = {
|
851
|
-
modality:
|
852
|
-
data=
|
1104
|
+
modality: AnnDataCatManager(
|
1105
|
+
data=self._dataset[modality],
|
853
1106
|
var_index=var_index.get(modality),
|
854
1107
|
categoricals=self._obs_fields.get(modality),
|
855
|
-
using_key=using_key,
|
856
1108
|
verbosity=verbosity,
|
857
1109
|
sources=self._sources.get(modality),
|
858
1110
|
exclude=self._exclude.get(modality),
|
859
|
-
|
1111
|
+
organism=organism,
|
860
1112
|
)
|
861
1113
|
for modality in self._modalities
|
862
1114
|
if modality != "obs"
|
@@ -874,7 +1126,7 @@ class MuDataCurator:
|
|
874
1126
|
return self._obs_fields
|
875
1127
|
|
876
1128
|
@property
|
877
|
-
def non_validated(self) -> dict[str, dict[str, list[str]]]:
|
1129
|
+
def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
|
878
1130
|
"""Return the non-validated features and labels."""
|
879
1131
|
if self._non_validated is None:
|
880
1132
|
raise ValidationError("Please run validate() first!")
|
@@ -883,15 +1135,15 @@ class MuDataCurator:
|
|
883
1135
|
def _verify_modality(self, modalities: Iterable[str]):
|
884
1136
|
"""Verify the modality exists."""
|
885
1137
|
for modality in modalities:
|
886
|
-
if modality not in self.
|
1138
|
+
if modality not in self._dataset.mod.keys():
|
887
1139
|
raise ValidationError(f"modality '{modality}' does not exist!")
|
888
1140
|
|
889
1141
|
def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
|
890
1142
|
"""Parse the categorical fields."""
|
891
|
-
prefixes = {f"{k}:" for k in self.
|
1143
|
+
prefixes = {f"{k}:" for k in self._dataset.mod.keys()}
|
892
1144
|
obs_fields: dict[str, dict[str, FieldAttr]] = {}
|
893
1145
|
for k, v in categoricals.items():
|
894
|
-
if k not in self.
|
1146
|
+
if k not in self._dataset.obs.columns:
|
895
1147
|
raise ValidationError(f"column '{k}' does not exist in mdata.obs!")
|
896
1148
|
if any(k.startswith(prefix) for prefix in prefixes):
|
897
1149
|
modality, col = k.split(":")[0], k.split(":")[1]
|
@@ -904,14 +1156,11 @@ class MuDataCurator:
|
|
904
1156
|
obs_fields["obs"][k] = v
|
905
1157
|
return obs_fields
|
906
1158
|
|
907
|
-
def lookup(
|
908
|
-
self, using_key: str | None = None, public: bool = False
|
909
|
-
) -> CurateLookup:
|
1159
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
910
1160
|
"""Lookup categories.
|
911
1161
|
|
912
1162
|
Args:
|
913
|
-
|
914
|
-
if "public", the lookup is performed on the public reference.
|
1163
|
+
public: Perform lookup on public source ontologies.
|
915
1164
|
"""
|
916
1165
|
obs_fields = {}
|
917
1166
|
for mod, fields in self._obs_fields.items():
|
@@ -925,27 +1174,19 @@ class MuDataCurator:
|
|
925
1174
|
slots={
|
926
1175
|
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
927
1176
|
},
|
928
|
-
using_key=using_key or self._using_key,
|
929
1177
|
public=public,
|
930
1178
|
)
|
931
1179
|
|
1180
|
+
@deprecated(new_name="is run by default")
|
932
1181
|
def add_new_from_columns(
|
933
1182
|
self,
|
934
1183
|
modality: str,
|
935
1184
|
column_names: list[str] | None = None,
|
936
|
-
organism: str | None = None,
|
937
1185
|
**kwargs,
|
938
1186
|
):
|
939
|
-
|
940
|
-
warnings.warn(
|
941
|
-
"`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
|
942
|
-
DeprecationWarning,
|
943
|
-
stacklevel=2,
|
944
|
-
)
|
1187
|
+
pass
|
945
1188
|
|
946
|
-
def add_new_from_var_index(
|
947
|
-
self, modality: str, organism: str | None = None, **kwargs
|
948
|
-
):
|
1189
|
+
def add_new_from_var_index(self, modality: str, **kwargs):
|
949
1190
|
"""Update variable records.
|
950
1191
|
|
951
1192
|
Args:
|
@@ -953,25 +1194,19 @@ class MuDataCurator:
|
|
953
1194
|
organism: The organism name.
|
954
1195
|
**kwargs: Additional keyword arguments to pass to create new records.
|
955
1196
|
"""
|
956
|
-
self.
|
957
|
-
self._mod_adata_curators[modality].add_new_from_var_index(
|
958
|
-
**self._kwargs, **kwargs
|
959
|
-
)
|
1197
|
+
self._mod_adata_curators[modality].add_new_from_var_index(**kwargs)
|
960
1198
|
|
961
1199
|
def _update_registry_all(self):
|
962
1200
|
"""Update all registries."""
|
963
1201
|
if self._obs_df_curator is not None:
|
964
|
-
self._obs_df_curator._update_registry_all(
|
965
|
-
validated_only=True, **self._kwargs
|
966
|
-
)
|
1202
|
+
self._obs_df_curator._update_registry_all(validated_only=True)
|
967
1203
|
for _, adata_curator in self._mod_adata_curators.items():
|
968
|
-
adata_curator._update_registry_all(validated_only=True
|
1204
|
+
adata_curator._obs_df_curator._update_registry_all(validated_only=True)
|
969
1205
|
|
970
1206
|
def add_new_from(
|
971
1207
|
self,
|
972
1208
|
key: str,
|
973
1209
|
modality: str | None = None,
|
974
|
-
organism: str | None = None,
|
975
1210
|
**kwargs,
|
976
1211
|
):
|
977
1212
|
"""Add validated & new categories.
|
@@ -984,24 +1219,17 @@ class MuDataCurator:
|
|
984
1219
|
"""
|
985
1220
|
if len(kwargs) > 0 and key == "all":
|
986
1221
|
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
987
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
988
1222
|
modality = modality or "obs"
|
989
1223
|
if modality in self._mod_adata_curators:
|
990
1224
|
adata_curator = self._mod_adata_curators[modality]
|
991
|
-
adata_curator.add_new_from(key=key, **
|
1225
|
+
adata_curator.add_new_from(key=key, **kwargs)
|
992
1226
|
if modality == "obs":
|
993
|
-
self._obs_df_curator.add_new_from(key=key, **
|
1227
|
+
self._obs_df_curator.add_new_from(key=key, **kwargs)
|
994
1228
|
|
995
|
-
def validate(self
|
1229
|
+
def validate(self) -> bool:
|
996
1230
|
"""Validate categories."""
|
997
1231
|
from lamindb.core._settings import settings
|
998
1232
|
|
999
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
1000
|
-
if self._using_key is not None and self._using_key != "default":
|
1001
|
-
logger.important(
|
1002
|
-
f"validating using registries of instance {colors.italic(self._using_key)}"
|
1003
|
-
)
|
1004
|
-
|
1005
1233
|
# add all validated records to the current instance
|
1006
1234
|
verbosity = settings.verbosity
|
1007
1235
|
try:
|
@@ -1015,20 +1243,20 @@ class MuDataCurator:
|
|
1015
1243
|
obs_validated = True
|
1016
1244
|
if "obs" in self._modalities:
|
1017
1245
|
logger.info('validating categoricals in "obs"...')
|
1018
|
-
obs_validated &= self._obs_df_curator.validate(
|
1246
|
+
obs_validated &= self._obs_df_curator.validate()
|
1019
1247
|
self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore
|
1020
1248
|
logger.print("")
|
1021
1249
|
|
1022
1250
|
mods_validated = True
|
1023
1251
|
for modality, adata_curator in self._mod_adata_curators.items():
|
1024
1252
|
logger.info(f'validating categoricals in modality "{modality}"...')
|
1025
|
-
mods_validated &= adata_curator.validate(
|
1253
|
+
mods_validated &= adata_curator.validate()
|
1026
1254
|
if len(adata_curator.non_validated) > 0:
|
1027
1255
|
self._non_validated[modality] = adata_curator.non_validated # type: ignore
|
1028
1256
|
logger.print("")
|
1029
1257
|
|
1030
|
-
self.
|
1031
|
-
return self.
|
1258
|
+
self._is_validated = obs_validated & mods_validated
|
1259
|
+
return self._is_validated
|
1032
1260
|
|
1033
1261
|
def standardize(self, key: str, modality: str | None = None):
|
1034
1262
|
"""Replace synonyms with standardized values.
|
@@ -1039,6 +1267,8 @@ class MuDataCurator:
|
|
1039
1267
|
|
1040
1268
|
Inplace modification of the dataset.
|
1041
1269
|
"""
|
1270
|
+
if self._artifact is not None:
|
1271
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
1042
1272
|
modality = modality or "obs"
|
1043
1273
|
if modality in self._mod_adata_curators:
|
1044
1274
|
adata_curator = self._mod_adata_curators[modality]
|
@@ -1046,47 +1276,6 @@ class MuDataCurator:
|
|
1046
1276
|
if modality == "obs":
|
1047
1277
|
self._obs_df_curator.standardize(key=key)
|
1048
1278
|
|
1049
|
-
def save_artifact(
|
1050
|
-
self,
|
1051
|
-
description: str | None = None,
|
1052
|
-
key: str | None = None,
|
1053
|
-
revises: Artifact | None = None,
|
1054
|
-
run: Run | None = None,
|
1055
|
-
) -> Artifact:
|
1056
|
-
"""Save the validated ``MuData`` and metadata.
|
1057
|
-
|
1058
|
-
Args:
|
1059
|
-
description: A description of the ``MuData`` object.
|
1060
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
1061
|
-
revises: Previous version of the artifact. Triggers a revision.
|
1062
|
-
run: The run that creates the artifact.
|
1063
|
-
|
1064
|
-
Returns:
|
1065
|
-
A saved artifact record.
|
1066
|
-
"""
|
1067
|
-
from lamindb.core._settings import settings
|
1068
|
-
|
1069
|
-
if not self._validated:
|
1070
|
-
self.validate()
|
1071
|
-
if not self._validated:
|
1072
|
-
raise ValidationError("Dataset does not validate. Please curate.")
|
1073
|
-
verbosity = settings.verbosity
|
1074
|
-
try:
|
1075
|
-
settings.verbosity = "warning"
|
1076
|
-
self._artifact = save_artifact(
|
1077
|
-
self._mdata,
|
1078
|
-
description=description,
|
1079
|
-
columns_field=self.var_index,
|
1080
|
-
fields=self.categoricals,
|
1081
|
-
key=key,
|
1082
|
-
revises=revises,
|
1083
|
-
run=run,
|
1084
|
-
**self._kwargs,
|
1085
|
-
)
|
1086
|
-
finally:
|
1087
|
-
settings.verbosity = verbosity
|
1088
|
-
return self._artifact
|
1089
|
-
|
1090
1279
|
|
1091
1280
|
def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
1092
1281
|
if (n := len(nonval_keys)) > 0:
|
@@ -1097,8 +1286,8 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
|
|
1097
1286
|
)
|
1098
1287
|
|
1099
1288
|
|
1100
|
-
class
|
1101
|
-
"""Curation flow for
|
1289
|
+
class TiledbsomaCatManager(CatManager):
|
1290
|
+
"""Curation flow for `tiledbsoma.Experiment`.
|
1102
1291
|
|
1103
1292
|
See also :class:`~lamindb.Curator`.
|
1104
1293
|
|
@@ -1123,7 +1312,7 @@ class SOMACurator(BaseCurator):
|
|
1123
1312
|
... var_index={"RNA": ("var_id", bt.Gene.symbol)},
|
1124
1313
|
... categoricals={
|
1125
1314
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
1126
|
-
... "donor_id":
|
1315
|
+
... "donor_id": ULabel.name
|
1127
1316
|
... },
|
1128
1317
|
... organism="human",
|
1129
1318
|
... )
|
@@ -1138,23 +1327,21 @@ class SOMACurator(BaseCurator):
|
|
1138
1327
|
organism: str | None = None,
|
1139
1328
|
sources: dict[str, Record] | None = None,
|
1140
1329
|
exclude: dict[str, str | list[str]] | None = None,
|
1141
|
-
using_key: str | None = None,
|
1142
1330
|
):
|
1143
1331
|
self._obs_fields = categoricals or {}
|
1144
1332
|
self._var_fields = var_index
|
1145
1333
|
self._columns_field = obs_columns
|
1146
1334
|
if isinstance(experiment_uri, Artifact):
|
1147
|
-
self.
|
1335
|
+
self._dataset = experiment_uri.path
|
1148
1336
|
self._artifact = experiment_uri
|
1149
1337
|
else:
|
1150
|
-
self.
|
1338
|
+
self._dataset = UPath(experiment_uri)
|
1151
1339
|
self._artifact = None
|
1152
1340
|
self._organism = organism
|
1153
|
-
self._using_key = using_key
|
1154
1341
|
self._sources = sources or {}
|
1155
1342
|
self._exclude = exclude or {}
|
1156
1343
|
|
1157
|
-
self.
|
1344
|
+
self._is_validated: bool | None = False
|
1158
1345
|
self._non_validated_values: dict[str, list] | None = None
|
1159
1346
|
self._validated_values: dict[str, list] = {}
|
1160
1347
|
# filled by _check_save_keys
|
@@ -1172,7 +1359,7 @@ class SOMACurator(BaseCurator):
|
|
1172
1359
|
def _check_save_keys(self):
|
1173
1360
|
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1174
1361
|
|
1175
|
-
with _open_tiledbsoma(self.
|
1362
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1176
1363
|
experiment_obs = experiment.obs
|
1177
1364
|
self._n_obs = len(experiment_obs)
|
1178
1365
|
self._obs_pa_schema = experiment_obs.schema
|
@@ -1228,7 +1415,6 @@ class SOMACurator(BaseCurator):
|
|
1228
1415
|
values=register_columns,
|
1229
1416
|
field=self._columns_field,
|
1230
1417
|
key="columns",
|
1231
|
-
using_key=self._using_key,
|
1232
1418
|
validated_only=False,
|
1233
1419
|
organism=organism,
|
1234
1420
|
source=self._sources.get("columns"),
|
@@ -1244,7 +1430,6 @@ class SOMACurator(BaseCurator):
|
|
1244
1430
|
values=additional_columns,
|
1245
1431
|
field=self._columns_field,
|
1246
1432
|
key="columns",
|
1247
|
-
using_key=self._using_key,
|
1248
1433
|
validated_only=True,
|
1249
1434
|
organism=organism,
|
1250
1435
|
source=self._sources.get("columns"),
|
@@ -1257,7 +1442,7 @@ class SOMACurator(BaseCurator):
|
|
1257
1442
|
|
1258
1443
|
validated = True
|
1259
1444
|
self._non_validated_values = {}
|
1260
|
-
with _open_tiledbsoma(self.
|
1445
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1261
1446
|
for ms, (key, field) in self._var_fields.items():
|
1262
1447
|
var_ms = experiment.ms[ms].var
|
1263
1448
|
var_ms_key = f"{ms}__{key}"
|
@@ -1274,7 +1459,6 @@ class SOMACurator(BaseCurator):
|
|
1274
1459
|
values=var_ms_values,
|
1275
1460
|
field=field,
|
1276
1461
|
key=var_ms_key,
|
1277
|
-
using_key=self._using_key,
|
1278
1462
|
validated_only=True,
|
1279
1463
|
organism=organism,
|
1280
1464
|
source=self._sources.get(var_ms_key),
|
@@ -1284,7 +1468,6 @@ class SOMACurator(BaseCurator):
|
|
1284
1468
|
values=var_ms_values,
|
1285
1469
|
field=field,
|
1286
1470
|
key=var_ms_key,
|
1287
|
-
using_key=self._using_key,
|
1288
1471
|
organism=organism,
|
1289
1472
|
source=self._sources.get(var_ms_key),
|
1290
1473
|
exclude=self._exclude.get(var_ms_key),
|
@@ -1310,7 +1493,6 @@ class SOMACurator(BaseCurator):
|
|
1310
1493
|
values=values,
|
1311
1494
|
field=field,
|
1312
1495
|
key=key,
|
1313
|
-
using_key=self._using_key,
|
1314
1496
|
validated_only=True,
|
1315
1497
|
organism=organism,
|
1316
1498
|
source=self._sources.get(key),
|
@@ -1320,7 +1502,6 @@ class SOMACurator(BaseCurator):
|
|
1320
1502
|
values=values,
|
1321
1503
|
field=field,
|
1322
1504
|
key=key,
|
1323
|
-
using_key=self._using_key,
|
1324
1505
|
organism=organism,
|
1325
1506
|
source=self._sources.get(key),
|
1326
1507
|
exclude=self._exclude.get(key),
|
@@ -1330,8 +1511,8 @@ class SOMACurator(BaseCurator):
|
|
1330
1511
|
self._non_validated_values[key] = non_val
|
1331
1512
|
else:
|
1332
1513
|
self._validated_values[key] = values
|
1333
|
-
self.
|
1334
|
-
return self.
|
1514
|
+
self._is_validated = validated
|
1515
|
+
return self._is_validated
|
1335
1516
|
|
1336
1517
|
def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
|
1337
1518
|
assert self._non_validated_values is not None # noqa: S101
|
@@ -1346,7 +1527,7 @@ class SOMACurator(BaseCurator):
|
|
1346
1527
|
values = self._non_validated_values.get(key, [])
|
1347
1528
|
return values, field
|
1348
1529
|
|
1349
|
-
def add_new_from(self, key: str) -> None:
|
1530
|
+
def add_new_from(self, key: str, **kwargs) -> None:
|
1350
1531
|
"""Add validated & new categories.
|
1351
1532
|
|
1352
1533
|
Args:
|
@@ -1378,11 +1559,11 @@ class SOMACurator(BaseCurator):
|
|
1378
1559
|
values=values,
|
1379
1560
|
field=field,
|
1380
1561
|
key=k,
|
1381
|
-
using_key=self._using_key,
|
1382
1562
|
validated_only=False,
|
1383
1563
|
organism=organism,
|
1384
1564
|
source=self._sources.get(k),
|
1385
1565
|
exclude=self._exclude.get(k),
|
1566
|
+
**kwargs,
|
1386
1567
|
)
|
1387
1568
|
# update non-validated values list but keep the key there
|
1388
1569
|
# it will be removed by .validate()
|
@@ -1405,19 +1586,15 @@ class SOMACurator(BaseCurator):
|
|
1405
1586
|
"""Return the obs fields to validate against."""
|
1406
1587
|
return self._obs_fields
|
1407
1588
|
|
1408
|
-
def lookup(
|
1409
|
-
self, using_key: str | None = None, public: bool = False
|
1410
|
-
) -> CurateLookup:
|
1589
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
1411
1590
|
"""Lookup categories.
|
1412
1591
|
|
1413
1592
|
Args:
|
1414
|
-
|
1415
|
-
if "public", the lookup is performed on the public reference.
|
1593
|
+
public: If "public", the lookup is performed on the public reference.
|
1416
1594
|
"""
|
1417
1595
|
return CurateLookup(
|
1418
1596
|
categoricals=self._obs_fields,
|
1419
1597
|
slots={"columns": self._columns_field, **self._var_fields_flat},
|
1420
|
-
using_key=using_key or self._using_key,
|
1421
1598
|
public=public,
|
1422
1599
|
)
|
1423
1600
|
|
@@ -1462,7 +1639,6 @@ class SOMACurator(BaseCurator):
|
|
1462
1639
|
syn_mapper = standardize_categories(
|
1463
1640
|
values=values,
|
1464
1641
|
field=field,
|
1465
|
-
using_key=self._using_key,
|
1466
1642
|
source=self._sources.get(k),
|
1467
1643
|
organism=organism,
|
1468
1644
|
)
|
@@ -1471,7 +1647,7 @@ class SOMACurator(BaseCurator):
|
|
1471
1647
|
|
1472
1648
|
from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
|
1473
1649
|
|
1474
|
-
with _open_tiledbsoma(self.
|
1650
|
+
with _open_tiledbsoma(self._dataset, mode="r") as experiment:
|
1475
1651
|
value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
|
1476
1652
|
table = slot(experiment).read(value_filter=value_filter).concat()
|
1477
1653
|
|
@@ -1484,7 +1660,7 @@ class SOMACurator(BaseCurator):
|
|
1484
1660
|
lambda val: syn_mapper.get(val, val) # noqa
|
1485
1661
|
)
|
1486
1662
|
# write the mapped values
|
1487
|
-
with _open_tiledbsoma(self.
|
1663
|
+
with _open_tiledbsoma(self._dataset, mode="w") as experiment:
|
1488
1664
|
slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
|
1489
1665
|
# update non_validated dict
|
1490
1666
|
non_val_k = [
|
@@ -1502,8 +1678,9 @@ class SOMACurator(BaseCurator):
|
|
1502
1678
|
|
1503
1679
|
def save_artifact(
|
1504
1680
|
self,
|
1505
|
-
|
1681
|
+
*,
|
1506
1682
|
key: str | None = None,
|
1683
|
+
description: str | None = None,
|
1507
1684
|
revises: Artifact | None = None,
|
1508
1685
|
run: Run | None = None,
|
1509
1686
|
) -> Artifact:
|
@@ -1512,7 +1689,7 @@ class SOMACurator(BaseCurator):
|
|
1512
1689
|
Args:
|
1513
1690
|
description: A description of the ``tiledbsoma`` store.
|
1514
1691
|
key: A path-like key to reference artifact in default storage,
|
1515
|
-
e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a
|
1692
|
+
e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
|
1516
1693
|
revises: Previous version of the artifact. Triggers a revision.
|
1517
1694
|
run: The run that creates the artifact.
|
1518
1695
|
|
@@ -1521,14 +1698,14 @@ class SOMACurator(BaseCurator):
|
|
1521
1698
|
"""
|
1522
1699
|
from lamindb.core._data import add_labels
|
1523
1700
|
|
1524
|
-
if not self.
|
1701
|
+
if not self._is_validated:
|
1525
1702
|
self.validate()
|
1526
|
-
if not self.
|
1703
|
+
if not self._is_validated:
|
1527
1704
|
raise ValidationError("Dataset does not validate. Please curate.")
|
1528
1705
|
|
1529
1706
|
if self._artifact is None:
|
1530
1707
|
artifact = Artifact(
|
1531
|
-
self.
|
1708
|
+
self._dataset,
|
1532
1709
|
description=description,
|
1533
1710
|
key=key,
|
1534
1711
|
revises=revises,
|
@@ -1540,7 +1717,7 @@ class SOMACurator(BaseCurator):
|
|
1540
1717
|
else:
|
1541
1718
|
artifact = self._artifact
|
1542
1719
|
|
1543
|
-
|
1720
|
+
feature_sets = {}
|
1544
1721
|
if len(self._obs_fields) > 0:
|
1545
1722
|
organism = check_registry_organism(
|
1546
1723
|
self._columns_field.field.model, self._organism
|
@@ -1550,7 +1727,7 @@ class SOMACurator(BaseCurator):
|
|
1550
1727
|
empty_dict, schema=self._obs_pa_schema
|
1551
1728
|
).to_pandas()
|
1552
1729
|
# in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
|
1553
|
-
|
1730
|
+
feature_sets["obs"] = Schema.from_df(
|
1554
1731
|
df=mock_df,
|
1555
1732
|
field=self._columns_field,
|
1556
1733
|
mute=True,
|
@@ -1561,238 +1738,1370 @@ class SOMACurator(BaseCurator):
|
|
1561
1738
|
organism = check_registry_organism(
|
1562
1739
|
var_field.field.model, self._organism
|
1563
1740
|
).get("organism")
|
1564
|
-
|
1741
|
+
feature_sets[f"{ms}__var"] = Schema.from_values(
|
1565
1742
|
values=self._validated_values[f"{ms}__{var_key}"],
|
1566
1743
|
field=var_field,
|
1567
1744
|
organism=organism,
|
1568
1745
|
raise_validation_error=False,
|
1569
1746
|
)
|
1570
|
-
artifact.
|
1747
|
+
artifact._staged_feature_sets = feature_sets
|
1748
|
+
|
1749
|
+
feature_ref_is_name = _ref_is_name(self._columns_field)
|
1750
|
+
features = Feature.lookup().dict()
|
1751
|
+
for key, field in self._obs_fields.items():
|
1752
|
+
feature = features.get(key)
|
1753
|
+
registry = field.field.model
|
1754
|
+
organism = check_registry_organism(field.field.model, self._organism).get(
|
1755
|
+
"organism"
|
1756
|
+
)
|
1757
|
+
labels = registry.from_values(
|
1758
|
+
values=self._validated_values[key], field=field, organism=organism
|
1759
|
+
)
|
1760
|
+
if len(labels) == 0:
|
1761
|
+
continue
|
1762
|
+
if hasattr(registry, "_name_field"):
|
1763
|
+
label_ref_is_name = field.field.name == registry._name_field
|
1764
|
+
add_labels(
|
1765
|
+
artifact,
|
1766
|
+
records=labels,
|
1767
|
+
feature=feature,
|
1768
|
+
feature_ref_is_name=feature_ref_is_name,
|
1769
|
+
label_ref_is_name=label_ref_is_name,
|
1770
|
+
from_curator=True,
|
1771
|
+
)
|
1772
|
+
|
1773
|
+
return artifact.save()
|
1774
|
+
|
1775
|
+
|
1776
|
+
class SpatialDataCatManager(CatManager):
|
1777
|
+
"""Curation flow for a ``Spatialdata`` object.
|
1778
|
+
|
1779
|
+
See also :class:`~lamindb.Curator`.
|
1780
|
+
|
1781
|
+
Note that if genes or other measurements are removed from the SpatialData object,
|
1782
|
+
the object should be recreated.
|
1783
|
+
|
1784
|
+
In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
|
1785
|
+
|
1786
|
+
Args:
|
1787
|
+
sdata: The SpatialData object to curate.
|
1788
|
+
var_index: A dictionary mapping table keys to the ``.var`` indices.
|
1789
|
+
categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
|
1790
|
+
|
1791
|
+
organism: The organism name.
|
1792
|
+
sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
|
1793
|
+
exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
|
1794
|
+
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1795
|
+
using the exclude parameter ensures they are not validated.
|
1796
|
+
verbosity: The verbosity level of the logger.
|
1797
|
+
sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
|
1798
|
+
|
1799
|
+
Examples:
|
1800
|
+
>>> import bionty as bt
|
1801
|
+
>>> curator = SpatialDataCatManager(
|
1802
|
+
... sdata,
|
1803
|
+
... var_index={
|
1804
|
+
... "table_1": bt.Gene.ensembl_gene_id,
|
1805
|
+
... },
|
1806
|
+
... categoricals={
|
1807
|
+
... "table1":
|
1808
|
+
... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
|
1809
|
+
... "sample":
|
1810
|
+
... {"experimental_factor": bt.ExperimentalFactor.name},
|
1811
|
+
... },
|
1812
|
+
... organism="human",
|
1813
|
+
... )
|
1814
|
+
"""
|
1815
|
+
|
1816
|
+
def __init__(
|
1817
|
+
self,
|
1818
|
+
sdata: Any,
|
1819
|
+
var_index: dict[str, FieldAttr],
|
1820
|
+
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
1821
|
+
verbosity: str = "hint",
|
1822
|
+
organism: str | None = None,
|
1823
|
+
sources: dict[str, dict[str, Record]] | None = None,
|
1824
|
+
exclude: dict[str, dict] | None = None,
|
1825
|
+
*,
|
1826
|
+
sample_metadata_key: str | None = "sample",
|
1827
|
+
) -> None:
|
1828
|
+
super().__init__(
|
1829
|
+
dataset=sdata,
|
1830
|
+
categoricals={},
|
1831
|
+
sources=sources,
|
1832
|
+
organism=organism,
|
1833
|
+
exclude=exclude,
|
1834
|
+
)
|
1835
|
+
if isinstance(sdata, Artifact):
|
1836
|
+
# TODO: load() doesn't yet work
|
1837
|
+
self._sdata = sdata.load()
|
1838
|
+
else:
|
1839
|
+
self._sdata = self._dataset
|
1840
|
+
self._sample_metadata_key = sample_metadata_key
|
1841
|
+
self._var_fields = var_index
|
1842
|
+
self._verify_accessor_exists(self._var_fields.keys())
|
1843
|
+
self._categoricals = categoricals
|
1844
|
+
self._table_keys = set(self._var_fields.keys()) | set(
|
1845
|
+
self._categoricals.keys() - {self._sample_metadata_key}
|
1846
|
+
)
|
1847
|
+
self._verbosity = verbosity
|
1848
|
+
self._sample_df_curator = None
|
1849
|
+
if self._sample_metadata_key is not None:
|
1850
|
+
self._sample_metadata = self._sdata.get_attrs(
|
1851
|
+
key=self._sample_metadata_key, return_as="df", flatten=True
|
1852
|
+
)
|
1853
|
+
self._is_validated = False
|
1854
|
+
|
1855
|
+
# Check validity of keys in categoricals
|
1856
|
+
nonval_keys = []
|
1857
|
+
for accessor, accessor_categoricals in self._categoricals.items():
|
1858
|
+
if (
|
1859
|
+
accessor == self._sample_metadata_key
|
1860
|
+
and self._sample_metadata is not None
|
1861
|
+
):
|
1862
|
+
for key in accessor_categoricals.keys():
|
1863
|
+
if key not in self._sample_metadata.columns:
|
1864
|
+
nonval_keys.append(key)
|
1865
|
+
else:
|
1866
|
+
for key in accessor_categoricals.keys():
|
1867
|
+
if key not in self._sdata[accessor].obs.columns:
|
1868
|
+
nonval_keys.append(key)
|
1869
|
+
|
1870
|
+
_maybe_curation_keys_not_present(nonval_keys, "categoricals")
|
1871
|
+
|
1872
|
+
# check validity of keys in sources and exclude
|
1873
|
+
for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
|
1874
|
+
nonval_keys = []
|
1875
|
+
for accessor, accessor_sources in dct.items():
|
1876
|
+
if (
|
1877
|
+
accessor == self._sample_metadata_key
|
1878
|
+
and self._sample_metadata is not None
|
1879
|
+
):
|
1880
|
+
columns = self._sample_metadata.columns
|
1881
|
+
elif accessor != self._sample_metadata_key:
|
1882
|
+
columns = self._sdata[accessor].obs.columns
|
1883
|
+
else:
|
1884
|
+
continue
|
1885
|
+
for key in accessor_sources:
|
1886
|
+
if key not in columns:
|
1887
|
+
nonval_keys.append(key)
|
1888
|
+
_maybe_curation_keys_not_present(nonval_keys, name)
|
1889
|
+
|
1890
|
+
# Set up sample level metadata and table Curator objects
|
1891
|
+
if (
|
1892
|
+
self._sample_metadata_key is not None
|
1893
|
+
and self._sample_metadata_key in self._categoricals
|
1894
|
+
):
|
1895
|
+
self._sample_df_curator = DataFrameCatManager(
|
1896
|
+
df=self._sample_metadata,
|
1897
|
+
columns=Feature.name,
|
1898
|
+
categoricals=self._categoricals.get(self._sample_metadata_key, {}),
|
1899
|
+
verbosity=verbosity,
|
1900
|
+
sources=self._sources.get(self._sample_metadata_key),
|
1901
|
+
exclude=self._exclude.get(self._sample_metadata_key),
|
1902
|
+
organism=organism,
|
1903
|
+
)
|
1904
|
+
self._table_adata_curators = {
|
1905
|
+
table: AnnDataCatManager(
|
1906
|
+
data=self._sdata[table],
|
1907
|
+
var_index=var_index.get(table),
|
1908
|
+
categoricals=self._categoricals.get(table),
|
1909
|
+
verbosity=verbosity,
|
1910
|
+
sources=self._sources.get(table),
|
1911
|
+
exclude=self._exclude.get(table),
|
1912
|
+
organism=organism,
|
1913
|
+
)
|
1914
|
+
for table in self._table_keys
|
1915
|
+
}
|
1916
|
+
|
1917
|
+
self._non_validated = None
|
1918
|
+
|
1919
|
+
@property
|
1920
|
+
def var_index(self) -> FieldAttr:
|
1921
|
+
"""Return the registry fields to validate variables indices against."""
|
1922
|
+
return self._var_fields
|
1923
|
+
|
1924
|
+
@property
|
1925
|
+
def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
|
1926
|
+
"""Return the categorical keys and fields to validate against."""
|
1927
|
+
return self._categoricals
|
1928
|
+
|
1929
|
+
@property
|
1930
|
+
def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
|
1931
|
+
"""Return the non-validated features and labels."""
|
1932
|
+
if self._non_validated is None:
|
1933
|
+
raise ValidationError("Please run validate() first!")
|
1934
|
+
return self._non_validated
|
1935
|
+
|
1936
|
+
def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
|
1937
|
+
"""Verify that the accessors exist (either a valid table or in attrs)."""
|
1938
|
+
for acc in accessors:
|
1939
|
+
is_present = False
|
1940
|
+
try:
|
1941
|
+
self._sdata.get_attrs(key=acc)
|
1942
|
+
is_present = True
|
1943
|
+
except KeyError:
|
1944
|
+
if acc in self._sdata.tables.keys():
|
1945
|
+
is_present = True
|
1946
|
+
if not is_present:
|
1947
|
+
raise ValidationError(f"Accessor '{acc}' does not exist!")
|
1948
|
+
|
1949
|
+
def lookup(self, public: bool = False) -> CurateLookup:
|
1950
|
+
"""Look up categories.
|
1951
|
+
|
1952
|
+
Args:
|
1953
|
+
public: Whether the lookup is performed on the public reference.
|
1954
|
+
"""
|
1955
|
+
cat_values_dict = list(self.categoricals.values())[0]
|
1956
|
+
return CurateLookup(
|
1957
|
+
categoricals=cat_values_dict,
|
1958
|
+
slots={"accessors": cat_values_dict.keys()},
|
1959
|
+
public=public,
|
1960
|
+
)
|
1961
|
+
|
1962
|
+
def _update_registry_all(self) -> None:
|
1963
|
+
"""Saves labels of all features for sample and table metadata."""
|
1964
|
+
if self._sample_df_curator is not None:
|
1965
|
+
self._sample_df_curator._update_registry_all(
|
1966
|
+
validated_only=True,
|
1967
|
+
)
|
1968
|
+
for _, adata_curator in self._table_adata_curators.items():
|
1969
|
+
adata_curator._obs_df_curator._update_registry_all(
|
1970
|
+
validated_only=True,
|
1971
|
+
)
|
1972
|
+
|
1973
|
+
def add_new_from_var_index(self, table: str, **kwargs) -> None:
|
1974
|
+
"""Save new values from ``.var.index`` of table.
|
1975
|
+
|
1976
|
+
Args:
|
1977
|
+
table: The table key.
|
1978
|
+
organism: The organism name.
|
1979
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
1980
|
+
"""
|
1981
|
+
if self._non_validated is None:
|
1982
|
+
raise ValidationError("Run .validate() first.")
|
1983
|
+
self._table_adata_curators[table].add_new_from_var_index(**kwargs)
|
1984
|
+
if table in self.non_validated.keys():
|
1985
|
+
if "var_index" in self._non_validated[table]:
|
1986
|
+
self._non_validated[table].pop("var_index")
|
1987
|
+
|
1988
|
+
if len(self.non_validated[table].values()) == 0:
|
1989
|
+
self.non_validated.pop(table)
|
1990
|
+
|
1991
|
+
def add_new_from(
|
1992
|
+
self,
|
1993
|
+
key: str,
|
1994
|
+
accessor: str | None = None,
|
1995
|
+
**kwargs,
|
1996
|
+
) -> None:
|
1997
|
+
"""Save new values of categorical from sample level metadata or table.
|
1998
|
+
|
1999
|
+
Args:
|
2000
|
+
key: The key referencing the slot in the DataFrame.
|
2001
|
+
accessor: The accessor key such as 'sample' or 'table x'.
|
2002
|
+
organism: The organism name.
|
2003
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
2004
|
+
"""
|
2005
|
+
if self._non_validated is None:
|
2006
|
+
raise ValidationError("Run .validate() first.")
|
2007
|
+
|
2008
|
+
if len(kwargs) > 0 and key == "all":
|
2009
|
+
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
2010
|
+
|
2011
|
+
if accessor not in self.categoricals:
|
2012
|
+
raise ValueError(
|
2013
|
+
f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
|
2014
|
+
)
|
2015
|
+
|
2016
|
+
if accessor in self._table_adata_curators:
|
2017
|
+
adata_curator = self._table_adata_curators[accessor]
|
2018
|
+
adata_curator.add_new_from(key=key, **kwargs)
|
2019
|
+
if accessor == self._sample_metadata_key:
|
2020
|
+
self._sample_df_curator.add_new_from(key=key, **kwargs)
|
2021
|
+
|
2022
|
+
if accessor in self.non_validated.keys():
|
2023
|
+
if len(self.non_validated[accessor].values()) == 0:
|
2024
|
+
self.non_validated.pop(accessor)
|
2025
|
+
|
2026
|
+
def standardize(self, key: str, accessor: str | None = None) -> None:
|
2027
|
+
"""Replace synonyms with canonical values.
|
2028
|
+
|
2029
|
+
Modifies the dataset inplace.
|
2030
|
+
|
2031
|
+
Args:
|
2032
|
+
key: The key referencing the slot in the table or sample metadata.
|
2033
|
+
accessor: The accessor key such as 'sample_key' or 'table_key'.
|
2034
|
+
"""
|
2035
|
+
if len(self.non_validated) == 0:
|
2036
|
+
logger.warning("values are already standardized")
|
2037
|
+
return
|
2038
|
+
if self._artifact is not None:
|
2039
|
+
raise RuntimeError("can't mutate the dataset when an artifact is passed!")
|
2040
|
+
|
2041
|
+
if accessor == self._sample_metadata_key:
|
2042
|
+
if key not in self._sample_metadata.columns:
|
2043
|
+
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2044
|
+
else:
|
2045
|
+
if (
|
2046
|
+
key == "var_index" and self._sdata.tables[accessor].var.index is None
|
2047
|
+
) or (
|
2048
|
+
key != "var_index"
|
2049
|
+
and key not in self._sdata.tables[accessor].obs.columns
|
2050
|
+
):
|
2051
|
+
raise ValueError(f"key '{key}' not present in '{accessor}'!")
|
2052
|
+
|
2053
|
+
if accessor in self._table_adata_curators.keys():
|
2054
|
+
adata_curator = self._table_adata_curators[accessor]
|
2055
|
+
adata_curator.standardize(key)
|
2056
|
+
if accessor == self._sample_metadata_key:
|
2057
|
+
self._sample_df_curator.standardize(key)
|
2058
|
+
|
2059
|
+
if len(self.non_validated[accessor].values()) == 0:
|
2060
|
+
self.non_validated.pop(accessor)
|
2061
|
+
|
2062
|
+
def validate(self) -> bool:
|
2063
|
+
"""Validate variables and categorical observations.
|
2064
|
+
|
2065
|
+
This method also registers the validated records in the current instance:
|
2066
|
+
- from public sources
|
2067
|
+
|
2068
|
+
Args:
|
2069
|
+
organism: The organism name.
|
2070
|
+
|
2071
|
+
Returns:
|
2072
|
+
Whether the SpatialData object is validated.
|
2073
|
+
"""
|
2074
|
+
from lamindb.core._settings import settings
|
2075
|
+
|
2076
|
+
# add all validated records to the current instance
|
2077
|
+
verbosity = settings.verbosity
|
2078
|
+
try:
|
2079
|
+
settings.verbosity = "error"
|
2080
|
+
self._update_registry_all()
|
2081
|
+
finally:
|
2082
|
+
settings.verbosity = verbosity
|
2083
|
+
|
2084
|
+
self._non_validated = {} # type: ignore
|
2085
|
+
|
2086
|
+
sample_validated = True
|
2087
|
+
if self._sample_df_curator:
|
2088
|
+
logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
|
2089
|
+
sample_validated &= self._sample_df_curator.validate()
|
2090
|
+
if len(self._sample_df_curator.non_validated) > 0:
|
2091
|
+
self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
|
2092
|
+
logger.print("")
|
2093
|
+
|
2094
|
+
mods_validated = True
|
2095
|
+
for table, adata_curator in self._table_adata_curators.items():
|
2096
|
+
logger.info(f"validating categoricals of table '{table}' ...")
|
2097
|
+
mods_validated &= adata_curator.validate()
|
2098
|
+
if len(adata_curator.non_validated) > 0:
|
2099
|
+
self._non_validated[table] = adata_curator.non_validated # type: ignore
|
2100
|
+
logger.print("")
|
2101
|
+
|
2102
|
+
self._is_validated = sample_validated & mods_validated
|
2103
|
+
return self._is_validated
|
2104
|
+
|
2105
|
+
def save_artifact(
|
2106
|
+
self,
|
2107
|
+
*,
|
2108
|
+
key: str | None = None,
|
2109
|
+
description: str | None = None,
|
2110
|
+
revises: Artifact | None = None,
|
2111
|
+
run: Run | None = None,
|
2112
|
+
) -> Artifact:
|
2113
|
+
if not self._is_validated:
|
2114
|
+
self.validate()
|
2115
|
+
if not self._is_validated:
|
2116
|
+
raise ValidationError("Dataset does not validate. Please curate.")
|
2117
|
+
|
2118
|
+
verbosity = settings.verbosity
|
2119
|
+
try:
|
2120
|
+
settings.verbosity = "warning"
|
2121
|
+
|
2122
|
+
if self._artifact is None:
|
2123
|
+
# Write the SpatialData object to a random path in tmp directory
|
2124
|
+
# The Artifact constructor will move it to the cache
|
2125
|
+
write_path = (
|
2126
|
+
f"{settings.cache_dir}/{random.randint(10**7, 10**8 - 1)}.zarr"
|
2127
|
+
)
|
2128
|
+
self._sdata.write(write_path)
|
2129
|
+
|
2130
|
+
# Create the Artifact and associate Artifact metadata
|
2131
|
+
self._artifact = Artifact(
|
2132
|
+
write_path,
|
2133
|
+
description=description,
|
2134
|
+
key=key,
|
2135
|
+
revises=revises,
|
2136
|
+
run=run,
|
2137
|
+
)
|
2138
|
+
# According to Tim it is not easy to calculate the number of observations.
|
2139
|
+
# We would have to write custom code to iterate over labels (which might not even exist at that point)
|
2140
|
+
self._artifact.otype = "spatialdata"
|
2141
|
+
self._artifact.save()
|
2142
|
+
|
2143
|
+
# Link schemas
|
2144
|
+
feature_kwargs = check_registry_organism(
|
2145
|
+
(list(self._var_fields.values())[0].field.model),
|
2146
|
+
self._organism,
|
2147
|
+
)
|
2148
|
+
|
2149
|
+
        def _add_set_from_spatialdata(
            host: Artifact | Collection | Run,
            var_fields: dict[str, FieldAttr],
            obs_fields: dict[str, FieldAttr] | None = None,
            mute: bool = False,
            organism: str | Record | None = None,
        ):
            """Add Schemas from SpatialData.

            Builds one Schema from the sample-level metadata columns and one
            schema dict per table in ``var_fields``, de-duplicates the schemas
            by hash, assigns them to ``host._staged_feature_sets`` and saves
            ``host``.

            NOTE(review): this closure reads ``self._sample_metadata``,
            ``self._sample_metadata_key`` and ``self._sdata`` from the
            enclosing method scope.
            """
            if obs_fields is None:
                obs_fields = {}
            # only artifacts created by this curator (otype "spatialdata") are supported
            assert host.otype == "spatialdata"  # noqa: S101

            feature_sets = {}

            # sample features
            sample_features = Feature.from_values(self._sample_metadata.columns)  # type: ignore
            if len(sample_features) > 0:
                feature_sets[self._sample_metadata_key] = Schema(
                    features=sample_features
                )

            # table features
            for table, field in var_fields.items():
                table_fs = parse_staged_feature_sets_from_anndata(
                    self._sdata[table],
                    var_field=field,
                    obs_field=obs_fields.get(table, Feature.name),
                    mute=mute,
                    organism=organism,
                )
                # keys are namespaced by table, e.g. "['table'].var"
                for k, v in table_fs.items():
                    feature_sets[f"['{table}'].{k}"] = v

            def _unify_staged_feature_sets_by_hash(
                feature_sets: MutableMapping[str, Schema],
            ):
                """Replace schemas sharing a hash with a single shared instance (in place)."""
                unique_values: dict[str, Any] = {}

                for key, value in feature_sets.items():
                    value_hash = (
                        value.hash
                    )  # Assuming each value has a .hash attribute
                    if value_hash in unique_values:
                        feature_sets[key] = unique_values[value_hash]
                    else:
                        unique_values[value_hash] = value

                return feature_sets

            # link feature sets
            host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
                feature_sets
            )
            host.save()
|
2203
|
+
|
2204
|
+
_add_set_from_spatialdata(
|
2205
|
+
self._artifact, var_fields=self._var_fields, **feature_kwargs
|
2206
|
+
)
|
2207
|
+
|
2208
|
+
# Link labels
|
2209
|
+
        def _add_labels_from_spatialdata(
            data,
            artifact: Artifact,
            fields: dict[str, FieldAttr],
            feature_ref_is_name: bool | None = None,
        ):
            """Add Labels from SpatialData.

            For every (column, field) pair in ``fields``, loads matching label
            records from the field's registry and links them to ``artifact``.

            NOTE(review): this closure reads ``self._organism`` from the
            enclosing method scope.
            """
            features = Feature.lookup().dict()
            for key, field in fields.items():
                feature = features.get(key)
                registry = field.field.model
                # organism-aware filter kwargs for registries that need them
                filter_kwargs = check_registry_organism(registry, self._organism)
                filter_kwargs_current = get_current_filter_kwargs(
                    registry, filter_kwargs
                )
                # a DataFrame is used directly; otherwise its .obs is used
                # (assumes AnnData-like input — TODO confirm)
                df = data if isinstance(data, pd.DataFrame) else data.obs
                labels = registry.from_values(
                    df[key],
                    field=field,
                    **filter_kwargs_current,
                )
                # nothing matched: skip linking for this column
                if len(labels) == 0:
                    continue

                label_ref_is_name = None
                if hasattr(registry, "_name_field"):
                    label_ref_is_name = field.field.name == registry._name_field
                add_labels(
                    artifact,
                    records=labels,
                    feature=feature,
                    feature_ref_is_name=feature_ref_is_name,
                    label_ref_is_name=label_ref_is_name,
                    from_curator=True,
                )
|
2244
|
+
|
2245
|
+
for accessor, accessor_fields in self._categoricals.items():
|
2246
|
+
column_field = self._var_fields.get(accessor)
|
2247
|
+
if accessor == self._sample_metadata_key:
|
2248
|
+
_add_labels_from_spatialdata(
|
2249
|
+
self._sample_metadata,
|
2250
|
+
self._artifact,
|
2251
|
+
accessor_fields,
|
2252
|
+
feature_ref_is_name=(
|
2253
|
+
None if column_field is None else _ref_is_name(column_field)
|
2254
|
+
),
|
2255
|
+
)
|
2256
|
+
else:
|
2257
|
+
_add_labels_from_spatialdata(
|
2258
|
+
self._sdata.tables[accessor],
|
2259
|
+
self._artifact,
|
2260
|
+
accessor_fields,
|
2261
|
+
feature_ref_is_name=(
|
2262
|
+
None if column_field is None else _ref_is_name(column_field)
|
2263
|
+
),
|
2264
|
+
)
|
2265
|
+
|
2266
|
+
finally:
|
2267
|
+
settings.verbosity = verbosity
|
2268
|
+
|
2269
|
+
slug = ln_setup.settings.instance.slug
|
2270
|
+
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
2271
|
+
logger.important(
|
2272
|
+
f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
|
2273
|
+
)
|
2274
|
+
|
2275
|
+
return self._artifact
|
2276
|
+
|
2277
|
+
|
2278
|
+
def _restrict_obs_fields(
    obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
) -> dict[str, FieldAttr]:
    """Restrict ``obs_fields`` to the fields available in ``obs``.

    To simplify the curation, we only validate against either name or ontology_id.
    If both ``{name}`` and ``{name}_ontology_term_id`` columns are present, we
    validate against the ontology_id column only.

    Args:
        obs: The observation-level DataFrame whose columns decide availability.
        obs_fields: Candidate mapping of obs column names to registry fields.

    Returns:
        The subset of ``obs_fields`` whose keys exist in ``obs.columns``,
        excluding plain name columns that are shadowed by an ontology-id column.
    """
    # start from the fields whose column actually exists in obs
    restricted = {k: v for k, v in obs_fields.items() if k in obs.columns}
    for name in obs_fields:
        if name.endswith("_ontology_term_id"):
            continue
        # if both the ontology id and the name are present, only validate on the ontology_id
        if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
            restricted.pop(name)
    # NOTE: the previous implementation also re-added names missing from obs
    # "to raise an error downstream", but then filtered them out again against
    # obs.columns — a no-op. That dead branch is removed here; behavior is
    # unchanged. The return annotation is corrected from dict[str, str] to the
    # FieldAttr values this function actually returns.
    return restricted
|
2305
|
+
|
2306
|
+
|
2307
|
+
def _add_defaults_to_obs(
|
2308
|
+
obs: pd.DataFrame,
|
2309
|
+
defaults: dict[str, str],
|
2310
|
+
) -> None:
|
2311
|
+
"""Add default columns and values to obs DataFrame."""
|
2312
|
+
added_defaults: dict = {}
|
2313
|
+
for name, default in defaults.items():
|
2314
|
+
if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
|
2315
|
+
obs[name] = default
|
2316
|
+
added_defaults[name] = default
|
2317
|
+
logger.important(
|
2318
|
+
f"added default value '{default}' to the adata.obs['{name}']"
|
2319
|
+
)
|
2320
|
+
|
2321
|
+
|
2322
|
+
class CellxGeneAnnDataCatManager(AnnDataCatManager):
|
2323
|
+
"""Annotation flow of AnnData based on CELLxGENE schema."""
|
2324
|
+
|
2325
|
+
_controls_were_created: bool | None = None
|
2326
|
+
|
2327
|
+
    def __init__(
        self,
        adata: ad.AnnData | UPathStr,
        categoricals: dict[str, FieldAttr] | None = None,
        organism: Literal["human", "mouse"] = "human",
        *,
        defaults: dict[str, str] | None = None,
        extra_sources: dict[str, Record] | None = None,
        schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
        verbosity: str = "hint",
    ) -> None:
        """CELLxGENE schema curator.

        Args:
            adata: Path to or AnnData object to curate against the CELLxGENE schema.
            categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
                The CELLxGENE Curator maps against the required CELLxGENE fields by default.
            organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
            defaults: Default values that are set if columns or column values are missing.
            extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
                These extra sources are joined with the CELLxGENE fixed sources.
                Use this parameter when subclassing.
            schema_version: The CELLxGENE schema version to curate against.
            verbosity: The verbosity level.
        """
        import bionty as bt

        # ensure the "unknown"/"na" control records exist before validation
        CellxGeneAnnDataCatManager._init_categoricals_additional_values()

        # CELLxGENE requires Ensembl gene ids as the var index
        var_index: FieldAttr = bt.Gene.ensembl_gene_id

        if categoricals is None:
            categoricals = CellxGeneAnnDataCatManager._get_categoricals()

        self.organism = organism

        VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
        if schema_version not in VALID_SCHEMA_VERSIONS:
            valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
            raise ValueError(
                f"Invalid schema_version: {schema_version}. "
                f"Valid versions are: {valid_versions}"
            )
        self.schema_version = schema_version
        self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
        # ontology versions pinned per schema version ship with the package
        with resources.path(
            "lamindb.curators._cellxgene_schemas", "schema_versions.yml"
        ) as schema_versions_path:
            self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
                self.schema_version
            ]

        # Fetch AnnData obs to be able to set defaults and get sources
        if isinstance(adata, ad.AnnData):
            self._adata_obs = adata.obs
        else:
            self._adata_obs = backed_access(upath.create_path(adata)).obs  # type: ignore

        # Add defaults first to ensure that we fetch valid sources
        if defaults:
            _add_defaults_to_obs(self._adata_obs, defaults)

        # drop entities whose pinned source is not available in this instance
        self.sources = self._create_sources(self._adata_obs)
        self.sources = {
            entity: source
            for entity, source in self.sources.items()
            if source is not None
        }

        # These sources are not a part of the cellxgene schema but rather passed through.
        # This is useful when other Curators extend the CELLxGENE curator
        if extra_sources:
            self.sources = self.sources | extra_sources

        # Exclude default values from validation because they are not available in the pinned sources
        exclude_keys = {
            entity: default
            for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
            if entity in self._adata_obs.columns  # type: ignore
        }

        super().__init__(
            data=adata,
            var_index=var_index,
            categoricals=_restrict_obs_fields(self._adata_obs, categoricals),
            verbosity=verbosity,
            organism=organism,
            sources=self.sources,
            exclude=exclude_keys,
        )
|
2419
|
+
|
2420
|
+
    @classmethod
    def _init_categoricals_additional_values(cls) -> None:
        """Create the control records ("unknown", "na", tissue/suspension types) required by the CELLxGENE schema.

        Creation runs at most once per process: the class-level flag is set from
        whether the 'SuspensionType' type ULabel already exists in the instance.
        """
        import bionty as bt

        import lamindb as ln

        # Note: if you add another control below, be mindful to change the if condition that
        # triggers whether creating these records is re-considered
        if cls._controls_were_created is None:
            cls._controls_were_created = (
                ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
                is not None
            )
        if not cls._controls_were_created:
            logger.important("Creating control labels in the CellxGene schema.")
            bt.CellType(
                ontology_id="unknown",
                name="unknown",
                description="From CellxGene schema.",
            ).save()
            # "normal" is modeled as a Disease record copied from the PATO
            # 'normal' phenotype (PATO:0000461) of the pinned pato source
            pato = bt.Source.filter(name="pato", version="2024-03-28").one()
            normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
            bt.Disease(
                uid=normal.uid,
                name=normal.name,
                ontology_id=normal.ontology_id,
                description=normal.description,
                source=normal.source,
            ).save()
            bt.Ethnicity(
                ontology_id="na", name="na", description="From CellxGene schema."
            ).save()
            bt.Ethnicity(
                ontology_id="unknown",
                name="unknown",
                description="From CellxGene schema.",
            ).save()
            bt.DevelopmentalStage(
                ontology_id="unknown",
                name="unknown",
                description="From CellxGene schema.",
            ).save()
            bt.Phenotype(
                ontology_id="unknown",
                name="unknown",
                description="From CellxGene schema.",
            ).save()

            # tissue_type and suspension_type are plain ULabel hierarchies
            # (typed parent + allowed values), not ontology-backed registries
            tissue_type = ln.ULabel(
                name="TissueType",
                is_type=True,
                description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
            ).save()
            ln.ULabel(
                name="tissue", type=tissue_type, description="From CellxGene schema."
            ).save()
            ln.ULabel(
                name="organoid", type=tissue_type, description="From CellxGene schema."
            ).save()
            ln.ULabel(
                name="cell culture",
                type=tissue_type,
                description="From CellxGene schema.",
            ).save()

            suspension_type = ln.ULabel(
                name="SuspensionType",
                is_type=True,
                description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
            ).save()
            ln.ULabel(
                name="cell", type=suspension_type, description="From CellxGene schema."
            ).save()
            ln.ULabel(
                name="nucleus",
                type=suspension_type,
                description="From CellxGene schema.",
            ).save()
            ln.ULabel(name="na", type=suspension_type).save()
|
2499
|
+
|
2500
|
+
@classmethod
|
2501
|
+
def _get_categoricals(cls) -> dict[str, FieldAttr]:
|
2502
|
+
import bionty as bt
|
2503
|
+
|
2504
|
+
return {
|
2505
|
+
"assay": bt.ExperimentalFactor.name,
|
2506
|
+
"assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
|
2507
|
+
"cell_type": bt.CellType.name,
|
2508
|
+
"cell_type_ontology_term_id": bt.CellType.ontology_id,
|
2509
|
+
"development_stage": bt.DevelopmentalStage.name,
|
2510
|
+
"development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
|
2511
|
+
"disease": bt.Disease.name,
|
2512
|
+
"disease_ontology_term_id": bt.Disease.ontology_id,
|
2513
|
+
# "donor_id": "str", via pandera
|
2514
|
+
"self_reported_ethnicity": bt.Ethnicity.name,
|
2515
|
+
"self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
|
2516
|
+
"sex": bt.Phenotype.name,
|
2517
|
+
"sex_ontology_term_id": bt.Phenotype.ontology_id,
|
2518
|
+
"suspension_type": ULabel.name,
|
2519
|
+
"tissue": bt.Tissue.name,
|
2520
|
+
"tissue_ontology_term_id": bt.Tissue.ontology_id,
|
2521
|
+
"tissue_type": ULabel.name,
|
2522
|
+
"organism": bt.Organism.name,
|
2523
|
+
"organism_ontology_term_id": bt.Organism.ontology_id,
|
2524
|
+
}
|
2525
|
+
|
2526
|
+
@classmethod
|
2527
|
+
def _get_categoricals_defaults(cls) -> dict[str, str]:
|
2528
|
+
return {
|
2529
|
+
"cell_type": "unknown",
|
2530
|
+
"development_stage": "unknown",
|
2531
|
+
"disease": "normal",
|
2532
|
+
"donor_id": "unknown",
|
2533
|
+
"self_reported_ethnicity": "unknown",
|
2534
|
+
"sex": "unknown",
|
2535
|
+
"suspension_type": "cell",
|
2536
|
+
"tissue_type": "tissue",
|
2537
|
+
}
|
2538
|
+
|
2539
|
+
    @property
    def pinned_ontologies(self) -> pd.DataFrame:
        """Ontology versions pinned for the configured CELLxGENE schema version."""
        return self._pinned_ontologies
|
2542
|
+
|
2543
|
+
    @property
    def adata(self) -> AnnData:
        """The AnnData object under curation."""
        # NOTE(review): self._adata is assigned outside this method — presumably
        # by the parent AnnDataCatManager's __init__; confirm.
        return self._adata
|
2546
|
+
|
2547
|
+
    def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
        """Creates a sources dictionary that can be passed to AnnDataCatManager.

        For each CELLxGENE entity, looks up the ontology version pinned for the
        configured schema version and fetches the matching bionty Source record
        (None when the instance does not have that source).
        """
        import bionty as bt

        # fmt: off
        def _fetch_bionty_source(
            entity: str, organism: str, source: str
        ) -> bt.Source | None:
            """Fetch the Bionty source of the pinned ontology.

            Returns None if the source does not exist.
            """
            # look up the pinned version for this (entity, organism, source) triple;
            # NOTE(review): .iloc[0] raises IndexError if the triple is not pinned
            # in schema_versions.yml — assumed to always be present for the
            # entities listed below; confirm.
            version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
                                                  (self._pinned_ontologies["organism"] == organism) &
                                                  (self._pinned_ontologies["source"] == source), "version"].iloc[0]
            return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()

        entity_mapping = {
            "var_index": ("Gene", self.organism, "ensembl"),
            "cell_type": ("CellType", "all", "cl"),
            "assay": ("ExperimentalFactor", "all", "efo"),
            "self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
            "development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
            "disease": ("Disease", "all", "mondo"),
            # "organism": ("Organism", "vertebrates", "ensembl"),
            "sex": ("Phenotype", "all", "pato"),
            "tissue": ("Tissue", "all", "uberon"),
        }
        # fmt: on

        # Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
        entity_to_sources = {
            entity: _fetch_bionty_source(*params)
            for entity, params in entity_mapping.items()
            if entity in obs.columns
            or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
            or entity == "var_index"
        }

        return entity_to_sources
|
2587
|
+
|
2588
|
+
def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
|
2589
|
+
"""Converts a column that stores a name into a column that stores the ontology id.
|
2590
|
+
|
2591
|
+
cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
|
2592
|
+
"""
|
2593
|
+
field_name = field.field.name
|
2594
|
+
assert field_name == "name" # noqa: S101
|
2595
|
+
cols = ["name", "ontology_id"]
|
2596
|
+
registry = field.field.model
|
2597
|
+
|
2598
|
+
if hasattr(registry, "ontology_id"):
|
2599
|
+
validated_records = registry.filter(**{f"{field_name}__in": values})
|
2600
|
+
mapper = (
|
2601
|
+
pd.DataFrame(validated_records.values_list(*cols))
|
2602
|
+
.set_index(0)
|
2603
|
+
.to_dict()[1]
|
2604
|
+
)
|
2605
|
+
return values.map(mapper)
|
2606
|
+
|
2607
|
+
def validate(self) -> bool: # type: ignore
|
2608
|
+
"""Validates the AnnData object against most cellxgene requirements."""
|
2609
|
+
# Verify that all required obs columns are present
|
2610
|
+
missing_obs_fields = [
|
2611
|
+
name
|
2612
|
+
for name in CellxGeneAnnDataCatManager._get_categoricals_defaults().keys()
|
2613
|
+
if name not in self._adata.obs.columns
|
2614
|
+
and f"{name}_ontology_term_id" not in self._adata.obs.columns
|
2615
|
+
]
|
2616
|
+
if len(missing_obs_fields) > 0:
|
2617
|
+
missing_obs_fields_str = ", ".join(list(missing_obs_fields))
|
2618
|
+
logger.error(f"missing required obs columns {missing_obs_fields_str}")
|
2619
|
+
logger.info(
|
2620
|
+
"consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
|
2621
|
+
"to automatically add these columns with default values."
|
2622
|
+
)
|
2623
|
+
return False
|
2624
|
+
|
2625
|
+
# Verify that no cellxgene reserved names are present
|
2626
|
+
reserved_names = {
|
2627
|
+
"ethnicity",
|
2628
|
+
"ethnicity_ontology_term_id",
|
2629
|
+
"X_normalization",
|
2630
|
+
"default_field",
|
2631
|
+
"layer_descriptions",
|
2632
|
+
"tags",
|
2633
|
+
"versions",
|
2634
|
+
"contributors",
|
2635
|
+
"preprint_doi",
|
2636
|
+
"project_description",
|
2637
|
+
"project_links",
|
2638
|
+
"project_name",
|
2639
|
+
"publication_doi",
|
2640
|
+
}
|
2641
|
+
matched_columns = [
|
2642
|
+
column for column in self._adata.obs.columns if column in reserved_names
|
2643
|
+
]
|
2644
|
+
if len(matched_columns) > 0:
|
2645
|
+
raise ValueError(
|
2646
|
+
f"AnnData object must not contain obs columns {matched_columns} which are"
|
2647
|
+
" reserved from previous schema versions."
|
2648
|
+
)
|
1571
2649
|
|
1572
|
-
|
1573
|
-
|
1574
|
-
|
1575
|
-
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
2650
|
+
return super().validate()
|
2651
|
+
|
2652
|
+
def to_cellxgene_anndata(
|
2653
|
+
self, is_primary_data: bool, title: str | None = None
|
2654
|
+
) -> ad.AnnData:
|
2655
|
+
"""Converts the AnnData object to the cellxgene-schema input format.
|
2656
|
+
|
2657
|
+
cellxgene expects the obs fields to be {entity}_ontology_id fields and has many further requirements which are
|
2658
|
+
documented here: https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema.
|
2659
|
+
This function checks for most but not all requirements of the CELLxGENE schema.
|
2660
|
+
If you want to ensure that it fully adheres to the CELLxGENE schema, run `cellxgene-schema` on the AnnData object.
|
2661
|
+
|
2662
|
+
Args:
|
2663
|
+
is_primary_data: Whether the measured data is primary data or not.
|
2664
|
+
title: Title of the AnnData object. Commonly the name of the publication.
|
2665
|
+
|
2666
|
+
Returns:
|
2667
|
+
An AnnData object which adheres to the cellxgene-schema.
|
2668
|
+
"""
|
2669
|
+
# Create a copy since we modify the AnnData object extensively
|
2670
|
+
adata_cxg = self._adata.copy()
|
2671
|
+
|
2672
|
+
# cellxgene requires an embedding
|
2673
|
+
embedding_pattern = r"^[a-zA-Z][a-zA-Z0-9_.-]*$"
|
2674
|
+
exclude_key = "spatial"
|
2675
|
+
matching_keys = [
|
2676
|
+
key
|
2677
|
+
for key in adata_cxg.obsm.keys()
|
2678
|
+
if re.match(embedding_pattern, key) and key != exclude_key
|
2679
|
+
]
|
2680
|
+
if len(matching_keys) == 0:
|
2681
|
+
raise ValueError(
|
2682
|
+
"Unable to find an embedding key. Please calculate an embedding."
|
1579
2683
|
)
|
1580
|
-
|
1581
|
-
|
2684
|
+
|
2685
|
+
# convert name column to ontology_term_id column
|
2686
|
+
for column in adata_cxg.obs.columns:
|
2687
|
+
if column in self.categoricals and not column.endswith("_ontology_term_id"):
|
2688
|
+
mapped_column = self._convert_name_to_ontology_id(
|
2689
|
+
adata_cxg.obs[column], field=self.categoricals.get(column)
|
2690
|
+
)
|
2691
|
+
if mapped_column is not None:
|
2692
|
+
adata_cxg.obs[f"{column}_ontology_term_id"] = mapped_column
|
2693
|
+
|
2694
|
+
# drop the name columns for ontologies. cellxgene does not allow them.
|
2695
|
+
drop_columns = [
|
2696
|
+
i
|
2697
|
+
for i in adata_cxg.obs.columns
|
2698
|
+
if f"{i}_ontology_term_id" in adata_cxg.obs.columns
|
2699
|
+
]
|
2700
|
+
adata_cxg.obs.drop(columns=drop_columns, inplace=True)
|
2701
|
+
|
2702
|
+
# Add cellxgene metadata to AnnData object
|
2703
|
+
if "is_primary_data" not in adata_cxg.obs.columns:
|
2704
|
+
adata_cxg.obs["is_primary_data"] = is_primary_data
|
2705
|
+
if "feature_is_filtered" not in adata_cxg.var.columns:
|
2706
|
+
logger.warn(
|
2707
|
+
"column 'feature_is_filtered' not present in var. Setting to default"
|
2708
|
+
" value of False."
|
1582
2709
|
)
|
1583
|
-
|
1584
|
-
|
1585
|
-
|
1586
|
-
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1590
|
-
|
1591
|
-
|
1592
|
-
|
1593
|
-
|
2710
|
+
adata_cxg.var["feature_is_filtered"] = False
|
2711
|
+
if title is None:
|
2712
|
+
raise ValueError("please pass a title!")
|
2713
|
+
else:
|
2714
|
+
adata_cxg.uns["title"] = title
|
2715
|
+
adata_cxg.uns["cxg_lamin_schema_reference"] = self.schema_reference
|
2716
|
+
adata_cxg.uns["cxg_lamin_schema_version"] = self.schema_version
|
2717
|
+
|
2718
|
+
return adata_cxg
|
2719
|
+
|
2720
|
+
|
2721
|
+
class ValueUnit:
|
2722
|
+
"""Base class for handling value-unit combinations."""
|
2723
|
+
|
2724
|
+
@staticmethod
|
2725
|
+
def parse_value_unit(value: str, is_dose: bool = True) -> tuple[str, str] | None:
|
2726
|
+
"""Parse a string containing a value and unit into a tuple."""
|
2727
|
+
if not isinstance(value, str) or not value.strip():
|
2728
|
+
return None
|
2729
|
+
|
2730
|
+
value = str(value).strip()
|
2731
|
+
match = re.match(r"^(\d*\.?\d{0,1})\s*([a-zA-ZμµΜ]+)$", value)
|
2732
|
+
|
2733
|
+
if not match:
|
2734
|
+
raise ValueError(
|
2735
|
+
f"Invalid format: {value}. Expected format: number with max 1 decimal place + unit"
|
2736
|
+
)
|
2737
|
+
|
2738
|
+
number, unit = match.groups()
|
2739
|
+
formatted_number = f"{float(number):.1f}"
|
2740
|
+
|
2741
|
+
if is_dose:
|
2742
|
+
standardized_unit = DoseHandler.standardize_unit(unit)
|
2743
|
+
if not DoseHandler.validate_unit(standardized_unit):
|
2744
|
+
raise ValueError(
|
2745
|
+
f"Invalid dose unit: {unit}. Must be convertible to one of: nM, μM, mM, M"
|
2746
|
+
)
|
2747
|
+
else:
|
2748
|
+
standardized_unit = TimeHandler.standardize_unit(unit)
|
2749
|
+
if not TimeHandler.validate_unit(standardized_unit):
|
2750
|
+
raise ValueError(
|
2751
|
+
f"Invalid time unit: {unit}. Must be convertible to one of: h, m, s, d, y"
|
1594
2752
|
)
|
1595
2753
|
|
1596
|
-
return
|
2754
|
+
return formatted_number, standardized_unit
|
1597
2755
|
|
1598
2756
|
|
1599
|
-
class
|
1600
|
-
"""
|
2757
|
+
class DoseHandler:
    """Utilities for validating and normalizing dose value/unit strings."""

    VALID_UNITS = {"nM", "μM", "µM", "mM", "M"}
    UNIT_MAP = {
        "nm": "nM",
        "NM": "nM",
        "um": "μM",
        "UM": "μM",
        "μm": "μM",
        "μM": "μM",
        "µm": "μM",
        "µM": "μM",
        "mm": "mM",
        "MM": "mM",
        "m": "M",
        "M": "M",
    }

    @classmethod
    def validate_unit(cls, unit: str) -> bool:
        """Return True when ``unit`` is one of the accepted dose units."""
        return unit in cls.VALID_UNITS

    @classmethod
    def standardize_unit(cls, unit: str) -> str:
        """Map a unit spelling to its canonical form; unknown spellings pass through."""
        return cls.UNIT_MAP.get(unit, unit)

    @classmethod
    def validate_values(cls, values: pd.Series) -> list:
        """Collect one error message per invalid pert_dose entry."""
        problems = []
        for row_idx, entry in values.items():
            # missing entries are acceptable
            if pd.isna(entry):
                continue
            # a bare number lacks the mandatory unit suffix
            if isinstance(entry, (int, float)):
                problems.append(
                    f"Row {row_idx} - Missing unit for dose: {entry}. Must include a unit (nM, μM, mM, M)"
                )
                continue
            try:
                ValueUnit.parse_value_unit(entry, is_dose=True)
            except ValueError as e:
                problems.append(f"Row {row_idx} - {str(e)}")
        return problems
|
2807
|
+
|
2808
|
+
|
2809
|
+
class TimeHandler:
    """Utilities for validating and normalizing time value/unit strings."""

    VALID_UNITS = {"h", "m", "s", "d", "y"}

    @classmethod
    def validate_unit(cls, unit: str) -> bool:
        """Return True when ``unit`` is an accepted, lowercase time unit."""
        return unit == unit.lower() and unit in cls.VALID_UNITS

    @classmethod
    def standardize_unit(cls, unit: str) -> str:
        """Collapse a unit spelling to its one-letter canonical form."""
        if unit.startswith("hr"):
            return "h"
        if unit.startswith("min"):
            return "m"
        if unit.startswith("sec"):
            return "s"
        # otherwise fall back to the lowercased first character (e.g. "days" -> "d")
        return unit[0].lower()

    @classmethod
    def validate_values(cls, values: pd.Series) -> list:
        """Collect one error message per invalid pert_time entry."""
        problems = []
        for row_idx, entry in values.items():
            # missing entries are acceptable
            if pd.isna(entry):
                continue
            # a bare number lacks the mandatory unit suffix
            if isinstance(entry, (int, float)):
                problems.append(
                    f"Row {row_idx} - Missing unit for time: {entry}. Must include a unit (h, m, s, d, y)"
                )
                continue
            try:
                ValueUnit.parse_value_unit(entry, is_dose=False)
            except ValueError as e:
                problems.append(f"Row {row_idx} - {str(e)}")
        return problems
|
2851
|
+
|
2852
|
+
|
2853
|
+
class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
|
2854
|
+
"""Curator flow for Perturbation data."""
|
2855
|
+
|
2856
|
+
PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
|
2857
|
+
|
2858
|
+
    def __init__(
        self,
        adata: ad.AnnData,
        organism: Literal["human", "mouse"] = "human",
        pert_dose: bool = True,
        pert_time: bool = True,
        *,
        verbosity: str = "hint",
        cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
    ):
        """Initialize the curator with configuration and validation settings.

        Args:
            adata: The AnnData object containing perturbation data to curate.
            organism: The organism name, restricted to 'human' and 'mouse'.
            pert_dose: Whether pert_dose values should be validated.
            pert_time: Whether pert_time values should be validated.
            verbosity: The verbosity level.
            cxg_schema_version: The CELLxGENE schema version to curate against.
        """
        # NOTE(review): `bt` is unused in this method — candidate for removal
        import bionty as bt

        self._pert_time = pert_time
        self._pert_dose = pert_dose

        # structural pre-checks, then derive categoricals/defaults from adata.obs
        self._validate_initial_data(adata)
        self._setup_configuration(adata)

        self._setup_sources(adata)
        # NOTE(review): _setup_compound_source is defined outside this view — confirm
        self._setup_compound_source()

        super().__init__(
            adata=adata,
            categoricals=self.PT_CATEGORICALS,
            defaults=self.PT_DEFAULT_VALUES,
            verbosity=verbosity,
            organism=organism,
            extra_sources=self.PT_SOURCES,
            schema_version=cxg_schema_version,
        )
|
1690
2889
|
|
1691
|
-
|
1692
|
-
|
1693
|
-
|
1694
|
-
|
1695
|
-
|
1696
|
-
|
1697
|
-
|
1698
|
-
|
1699
|
-
|
1700
|
-
|
1701
|
-
|
1702
|
-
exclude: dict[str, str | list[str]] | None = None,
|
1703
|
-
) -> SOMACurator:
|
1704
|
-
"""{}""" # noqa: D415
|
1705
|
-
return SOMACurator(
|
1706
|
-
experiment_uri=experiment_uri,
|
1707
|
-
var_index=var_index,
|
1708
|
-
categoricals=categoricals,
|
1709
|
-
obs_columns=obs_columns,
|
1710
|
-
using_key=using_key,
|
1711
|
-
organism=organism,
|
1712
|
-
sources=sources,
|
1713
|
-
exclude=exclude,
|
2890
|
+
    def _setup_configuration(self, adata: ad.AnnData):
        """Set up default configuration values.

        Populates ``self.PT_DEFAULT_VALUES`` (CELLxGENE defaults plus
        perturbation-specific ones) and ``self.PT_CATEGORICALS`` (CELLxGENE
        categoricals plus those perturbation columns present in ``adata.obs``).
        """
        import bionty as bt
        import wetlab as wl

        self.PT_DEFAULT_VALUES = (
            CellxGeneAnnDataCatManager._get_categoricals_defaults()
            | {
                "cell_line": "unknown",
                "pert_target": "unknown",
            }
        )

        # only map perturbation columns that actually exist in the data
        self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
            k: v
            for k, v in {
                "cell_line": bt.CellLine.name,
                "pert_target": wl.PerturbationTarget.name,
                "pert_genetic": wl.GeneticPerturbation.name,
                "pert_compound": wl.Compound.name,
                "pert_biologic": wl.Biologic.name,
                "pert_physical": wl.EnvironmentalPerturbation.name,
            }.items()
            if k in adata.obs.columns
        }
        # if "donor_id" in self.PT_CATEGORICALS:
        #     self.PT_CATEGORICALS["donor_id"] = Donor.name
|
2917
|
+
|
2918
|
+
def _setup_sources(self, adata: ad.AnnData):
|
2919
|
+
"""Set up data sources."""
|
2920
|
+
self.PT_SOURCES = {}
|
2921
|
+
# if "cell_line" in adata.obs.columns:
|
2922
|
+
# self.PT_SOURCES["cell_line"] = (
|
2923
|
+
# bt.Source.filter(name="depmap").first()
|
2924
|
+
# )
|
2925
|
+
if "pert_compound" in adata.obs.columns:
|
2926
|
+
import bionty as bt
|
2927
|
+
|
2928
|
+
self.PT_SOURCES["pert_compound"] = bt.Source.filter(
|
2929
|
+
entity="wetlab.Compound", name="chebi"
|
2930
|
+
).first()
|
2931
|
+
|
2932
|
+
def _validate_initial_data(self, adata: ad.AnnData):
|
2933
|
+
"""Validate the initial data structure."""
|
2934
|
+
self._validate_required_columns(adata)
|
2935
|
+
self._validate_perturbation_types(adata)
|
2936
|
+
|
2937
|
+
def _validate_required_columns(self, adata: ad.AnnData):
|
2938
|
+
"""Validate required columns are present."""
|
2939
|
+
if "pert_target" not in adata.obs.columns:
|
2940
|
+
if (
|
2941
|
+
"pert_name" not in adata.obs.columns
|
2942
|
+
or "pert_type" not in adata.obs.columns
|
2943
|
+
):
|
2944
|
+
raise ValidationError(
|
2945
|
+
"either 'pert_target' or both 'pert_name' and 'pert_type' must be present"
|
2946
|
+
)
|
2947
|
+
else:
|
2948
|
+
if "pert_name" not in adata.obs.columns:
|
2949
|
+
logger.warning(
|
2950
|
+
"no 'pert' column found in adata.obs, will only curate 'pert_target'"
|
2951
|
+
)
|
2952
|
+
elif "pert_type" not in adata.obs.columns:
|
2953
|
+
raise ValidationError("both 'pert' and 'pert_type' must be present")
|
2954
|
+
|
2955
|
+
def _validate_perturbation_types(self, adata: ad.AnnData):
|
2956
|
+
"""Validate perturbation types."""
|
2957
|
+
if "pert_type" in adata.obs.columns:
|
2958
|
+
data_pert_types = set(adata.obs["pert_type"].unique())
|
2959
|
+
invalid_pert_types = data_pert_types - self.PERT_COLUMNS
|
2960
|
+
if invalid_pert_types:
|
2961
|
+
raise ValidationError(
|
2962
|
+
f"invalid pert_type found: {invalid_pert_types}!\n"
|
2963
|
+
f" → allowed values: {self.PERT_COLUMNS}"
|
2964
|
+
)
|
2965
|
+
self._process_perturbation_types(adata, data_pert_types)
|
2966
|
+
|
2967
|
+
def _process_perturbation_types(self, adata: ad.AnnData, pert_types: set):
|
2968
|
+
"""Process and map perturbation types."""
|
2969
|
+
for pert_type in pert_types:
|
2970
|
+
col_name = "pert_" + pert_type
|
2971
|
+
adata.obs[col_name] = adata.obs["pert_name"].where(
|
2972
|
+
adata.obs["pert_type"] == pert_type, None
|
2973
|
+
)
|
2974
|
+
if adata.obs[col_name].dtype.name == "category":
|
2975
|
+
adata.obs[col_name].cat.remove_unused_categories()
|
2976
|
+
logger.important(f"mapped 'pert_name' to '{col_name}'")
|
1731
2977
|
|
1732
|
-
|
2978
|
+
def _setup_compound_source(self):
|
2979
|
+
"""Set up the compound source with muted logging."""
|
2980
|
+
import bionty as bt
|
2981
|
+
import wetlab as wl
|
2982
|
+
|
2983
|
+
with logger.mute():
|
2984
|
+
chebi_source = bt.Source.filter(
|
2985
|
+
entity="wetlab.Compound", name="chebi"
|
2986
|
+
).first()
|
2987
|
+
if not chebi_source:
|
2988
|
+
wl.Compound.add_source(
|
2989
|
+
bt.Source.filter(entity="Drug", name="chebi").first()
|
2990
|
+
)
|
1733
2991
|
|
1734
|
-
|
1735
|
-
the object
|
2992
|
+
def validate(self) -> bool: # type: ignore
|
2993
|
+
"""Validate the AnnData object."""
|
2994
|
+
validated = super().validate()
|
2995
|
+
|
2996
|
+
if self._pert_dose:
|
2997
|
+
validated &= self._validate_dose_column()
|
2998
|
+
if self._pert_time:
|
2999
|
+
validated &= self._validate_time_column()
|
3000
|
+
|
3001
|
+
self._is_validated = validated
|
3002
|
+
|
3003
|
+
# sort columns
|
3004
|
+
first_columns = [
|
3005
|
+
"pert_target",
|
3006
|
+
"pert_genetic",
|
3007
|
+
"pert_compound",
|
3008
|
+
"pert_biologic",
|
3009
|
+
"pert_physical",
|
3010
|
+
"pert_dose",
|
3011
|
+
"pert_time",
|
3012
|
+
"organism",
|
3013
|
+
"cell_line",
|
3014
|
+
"cell_type",
|
3015
|
+
"disease",
|
3016
|
+
"tissue_type",
|
3017
|
+
"tissue",
|
3018
|
+
"assay",
|
3019
|
+
"suspension_type",
|
3020
|
+
"donor_id",
|
3021
|
+
"sex",
|
3022
|
+
"self_reported_ethnicity",
|
3023
|
+
"development_stage",
|
3024
|
+
"pert_name",
|
3025
|
+
"pert_type",
|
3026
|
+
]
|
3027
|
+
sorted_columns = [
|
3028
|
+
col for col in first_columns if col in self._adata.obs.columns
|
3029
|
+
] + [col for col in self._adata.obs.columns if col not in first_columns]
|
3030
|
+
# must assign to self._df to ensure .standardize works correctly
|
3031
|
+
self._obs_df = self._adata.obs[sorted_columns]
|
3032
|
+
self._adata.obs = self._obs_df
|
3033
|
+
return validated
|
3034
|
+
|
3035
|
+
def standardize(self, key: str) -> pd.DataFrame:
|
3036
|
+
"""Standardize the AnnData object."""
|
3037
|
+
super().standardize(key)
|
3038
|
+
self._adata.obs = self._obs_df
|
3039
|
+
|
3040
|
+
def _validate_dose_column(self) -> bool:
|
3041
|
+
"""Validate the dose column."""
|
3042
|
+
if not Feature.filter(name="pert_dose").exists():
|
3043
|
+
Feature(name="pert_dose", dtype="str").save() # type: ignore
|
3044
|
+
|
3045
|
+
dose_errors = DoseHandler.validate_values(self._adata.obs["pert_dose"])
|
3046
|
+
if dose_errors:
|
3047
|
+
self._log_validation_errors("pert_dose", dose_errors)
|
3048
|
+
return False
|
3049
|
+
return True
|
3050
|
+
|
3051
|
+
def _validate_time_column(self) -> bool:
|
3052
|
+
"""Validate the time column."""
|
3053
|
+
if not Feature.filter(name="pert_time").exists():
|
3054
|
+
Feature(name="pert_time", dtype="str").save() # type: ignore
|
3055
|
+
|
3056
|
+
time_errors = TimeHandler.validate_values(self._adata.obs["pert_time"])
|
3057
|
+
if time_errors:
|
3058
|
+
self._log_validation_errors("pert_time", time_errors)
|
3059
|
+
return False
|
3060
|
+
return True
|
3061
|
+
|
3062
|
+
def _log_validation_errors(self, column: str, errors: list):
|
3063
|
+
"""Log validation errors with formatting."""
|
3064
|
+
errors_print = "\n ".join(errors)
|
3065
|
+
logger.warning(
|
3066
|
+
f"invalid {column} values found!\n {errors_print}\n"
|
3067
|
+
f" → run {colors.cyan('standardize_dose_time()')}"
|
3068
|
+
)
|
1736
3069
|
|
1737
|
-
|
3070
|
+
def standardize_dose_time(self) -> pd.DataFrame:
|
3071
|
+
"""Standardize dose and time values."""
|
3072
|
+
standardized_df = self._adata.obs.copy()
|
1738
3073
|
|
1739
|
-
|
1740
|
-
|
1741
|
-
|
1742
|
-
|
1743
|
-
using_key: A reference LaminDB instance.
|
1744
|
-
organism: The organism name.
|
1745
|
-
sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
|
1746
|
-
exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
|
1747
|
-
When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
|
1748
|
-
using the exclude parameter ensures they are not validated.
|
1749
|
-
verbosity: The verbosity level of the logger.
|
1750
|
-
sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
|
1751
|
-
|
1752
|
-
Examples:
|
1753
|
-
>>> import lamindb as ln
|
1754
|
-
>>> import bionty as bt
|
1755
|
-
>>> curator = ln.Curator.from_spatialdata(
|
1756
|
-
... sdata,
|
1757
|
-
... var_index={
|
1758
|
-
... "table_1": bt.Gene.ensembl_gene_id,
|
1759
|
-
... },
|
1760
|
-
... categoricals={
|
1761
|
-
... "table1":
|
1762
|
-
... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name},
|
1763
|
-
... "sample":
|
1764
|
-
... {"experimental_factor": bt.ExperimentalFactor.name},
|
1765
|
-
... },
|
1766
|
-
... organism="human",
|
1767
|
-
... )
|
1768
|
-
"""
|
1769
|
-
try:
|
1770
|
-
import spatialdata
|
1771
|
-
except ImportError as e:
|
1772
|
-
raise ImportError(
|
1773
|
-
"Please install spatialdata: pip install spatialdata"
|
1774
|
-
) from e
|
3074
|
+
if "pert_dose" in self._adata.obs.columns:
|
3075
|
+
standardized_df = self._standardize_column(
|
3076
|
+
standardized_df, "pert_dose", is_dose=True
|
3077
|
+
)
|
1775
3078
|
|
1776
|
-
|
3079
|
+
if "pert_time" in self._adata.obs.columns:
|
3080
|
+
standardized_df = self._standardize_column(
|
3081
|
+
standardized_df, "pert_time", is_dose=False
|
3082
|
+
)
|
1777
3083
|
|
1778
|
-
|
1779
|
-
|
1780
|
-
|
1781
|
-
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
|
1787
|
-
|
1788
|
-
|
3084
|
+
self._adata.obs = standardized_df
|
3085
|
+
return standardized_df
|
3086
|
+
|
3087
|
+
def _standardize_column(
|
3088
|
+
self, df: pd.DataFrame, column: str, is_dose: bool
|
3089
|
+
) -> pd.DataFrame:
|
3090
|
+
"""Standardize values in a specific column."""
|
3091
|
+
for idx, value in self._adata.obs[column].items():
|
3092
|
+
if pd.isna(value) or (
|
3093
|
+
isinstance(value, str) and (not value.strip() or value.lower() == "nan")
|
3094
|
+
):
|
3095
|
+
df.at[idx, column] = None
|
3096
|
+
continue
|
1789
3097
|
|
3098
|
+
try:
|
3099
|
+
num, unit = ValueUnit.parse_value_unit(value, is_dose=is_dose)
|
3100
|
+
df.at[idx, column] = f"{num}{unit}"
|
3101
|
+
except ValueError:
|
3102
|
+
continue
|
1790
3103
|
|
1791
|
-
|
1792
|
-
"""Get a registry instance using a specific instance."""
|
1793
|
-
if using_key is not None and using_key != "default":
|
1794
|
-
return registry.using(using_key)
|
1795
|
-
return registry
|
3104
|
+
return df
|
1796
3105
|
|
1797
3106
|
|
1798
3107
|
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
|
@@ -1871,11 +3180,11 @@ def validate_categories(
|
|
1871
3180
|
values: Iterable[str],
|
1872
3181
|
field: FieldAttr,
|
1873
3182
|
key: str,
|
1874
|
-
using_key: str | None = None,
|
1875
3183
|
organism: str | None = None,
|
1876
3184
|
source: Record | None = None,
|
1877
3185
|
exclude: str | list | None = None,
|
1878
3186
|
hint_print: str | None = None,
|
3187
|
+
curator: CatManager | None = None,
|
1879
3188
|
) -> tuple[bool, list]:
|
1880
3189
|
"""Validate ontology terms in a pandas series using LaminDB registries.
|
1881
3190
|
|
@@ -1883,7 +3192,6 @@ def validate_categories(
|
|
1883
3192
|
values: The values to validate.
|
1884
3193
|
field: The field attribute.
|
1885
3194
|
key: The key referencing the slot in the DataFrame.
|
1886
|
-
using_key: A reference LaminDB instance.
|
1887
3195
|
organism: The organism name.
|
1888
3196
|
source: The source record.
|
1889
3197
|
exclude: Exclude specific values from validation.
|
@@ -1918,22 +3226,8 @@ def validate_categories(
|
|
1918
3226
|
non_validated = inspect_result.non_validated
|
1919
3227
|
syn_mapper = inspect_result.synonyms_mapper
|
1920
3228
|
|
1921
|
-
# inspect the non-validated values from the using_key instance
|
1922
|
-
values_validated = []
|
1923
|
-
if using_key is not None and using_key != "default" and non_validated:
|
1924
|
-
registry_using = get_registry_instance(registry, using_key)
|
1925
|
-
inspect_result = inspect_instance(
|
1926
|
-
values=non_validated,
|
1927
|
-
field=field,
|
1928
|
-
registry=registry_using,
|
1929
|
-
exclude=exclude,
|
1930
|
-
**kwargs,
|
1931
|
-
)
|
1932
|
-
non_validated = inspect_result.non_validated
|
1933
|
-
values_validated += inspect_result.validated
|
1934
|
-
syn_mapper.update(inspect_result.synonyms_mapper)
|
1935
|
-
|
1936
3229
|
# inspect the non-validated values from public (bionty only)
|
3230
|
+
values_validated = []
|
1937
3231
|
if hasattr(registry, "public"):
|
1938
3232
|
verbosity = settings.verbosity
|
1939
3233
|
try:
|
@@ -1975,6 +3269,10 @@ def validate_categories(
|
|
1975
3269
|
if logger.indent == "":
|
1976
3270
|
_log_mapping_info()
|
1977
3271
|
logger.warning(warning_message)
|
3272
|
+
if curator is not None:
|
3273
|
+
curator._validate_category_error_messages = strip_ansi_codes(
|
3274
|
+
warning_message
|
3275
|
+
)
|
1978
3276
|
logger.indent = ""
|
1979
3277
|
return False, non_validated
|
1980
3278
|
|
@@ -1982,7 +3280,6 @@ def validate_categories(
|
|
1982
3280
|
def standardize_categories(
|
1983
3281
|
values: Iterable[str],
|
1984
3282
|
field: FieldAttr,
|
1985
|
-
using_key: str | None = None,
|
1986
3283
|
organism: str | None = None,
|
1987
3284
|
source: Record | None = None,
|
1988
3285
|
) -> dict:
|
@@ -1999,30 +3296,15 @@ def standardize_categories(
|
|
1999
3296
|
mute=True,
|
2000
3297
|
return_mapper=True,
|
2001
3298
|
)
|
2002
|
-
|
2003
|
-
if len(values) > len(syn_mapper): # type: ignore
|
2004
|
-
# standardize values using the using_key instance
|
2005
|
-
if using_key is not None and using_key != "default":
|
2006
|
-
registry_using = get_registry_instance(registry, using_key)
|
2007
|
-
syn_mapper.update(
|
2008
|
-
registry_using.standardize(
|
2009
|
-
[v for v in values if v not in syn_mapper],
|
2010
|
-
field=field.field.name,
|
2011
|
-
organism=organism,
|
2012
|
-
source=source,
|
2013
|
-
mute=True,
|
2014
|
-
return_mapper=True,
|
2015
|
-
)
|
2016
|
-
)
|
2017
3299
|
return syn_mapper
|
2018
3300
|
|
2019
3301
|
|
2020
3302
|
def validate_categories_in_df(
|
2021
3303
|
df: pd.DataFrame,
|
2022
3304
|
fields: dict[str, FieldAttr],
|
2023
|
-
using_key: str | None = None,
|
2024
3305
|
sources: dict[str, Record] = None,
|
2025
3306
|
exclude: dict | None = None,
|
3307
|
+
curator: CatManager | None = None,
|
2026
3308
|
**kwargs,
|
2027
3309
|
) -> tuple[bool, dict]:
|
2028
3310
|
"""Validate categories in DataFrame columns using LaminDB registries."""
|
@@ -2038,9 +3320,9 @@ def validate_categories_in_df(
|
|
2038
3320
|
df[key],
|
2039
3321
|
field=field,
|
2040
3322
|
key=key,
|
2041
|
-
using_key=using_key,
|
2042
3323
|
source=sources.get(key),
|
2043
3324
|
exclude=exclude.get(key) if exclude else None,
|
3325
|
+
curator=curator,
|
2044
3326
|
**kwargs,
|
2045
3327
|
)
|
2046
3328
|
validated &= is_val
|
@@ -2055,80 +3337,72 @@ def save_artifact(
|
|
2055
3337
|
columns_field: FieldAttr | dict[str, FieldAttr],
|
2056
3338
|
description: str | None = None,
|
2057
3339
|
organism: str | None = None,
|
2058
|
-
adata: ad.AnnData | None = None,
|
2059
3340
|
key: str | None = None,
|
3341
|
+
artifact: Artifact | None = None,
|
2060
3342
|
revises: Artifact | None = None,
|
2061
3343
|
run: Run | None = None,
|
3344
|
+
schema: Schema | None = None,
|
2062
3345
|
) -> Artifact:
|
2063
3346
|
"""Save all metadata with an Artifact.
|
2064
3347
|
|
2065
3348
|
Args:
|
2066
|
-
data: The DataFrame
|
3349
|
+
data: The DataFrame/AnnData/MuData object to save.
|
2067
3350
|
fields: A dictionary mapping obs_column to registry_field.
|
2068
3351
|
columns_field: The registry field to validate variables index against.
|
2069
3352
|
description: A description of the artifact.
|
2070
3353
|
organism: The organism name.
|
2071
|
-
adata: The AnnData object to save and get n_observations, must be provided if data is a path.
|
2072
3354
|
type: The artifact type.
|
2073
|
-
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a
|
3355
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
|
3356
|
+
artifact: A already registered artifact. Passing this will not save a new artifact from data.
|
2074
3357
|
revises: Previous version of the artifact. Triggers a revision.
|
2075
3358
|
run: The run that creates the artifact.
|
2076
3359
|
|
2077
3360
|
Returns:
|
2078
3361
|
The saved Artifact.
|
2079
3362
|
"""
|
2080
|
-
from .._artifact import data_is_anndata
|
3363
|
+
from .._artifact import data_is_anndata, data_is_mudata
|
2081
3364
|
from ..core._data import add_labels
|
2082
3365
|
|
2083
|
-
artifact = None
|
2084
|
-
if data_is_anndata(data):
|
2085
|
-
assert adata is not None # noqa: S101
|
2086
|
-
artifact = Artifact.from_anndata(
|
2087
|
-
data, description=description, key=key, revises=revises, run=run
|
2088
|
-
)
|
2089
|
-
artifact.n_observations = adata.shape[0]
|
2090
|
-
data = adata
|
2091
|
-
|
2092
|
-
elif isinstance(data, pd.DataFrame):
|
2093
|
-
artifact = Artifact.from_df(
|
2094
|
-
data, description=description, key=key, revises=revises, run=run
|
2095
|
-
)
|
2096
|
-
else:
|
2097
|
-
try:
|
2098
|
-
from mudata import MuData
|
2099
|
-
|
2100
|
-
if isinstance(data, MuData):
|
2101
|
-
artifact = Artifact.from_mudata(
|
2102
|
-
data,
|
2103
|
-
description=description,
|
2104
|
-
key=key,
|
2105
|
-
revises=revises,
|
2106
|
-
run=run,
|
2107
|
-
)
|
2108
|
-
artifact.n_observations = data.n_obs
|
2109
|
-
except ImportError:
|
2110
|
-
pass
|
2111
3366
|
if artifact is None:
|
2112
|
-
|
3367
|
+
if data_is_anndata(data):
|
3368
|
+
artifact = Artifact.from_anndata(
|
3369
|
+
data, description=description, key=key, revises=revises, run=run
|
3370
|
+
)
|
3371
|
+
elif isinstance(data, pd.DataFrame):
|
3372
|
+
artifact = Artifact.from_df(
|
3373
|
+
data, description=description, key=key, revises=revises, run=run
|
3374
|
+
)
|
3375
|
+
elif data_is_mudata(data):
|
3376
|
+
artifact = Artifact.from_mudata(
|
3377
|
+
data,
|
3378
|
+
description=description,
|
3379
|
+
key=key,
|
3380
|
+
revises=revises,
|
3381
|
+
run=run,
|
3382
|
+
)
|
3383
|
+
artifact.schema = schema
|
2113
3384
|
artifact.save()
|
2114
3385
|
|
2115
|
-
|
2116
|
-
(
|
2117
|
-
|
2118
|
-
|
2119
|
-
|
2120
|
-
|
2121
|
-
|
2122
|
-
|
3386
|
+
if organism is not None:
|
3387
|
+
feature_kwargs = check_registry_organism(
|
3388
|
+
(
|
3389
|
+
list(columns_field.values())[0].field.model
|
3390
|
+
if isinstance(columns_field, dict)
|
3391
|
+
else columns_field.field.model
|
3392
|
+
),
|
3393
|
+
organism,
|
3394
|
+
)
|
3395
|
+
else:
|
3396
|
+
feature_kwargs = {}
|
2123
3397
|
|
2124
3398
|
if artifact.otype == "DataFrame":
|
2125
|
-
artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
|
3399
|
+
artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
|
2126
3400
|
elif artifact.otype == "AnnData":
|
2127
|
-
artifact.features._add_set_from_anndata(
|
3401
|
+
artifact.features._add_set_from_anndata( # type: ignore
|
2128
3402
|
var_field=columns_field, **feature_kwargs
|
2129
3403
|
)
|
2130
3404
|
elif artifact.otype == "MuData":
|
2131
|
-
artifact.features._add_set_from_mudata(
|
3405
|
+
artifact.features._add_set_from_mudata( # type: ignore
|
2132
3406
|
var_fields=columns_field, **feature_kwargs
|
2133
3407
|
)
|
2134
3408
|
else:
|
@@ -2202,7 +3476,7 @@ def save_artifact(
|
|
2202
3476
|
)
|
2203
3477
|
|
2204
3478
|
slug = ln_setup.settings.instance.slug
|
2205
|
-
if ln_setup.settings.instance.is_remote: #
|
3479
|
+
if ln_setup.settings.instance.is_remote: # pdagma: no cover
|
2206
3480
|
logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
|
2207
3481
|
return artifact
|
2208
3482
|
|
@@ -2224,7 +3498,6 @@ def update_registry(
|
|
2224
3498
|
values: list[str],
|
2225
3499
|
field: FieldAttr,
|
2226
3500
|
key: str,
|
2227
|
-
using_key: str | None = None,
|
2228
3501
|
validated_only: bool = True,
|
2229
3502
|
df: pd.DataFrame | None = None,
|
2230
3503
|
organism: str | None = None,
|
@@ -2233,13 +3506,12 @@ def update_registry(
|
|
2233
3506
|
exclude: str | list | None = None,
|
2234
3507
|
**kwargs,
|
2235
3508
|
) -> None:
|
2236
|
-
"""Save features or labels records in the default instance
|
3509
|
+
"""Save features or labels records in the default instance..
|
2237
3510
|
|
2238
3511
|
Args:
|
2239
3512
|
values: A list of values to be saved as labels.
|
2240
3513
|
field: The FieldAttr object representing the field for which labels are being saved.
|
2241
3514
|
key: The name of the feature to save.
|
2242
|
-
using_key: The name of the instance from which to transfer labels (if applicable).
|
2243
3515
|
validated_only: If True, only save validated labels.
|
2244
3516
|
df: A DataFrame to save labels from.
|
2245
3517
|
organism: The organism name.
|
@@ -2290,22 +3562,10 @@ def update_registry(
|
|
2290
3562
|
i for i in values if i not in existing_and_public_labels
|
2291
3563
|
]
|
2292
3564
|
|
2293
|
-
# inspect and save validated records the using_key instance
|
2294
|
-
(
|
2295
|
-
labels_saved[f"from {using_key}"],
|
2296
|
-
non_validated_labels,
|
2297
|
-
) = update_registry_from_using_instance(
|
2298
|
-
non_validated_labels,
|
2299
|
-
field=field,
|
2300
|
-
using_key=using_key,
|
2301
|
-
exclude=exclude,
|
2302
|
-
**filter_kwargs,
|
2303
|
-
)
|
2304
|
-
|
2305
3565
|
# save non-validated/new records
|
2306
3566
|
labels_saved["new"] = non_validated_labels
|
2307
3567
|
if not validated_only:
|
2308
|
-
non_validated_records = []
|
3568
|
+
non_validated_records: RecordList[Any] = [] # type: ignore
|
2309
3569
|
if df is not None and registry == Feature:
|
2310
3570
|
nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
|
2311
3571
|
non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
|
@@ -2379,48 +3639,6 @@ def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
|
|
2379
3639
|
is_feature.children.add(*all_records)
|
2380
3640
|
|
2381
3641
|
|
2382
|
-
def update_registry_from_using_instance(
|
2383
|
-
values: list[str],
|
2384
|
-
field: FieldAttr,
|
2385
|
-
using_key: str | None = None,
|
2386
|
-
exclude: str | list | None = None,
|
2387
|
-
**kwargs,
|
2388
|
-
) -> tuple[list[str], list[str]]:
|
2389
|
-
"""Save features or labels records from the using_key instance.
|
2390
|
-
|
2391
|
-
Args:
|
2392
|
-
values: A list of values to be saved as labels.
|
2393
|
-
field: The FieldAttr object representing the field for which labels are being saved.
|
2394
|
-
using_key: The name of the instance from which to transfer labels (if applicable).
|
2395
|
-
kwargs: Additional keyword arguments to pass to the registry model.
|
2396
|
-
|
2397
|
-
Returns:
|
2398
|
-
A tuple containing the list of saved labels and the list of non-saved labels.
|
2399
|
-
"""
|
2400
|
-
labels_saved = []
|
2401
|
-
not_saved = values
|
2402
|
-
|
2403
|
-
if using_key is not None and using_key != "default":
|
2404
|
-
registry_using = get_registry_instance(field.field.model, using_key)
|
2405
|
-
|
2406
|
-
inspect_result_using = inspect_instance(
|
2407
|
-
values=values,
|
2408
|
-
field=field,
|
2409
|
-
registry=registry_using,
|
2410
|
-
exclude=exclude,
|
2411
|
-
**kwargs,
|
2412
|
-
)
|
2413
|
-
labels_using = registry_using.filter(
|
2414
|
-
**{f"{field.field.name}__in": inspect_result_using.validated}
|
2415
|
-
).all()
|
2416
|
-
for label_using in labels_using:
|
2417
|
-
label_using.save()
|
2418
|
-
labels_saved.append(getattr(label_using, field.field.name))
|
2419
|
-
not_saved = inspect_result_using.non_validated
|
2420
|
-
|
2421
|
-
return labels_saved, not_saved
|
2422
|
-
|
2423
|
-
|
2424
3642
|
def _save_organism(name: str):
|
2425
3643
|
"""Save an organism record."""
|
2426
3644
|
import bionty as bt
|
@@ -2445,4 +3663,121 @@ def _ref_is_name(field: FieldAttr) -> bool | None:
|
|
2445
3663
|
return field.field.name == name_field
|
2446
3664
|
|
2447
3665
|
|
2448
|
-
|
3666
|
+
# backward compat constructors ------------------
|
3667
|
+
|
3668
|
+
|
3669
|
+
@classmethod # type: ignore
|
3670
|
+
def from_df(
|
3671
|
+
cls,
|
3672
|
+
df: pd.DataFrame,
|
3673
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
3674
|
+
columns: FieldAttr = Feature.name,
|
3675
|
+
verbosity: str = "hint",
|
3676
|
+
organism: str | None = None,
|
3677
|
+
) -> DataFrameCatManager:
|
3678
|
+
return DataFrameCatManager(
|
3679
|
+
df=df,
|
3680
|
+
categoricals=categoricals,
|
3681
|
+
columns=columns,
|
3682
|
+
verbosity=verbosity,
|
3683
|
+
organism=organism,
|
3684
|
+
)
|
3685
|
+
|
3686
|
+
|
3687
|
+
@classmethod # type: ignore
|
3688
|
+
def from_anndata(
|
3689
|
+
cls,
|
3690
|
+
data: ad.AnnData | UPathStr,
|
3691
|
+
var_index: FieldAttr,
|
3692
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
3693
|
+
obs_columns: FieldAttr = Feature.name,
|
3694
|
+
verbosity: str = "hint",
|
3695
|
+
organism: str | None = None,
|
3696
|
+
sources: dict[str, Record] | None = None,
|
3697
|
+
) -> AnnDataCatManager:
|
3698
|
+
return AnnDataCatManager(
|
3699
|
+
data=data,
|
3700
|
+
var_index=var_index,
|
3701
|
+
categoricals=categoricals,
|
3702
|
+
obs_columns=obs_columns,
|
3703
|
+
verbosity=verbosity,
|
3704
|
+
organism=organism,
|
3705
|
+
sources=sources,
|
3706
|
+
)
|
3707
|
+
|
3708
|
+
|
3709
|
+
@classmethod # type: ignore
|
3710
|
+
def from_mudata(
|
3711
|
+
cls,
|
3712
|
+
mdata: MuData,
|
3713
|
+
var_index: dict[str, dict[str, FieldAttr]],
|
3714
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
3715
|
+
verbosity: str = "hint",
|
3716
|
+
organism: str | None = None,
|
3717
|
+
) -> MuDataCatManager:
|
3718
|
+
return MuDataCatManager(
|
3719
|
+
mdata=mdata,
|
3720
|
+
var_index=var_index,
|
3721
|
+
categoricals=categoricals,
|
3722
|
+
verbosity=verbosity,
|
3723
|
+
organism=organism,
|
3724
|
+
)
|
3725
|
+
|
3726
|
+
|
3727
|
+
@classmethod # type: ignore
|
3728
|
+
def from_tiledbsoma(
|
3729
|
+
cls,
|
3730
|
+
experiment_uri: UPathStr,
|
3731
|
+
var_index: dict[str, tuple[str, FieldAttr]],
|
3732
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
3733
|
+
obs_columns: FieldAttr = Feature.name,
|
3734
|
+
organism: str | None = None,
|
3735
|
+
sources: dict[str, Record] | None = None,
|
3736
|
+
exclude: dict[str, str | list[str]] | None = None,
|
3737
|
+
) -> TiledbsomaCatManager:
|
3738
|
+
return TiledbsomaCatManager(
|
3739
|
+
experiment_uri=experiment_uri,
|
3740
|
+
var_index=var_index,
|
3741
|
+
categoricals=categoricals,
|
3742
|
+
obs_columns=obs_columns,
|
3743
|
+
organism=organism,
|
3744
|
+
sources=sources,
|
3745
|
+
exclude=exclude,
|
3746
|
+
)
|
3747
|
+
|
3748
|
+
|
3749
|
+
@classmethod # type: ignore
|
3750
|
+
def from_spatialdata(
|
3751
|
+
cls,
|
3752
|
+
sdata,
|
3753
|
+
var_index: dict[str, FieldAttr],
|
3754
|
+
categoricals: dict[str, dict[str, FieldAttr]] | None = None,
|
3755
|
+
organism: str | None = None,
|
3756
|
+
sources: dict[str, dict[str, Record]] | None = None,
|
3757
|
+
exclude: dict[str, dict] | None = None,
|
3758
|
+
verbosity: str = "hint",
|
3759
|
+
*,
|
3760
|
+
sample_metadata_key: str = "sample",
|
3761
|
+
):
|
3762
|
+
try:
|
3763
|
+
import spatialdata
|
3764
|
+
except ImportError as e:
|
3765
|
+
raise ImportError("Please install spatialdata: pip install spatialdata") from e
|
3766
|
+
|
3767
|
+
return SpatialDataCatManager(
|
3768
|
+
sdata=sdata,
|
3769
|
+
var_index=var_index,
|
3770
|
+
categoricals=categoricals,
|
3771
|
+
verbosity=verbosity,
|
3772
|
+
organism=organism,
|
3773
|
+
sources=sources,
|
3774
|
+
exclude=exclude,
|
3775
|
+
sample_metadata_key=sample_metadata_key,
|
3776
|
+
)
|
3777
|
+
|
3778
|
+
|
3779
|
+
CatManager.from_df = from_df # type: ignore
|
3780
|
+
CatManager.from_anndata = from_anndata # type: ignore
|
3781
|
+
CatManager.from_mudata = from_mudata # type: ignore
|
3782
|
+
CatManager.from_spatialdata = from_spatialdata # type: ignore
|
3783
|
+
CatManager.from_tiledbsoma = from_tiledbsoma # type: ignore
|