lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. lamindb/__init__.py +14 -5
  2. lamindb/_artifact.py +174 -57
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +85 -51
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +222 -81
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +59 -17
  10. lamindb/_record.py +171 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +33 -10
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +106 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/validation.py +2 -6
  19. lamindb/core/__init__.py +13 -14
  20. lamindb/core/_context.py +39 -36
  21. lamindb/core/_data.py +29 -25
  22. lamindb/core/_describe.py +1 -1
  23. lamindb/core/_django.py +1 -1
  24. lamindb/core/_feature_manager.py +54 -44
  25. lamindb/core/_label_manager.py +4 -4
  26. lamindb/core/_mapped_collection.py +20 -7
  27. lamindb/core/datasets/__init__.py +6 -1
  28. lamindb/core/datasets/_core.py +12 -11
  29. lamindb/core/datasets/_small.py +66 -20
  30. lamindb/core/exceptions.py +1 -90
  31. lamindb/core/loaders.py +7 -13
  32. lamindb/core/relations.py +6 -4
  33. lamindb/core/storage/_anndata_accessor.py +41 -0
  34. lamindb/core/storage/_backed_access.py +2 -2
  35. lamindb/core/storage/_pyarrow_dataset.py +25 -15
  36. lamindb/core/storage/_tiledbsoma.py +56 -12
  37. lamindb/core/storage/paths.py +41 -22
  38. lamindb/core/subsettings/_creation_settings.py +4 -16
  39. lamindb/curators/__init__.py +2168 -833
  40. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  41. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  42. lamindb/errors.py +96 -0
  43. lamindb/integrations/_vitessce.py +3 -3
  44. lamindb/migrations/0069_squashed.py +76 -75
  45. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  46. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  47. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  48. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  49. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  50. lamindb/migrations/0086_various.py +95 -0
  51. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  52. lamindb/migrations/0088_schema_components.py +273 -0
  53. lamindb/migrations/0088_squashed.py +4372 -0
  54. lamindb/models.py +423 -156
  55. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
  56. lamindb-1.1.0.dist-info/RECORD +95 -0
  57. lamindb/curators/_spatial.py +0 -528
  58. lamindb/migrations/0052_squashed.py +0 -1261
  59. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  60. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  61. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  62. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  63. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  64. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  65. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  66. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  67. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  68. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  69. lamindb/migrations/0063_populate_latest_field.py +0 -45
  70. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  71. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  72. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  73. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  74. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  75. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  76. lamindb-1.0.4.dist-info/RECORD +0 -102
  77. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
  78. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
@@ -1,21 +1,52 @@
1
+ """Curators.
2
+
3
+ .. autosummary::
4
+ :toctree: .
5
+
6
+ Curator
7
+ DataFrameCurator
8
+ AnnDataCurator
9
+
10
+ """
11
+
1
12
  from __future__ import annotations
2
13
 
3
14
  import copy
4
- import warnings
15
+ import random
16
+ import re
17
+ from importlib import resources
5
18
  from itertools import chain
6
- from typing import TYPE_CHECKING
19
+ from typing import TYPE_CHECKING, Any, Literal
7
20
 
8
21
  import anndata as ad
9
22
  import lamindb_setup as ln_setup
10
23
  import pandas as pd
24
+ import pandera
11
25
  import pyarrow as pa
12
26
  from lamin_utils import colors, logger
27
+ from lamindb_setup.core import deprecated, upath
13
28
  from lamindb_setup.core._docs import doc_args
14
29
  from lamindb_setup.core.upath import UPath
15
30
 
31
+ from lamindb.core.storage._backed_access import backed_access
32
+
33
+ from ._cellxgene_schemas import _read_schema_versions
34
+
35
+ if TYPE_CHECKING:
36
+ from anndata import AnnData
37
+ from lamindb_setup.core.types import UPathStr
38
+
39
+ from lamindb.base.types import FieldAttr
40
+ from lamindb.models import Record
41
+ from lamindb._feature import parse_dtype, parse_dtype_single_cat
16
42
  from lamindb.base.types import FieldAttr # noqa
43
+ from lamindb.core._data import add_labels
44
+ from lamindb.core._feature_manager import parse_staged_feature_sets_from_anndata
45
+ from lamindb.core._settings import settings
17
46
  from lamindb.models import (
18
47
  Artifact,
48
+ CanCurate,
49
+ Collection,
19
50
  Feature,
20
51
  Record,
21
52
  Run,
@@ -23,15 +54,25 @@ from lamindb.models import (
23
54
  ULabel,
24
55
  )
25
56
 
57
+ from .._artifact import data_is_anndata
26
58
  from .._from_values import _format_values
27
- from ..core.exceptions import ValidationError
59
+ from ..errors import InvalidArgument, ValidationError
28
60
 
29
61
  if TYPE_CHECKING:
30
- from collections.abc import Iterable
62
+ from collections.abc import Iterable, MutableMapping
31
63
  from typing import Any
32
64
 
33
65
  from lamindb_setup.core.types import UPathStr
34
66
  from mudata import MuData
67
+ from spatialdata import SpatialData
68
+
69
+ from lamindb._query_set import RecordList
70
+
71
+
72
+ def strip_ansi_codes(text):
73
+ # This pattern matches ANSI escape sequences
74
+ ansi_pattern = re.compile(r"\x1b\[[0-9;]*m")
75
+ return ansi_pattern.sub("", text)
35
76
 
36
77
 
37
78
  class CurateLookup:
@@ -40,8 +81,6 @@ class CurateLookup:
40
81
  Args:
41
82
  categoricals: A dictionary of categorical fields to lookup.
42
83
  slots: A dictionary of slot fields to lookup.
43
- using_key: The key of the instance to lookup from. Defaults to the
44
- current instance if not specified.
45
84
  public: Whether to lookup from the public instance. Defaults to False.
46
85
 
47
86
  Example:
@@ -55,48 +94,43 @@ class CurateLookup:
55
94
  self,
56
95
  categoricals: dict[str, FieldAttr],
57
96
  slots: dict[str, FieldAttr] = None,
58
- using_key: str | None = None,
59
97
  public: bool = False,
60
98
  ) -> None:
61
99
  slots = slots or {}
62
- self._fields = {**categoricals, **slots}
63
- self._using_key = None if using_key == "default" else using_key
64
- self._using_key_name = self._using_key or ln_setup.settings.instance.slug
100
+ self._categoricals = {**categoricals, **slots}
65
101
  self._public = public
66
- debug_message = f"Lookup objects from {colors.italic(self._using_key_name)}"
67
- logger.debug(debug_message)
68
102
 
69
103
  def __getattr__(self, name):
70
- if name in self._fields:
71
- registry = self._fields[name].field.model
104
+ if name in self._categoricals:
105
+ registry = self._categoricals[name].field.model
72
106
  if self._public and hasattr(registry, "public"):
73
107
  return registry.public().lookup()
74
108
  else:
75
- return get_registry_instance(registry, self._using_key).lookup()
109
+ return registry.lookup()
76
110
  raise AttributeError(
77
111
  f'"{self.__class__.__name__}" object has no attribute "{name}"'
78
112
  )
79
113
 
80
114
  def __getitem__(self, name):
81
- if name in self._fields:
82
- registry = self._fields[name].field.model
115
+ if name in self._categoricals:
116
+ registry = self._categoricals[name].field.model
83
117
  if self._public and hasattr(registry, "public"):
84
118
  return registry.public().lookup()
85
119
  else:
86
- return get_registry_instance(registry, self._using_key).lookup()
120
+ return registry.lookup()
87
121
  raise AttributeError(
88
122
  f'"{self.__class__.__name__}" object has no attribute "{name}"'
89
123
  )
90
124
 
91
125
  def __repr__(self) -> str:
92
- if len(self._fields) > 0:
126
+ if len(self._categoricals) > 0:
93
127
  getattr_keys = "\n ".join(
94
- [f".{key}" for key in self._fields if key.isidentifier()]
128
+ [f".{key}" for key in self._categoricals if key.isidentifier()]
95
129
  )
96
130
  getitem_keys = "\n ".join(
97
- [str([key]) for key in self._fields if not key.isidentifier()]
131
+ [str([key]) for key in self._categoricals if not key.isidentifier()]
98
132
  )
99
- ref = "public" if self._public else self._using_key_name
133
+ ref = "public" if self._public else "registries"
100
134
  return (
101
135
  f"Lookup objects from the {colors.italic(ref)}:\n "
102
136
  f"{colors.green(getattr_keys)}\n "
@@ -105,21 +139,422 @@ class CurateLookup:
105
139
  " → categories.alveolar_type_1_fibroblast_cell\n\n"
106
140
  "To look up public ontologies, use .lookup(public=True)"
107
141
  )
108
- else: # pragma: no cover
142
+ else: # pragma: no cover
109
143
  return colors.warning("No fields are found!")
110
144
 
111
145
 
112
- class BaseCurator:
113
- """Curate a dataset."""
146
+ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
147
+
148
+
149
+ VALIDATE_DOCSTRING = """Validate dataset.
150
+
151
+ Raises:
152
+ lamindb.errors.ValidationError: If validation fails.
153
+ """
154
+
155
+ SAVE_ARTIFACT_DOCSTRING = """Save an annotated artifact.
156
+
157
+ Args:
158
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
159
+ description: A description.
160
+ revises: Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
161
+ run: The run that creates the artifact.
162
+
163
+ Returns:
164
+ A saved artifact record.
165
+ """
166
+
167
+
168
+ class Curator:
169
+ """Dataset curator.
170
+
171
+ A `Curator` object makes it easy to validate, standardize & annotate datasets.
172
+
173
+ See:
174
+ - :class:`~lamindb.curators.DataFrameCurator`
175
+ - :class:`~lamindb.curators.AnnDataCurator`
176
+ """
177
+
178
+ def __init__(self, dataset: Any, schema: Schema | None = None):
179
+ self._artifact: Artifact = None # pass the dataset as an artifact
180
+ self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
181
+ if isinstance(self._dataset, Artifact):
182
+ self._artifact = self._dataset
183
+ if self._artifact.otype in {"DataFrame", "AnnData"}:
184
+ self._dataset = self._dataset.load()
185
+ self._schema: Schema | None = schema
186
+ self._is_validated: bool = False
187
+ self._cat_manager: CatManager = None # is None for CatManager curators
188
+
189
+ @doc_args(VALIDATE_DOCSTRING)
190
+ def validate(self) -> bool | str:
191
+ """{}""" # noqa: D415
192
+ pass # pragma: no cover
193
+
194
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
195
+ def save_artifact(
196
+ self,
197
+ *,
198
+ key: str | None = None,
199
+ description: str | None = None,
200
+ revises: Artifact | None = None,
201
+ run: Run | None = None,
202
+ ) -> Artifact:
203
+ """{}""" # noqa: D415
204
+ # Note that this docstring has to be consistent with the Artifact()
205
+ # constructor signature
206
+ pass
207
+
208
+
209
+ class DataFrameCurator(Curator):
210
+ # the example in the docstring is tested in test_curators_quickstart_example
211
+ """Curator for a DataFrame object.
212
+
213
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
214
+
215
+ Args:
216
+ dataset: The DataFrame-like object to validate & annotate.
217
+ schema: A `Schema` object that defines the validation constraints.
218
+
219
+ Example::
220
+
221
+ import lamindb as ln
222
+ import bionty as bt
223
+
224
+ # define valid labels
225
+ cell_medium = ln.ULabel(name="CellMedium", is_type=True).save()
226
+ ln.ULabel(name="DMSO", type=cell_medium).save()
227
+ ln.ULabel(name="IFNG", type=cell_medium).save()
228
+ bt.CellType.from_source(name="B cell").save()
229
+ bt.CellType.from_source(name="T cell").save()
230
+
231
+ # define schema
232
+ schema = ln.Schema(
233
+ name="small_dataset1_obs_level_metadata",
234
+ features=[
235
+ ln.Feature(name="cell_medium", dtype="cat[ULabel[CellMedium]]").save(),
236
+ ln.Feature(name="sample_note", dtype=str).save(),
237
+ ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
238
+ ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
239
+ ],
240
+ ).save()
241
+
242
+ # curate a DataFrame
243
+ df = datasets.small_dataset1(otype="DataFrame")
244
+ curator = ln.curators.DataFrameCurator(df, schema)
245
+ artifact = curator.save_artifact(key="example_datasets/dataset1.parquet")
246
+ assert artifact.schema == schema
247
+ """
248
+
249
+ def __init__(
250
+ self,
251
+ dataset: pd.DataFrame | Artifact,
252
+ schema: Schema,
253
+ ) -> None:
254
+ super().__init__(dataset=dataset, schema=schema)
255
+ if schema.n > 0:
256
+ # populate features
257
+ pandera_columns = {}
258
+ categoricals = {}
259
+ for feature in schema.features.all():
260
+ pandera_dtype = (
261
+ feature.dtype if not feature.dtype.startswith("cat") else "category"
262
+ )
263
+ pandera_columns[feature.name] = pandera.Column(
264
+ pandera_dtype, nullable=feature.nullable
265
+ )
266
+ if feature.dtype.startswith("cat"):
267
+ categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
268
+ self._pandera_schema = pandera.DataFrameSchema(
269
+ pandera_columns, coerce=schema.coerce_dtype
270
+ )
271
+ # now deal with detailed validation of categoricals
272
+ self._cat_manager = DataFrameCatManager(
273
+ self._dataset,
274
+ categoricals=categoricals,
275
+ )
276
+ else:
277
+ assert schema.itype is not None # noqa: S101
278
+
279
+ @property
280
+ @doc_args(CAT_MANAGER_DOCSTRING)
281
+ def cat(self) -> CatManager:
282
+ """{}""" # noqa: D415
283
+ return self._cat_manager
284
+
285
+ def standardize(self) -> None:
286
+ """Standardize the dataset.
287
+
288
+ - Adds missing columns if a default value for a feature is defined.
289
+ - Fills missing values with the default value if a default value for a feature is defined.
290
+ """
291
+ for feature in self._schema.members:
292
+ if feature.name not in self._dataset.columns:
293
+ if feature.default_value is not None:
294
+ self._dataset[feature.name] = feature.default_value
295
+ else:
296
+ raise ValidationError(
297
+ f"Missing column {feature.name} cannot be added because no default value is defined for this feature"
298
+ )
299
+ else:
300
+ if feature.default_value is not None:
301
+ if isinstance(
302
+ self._dataset[feature.name].dtype, pd.CategoricalDtype
303
+ ):
304
+ if (
305
+ feature.default_value
306
+ not in self._dataset[feature.name].cat.categories
307
+ ):
308
+ self._dataset[feature.name] = self._dataset[
309
+ feature.name
310
+ ].cat.add_categories(feature.default_value)
311
+ self._dataset[feature.name] = self._dataset[feature.name].fillna(
312
+ feature.default_value
313
+ )
314
+
315
+ @doc_args(VALIDATE_DOCSTRING)
316
+ def validate(self) -> None:
317
+ """{}""" # noqa: D415
318
+ if self._schema.n > 0:
319
+ self._cat_manager.validate()
320
+ try:
321
+ self._pandera_schema.validate(self._dataset)
322
+ if self._cat_manager._is_validated:
323
+ self._is_validated = True
324
+ else:
325
+ self._is_validated = False
326
+ raise ValidationError(
327
+ self._cat_manager._validate_category_error_messages
328
+ )
329
+ except pandera.errors.SchemaError as err:
330
+ self._is_validated = False
331
+ # .exconly() doesn't exist on SchemaError
332
+ raise ValidationError(str(err)) from err
333
+ else:
334
+ result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
335
+ registry: CanCurate = result["registry"]
336
+ inspector = registry.inspect(
337
+ self._dataset.columns,
338
+ result["field"],
339
+ mute=True,
340
+ )
341
+ if len(inspector.non_validated) > 0:
342
+ # also check public ontology
343
+ if hasattr(registry, "public"):
344
+ registry.from_values(
345
+ inspector.non_validated, result["field"], mute=True
346
+ ).save()
347
+ inspector = registry.inspect(
348
+ inspector.non_validated, result["field"], mute=True
349
+ )
350
+ if len(inspector.non_validated) > 0:
351
+ self._is_validated = False
352
+ raise ValidationError(
353
+ f"Invalid identifiers for {self._schema.itype}: {inspector.non_validated}"
354
+ )
355
+
356
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
357
+ def save_artifact(
358
+ self,
359
+ *,
360
+ key: str | None = None,
361
+ description: str | None = None,
362
+ revises: Artifact | None = None,
363
+ run: Run | None = None,
364
+ ):
365
+ """{}""" # noqa: D415
366
+ if not self._is_validated:
367
+ self.validate() # raises ValidationError if doesn't validate
368
+ result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
369
+ return save_artifact( # type: ignore
370
+ self._dataset,
371
+ description=description,
372
+ fields=self._cat_manager.categoricals,
373
+ columns_field=result["field"],
374
+ key=key,
375
+ artifact=self._artifact,
376
+ revises=revises,
377
+ run=run,
378
+ schema=self._schema,
379
+ )
380
+
381
+
382
+ class AnnDataCurator(Curator):
383
+ # the example in the docstring is tested in test_curators_quickstart_example
384
+ """Curator for an AnnData object.
385
+
386
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
387
+
388
+ Args:
389
+ dataset: The AnnData-like object to validate & annotate.
390
+ schema: A `Schema` object that defines the validation constraints.
391
+
392
+ Example::
393
+
394
+ import lamindb as ln
395
+ import bionty as bt
396
+
397
+ # define valid labels
398
+ cell_medium = ln.ULabel(name="CellMedium", is_type=True).save()
399
+ ln.ULabel(name="DMSO", type=cell_medium).save()
400
+ ln.ULabel(name="IFNG", type=cell_medium).save()
401
+ bt.CellType.from_source(name="B cell").save()
402
+ bt.CellType.from_source(name="T cell").save()
403
+
404
+ # define obs schema
405
+ obs_schema = ln.Schema(
406
+ name="small_dataset1_obs_level_metadata",
407
+ features=[
408
+ ln.Feature(name="cell_medium", dtype="cat[ULabel[CellMedium]]").save(),
409
+ ln.Feature(name="sample_note", dtype=str).save(),
410
+ ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
411
+ ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
412
+ ],
413
+ ).save()
414
+
415
+ # define var schema
416
+ var_schema = ln.Schema(
417
+ name="scRNA_seq_var_schema",
418
+ itype=bt.Gene.ensembl_gene_id,
419
+ dtype="num",
420
+ ).save()
421
+
422
+ # define composite schema
423
+ anndata_schema = ln.Schema(
424
+ name="small_dataset1_anndata_schema",
425
+ otype="AnnData",
426
+ components={"obs": obs_schema, "var": var_schema},
427
+ ).save()
428
+
429
+ # curate an AnnData
430
+ adata = datasets.small_dataset1(otype="AnnData")
431
+ curator = ln.curators.AnnDataCurator(adata, anndata_schema)
432
+ artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
433
+ assert artifact.schema == anndata_schema
434
+ """
435
+
436
+ def __init__(
437
+ self,
438
+ dataset: AnnData | Artifact,
439
+ schema: Schema,
440
+ ) -> None:
441
+ super().__init__(dataset=dataset, schema=schema)
442
+ if not data_is_anndata(self._dataset):
443
+ raise InvalidArgument("dataset must be AnnData-like.")
444
+ if schema.otype != "AnnData":
445
+ raise InvalidArgument("Schema otype must be 'AnnData'.")
446
+ self._obs_curator = DataFrameCurator(
447
+ self._dataset.obs, schema._get_component("obs")
448
+ )
449
+ self._var_curator = DataFrameCurator(
450
+ self._dataset.var.T, schema._get_component("var")
451
+ )
452
+
453
+ @doc_args(VALIDATE_DOCSTRING)
454
+ def validate(self) -> None:
455
+ """{}""" # noqa: D415
456
+ self._obs_curator.validate()
457
+ self._var_curator.validate()
458
+ self._is_validated = True
459
+
460
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
461
+ def save_artifact(self, *, key=None, description=None, revises=None, run=None):
462
+ """{}""" # noqa: D415
463
+ if not self._is_validated:
464
+ self.validate() # raises ValidationError if doesn't validate
465
+ result = parse_dtype_single_cat(self._var_curator._schema.itype, is_itype=True)
466
+ return save_artifact( # type: ignore
467
+ self._dataset,
468
+ description=description,
469
+ fields=self._obs_curator._cat_manager.categoricals,
470
+ columns_field=result["field"],
471
+ key=key,
472
+ artifact=self._artifact,
473
+ revises=revises,
474
+ run=run,
475
+ schema=self._schema,
476
+ )
477
+
478
+
479
+ class CatManager:
480
+ """Manage valid categoricals by updating registries.
481
+
482
+ A `CatManager` object makes it easy to validate, standardize & annotate datasets.
483
+
484
+ Example:
485
+
486
+ >>> cat_manager = ln.CatManager(
487
+ >>> dataset,
488
+ >>> # define validation criteria as mappings
489
+ >>> columns=Feature.name, # map column names
490
+ >>> categoricals={"perturbation": ULabel.name}, # map categories
491
+ >>> )
492
+ >>> cat_manager.validate() # validate the dataframe
493
+ >>> artifact = cat_manager.save_artifact(description="my RNA-seq")
494
+ >>> artifact.describe() # see annotations
495
+
496
+ `cat_manager.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
497
+
498
+ If you find non-validated values, you have several options:
499
+
500
+ - new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCatManager.add_new_from`
501
+ - non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCatManager.non_validated` and addressed manually
502
+ """
503
+
504
+ def __init__(
505
+ self, *, dataset, categoricals, sources, organism, exclude, columns_field=None
506
+ ):
507
+ # the below is shared with Curator
508
+ self._artifact: Artifact = None # pass the dataset as an artifact
509
+ self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
510
+ if isinstance(self._dataset, Artifact):
511
+ self._artifact = self._dataset
512
+ if self._artifact.otype in {"DataFrame", "AnnData"}:
513
+ self._dataset = self._dataset.load()
514
+ self._is_validated: bool = False
515
+ # shared until here
516
+ self._categoricals = categoricals or {}
517
+ self._non_validated = None
518
+ self._organism = organism
519
+ self._sources = sources or {}
520
+ self._exclude = exclude or {}
521
+ self._columns_field = columns_field
522
+ self._validate_category_error_messages: str = ""
523
+
524
+ @property
525
+ def non_validated(self) -> dict[str, list[str]]:
526
+ """Return the non-validated features and labels."""
527
+ if self._non_validated is None:
528
+ raise ValidationError("Please run validate() first!")
529
+ return self._non_validated
114
530
 
115
- def __init_subclass__(cls, **kwargs):
116
- super().__init_subclass__(**kwargs)
117
- import sys
531
+ @property
532
+ def categoricals(self) -> dict:
533
+ """Return the columns fields to validate against."""
534
+ return self._categoricals
118
535
 
119
- # Deprecated methods
120
- if "sphinx" not in sys.modules:
121
- if hasattr(cls, "_add_new_from_columns"):
122
- cls.add_new_from_columns = cls._add_new_from_columns
536
+ def _replace_synonyms(
537
+ self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
538
+ ):
539
+ # replace the values in df
540
+ std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
541
+ # remove the standardized values from self.non_validated
542
+ non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
543
+ if len(non_validated) == 0:
544
+ self._non_validated.pop(key, None) # type: ignore
545
+ else:
546
+ self._non_validated[key] = non_validated # type: ignore
547
+ # logging
548
+ n = len(syn_mapper)
549
+ if n > 0:
550
+ syn_mapper_print = _format_values(
551
+ [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
552
+ )
553
+ s = "s" if n > 1 else ""
554
+ logger.success(
555
+ f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
556
+ )
557
+ return std_values
123
558
 
124
559
  def validate(self) -> bool:
125
560
  """Validate dataset.
@@ -127,9 +562,9 @@ class BaseCurator:
127
562
  This method also registers the validated records in the current instance.
128
563
 
129
564
  Returns:
130
- Boolean indicating whether the dataset is validated.
565
+ The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
131
566
  """
132
- pass # pragma: no cover
567
+ pass
133
568
 
134
569
  def standardize(self, key: str) -> None:
135
570
  """Replace synonyms with standardized values.
@@ -142,30 +577,48 @@ class BaseCurator:
142
577
  Returns:
143
578
  None
144
579
  """
145
- pass # pragma: no cover
580
+ pass # pragma: no cover
146
581
 
582
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
147
583
  def save_artifact(
148
584
  self,
149
- description: str | None = None,
585
+ *,
150
586
  key: str | None = None,
587
+ description: str | None = None,
151
588
  revises: Artifact | None = None,
152
589
  run: Run | None = None,
153
590
  ) -> Artifact:
154
- """Save the dataset as artifact.
591
+ """{}""" # noqa: D415
592
+ from lamindb.core._settings import settings
155
593
 
156
- Args:
157
- description: A description of the DataFrame object.
158
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
159
- revises: Previous version of the artifact. Triggers a revision.
160
- run: The run that creates the artifact.
594
+ if not self._is_validated:
595
+ self.validate() # returns True or False
596
+ if not self._is_validated: # need to raise error manually
597
+ raise ValidationError("Dataset does not validate. Please curate.")
161
598
 
162
- Returns:
163
- A saved artifact record.
164
- """
165
- pass # pragma: no cover
599
+ # Make sure all labels are saved in the current instance
600
+ verbosity = settings.verbosity
601
+ try:
602
+ settings.verbosity = "warning"
603
+ self._artifact = save_artifact( # type: ignore
604
+ self._dataset,
605
+ description=description,
606
+ fields=self.categoricals,
607
+ columns_field=self._columns_field,
608
+ key=key,
609
+ artifact=self._artifact,
610
+ revises=revises,
611
+ run=run,
612
+ schema=None,
613
+ organism=self._organism,
614
+ )
615
+ finally:
616
+ settings.verbosity = verbosity
617
+
618
+ return self._artifact
166
619
 
167
620
 
168
- class DataFrameCurator(BaseCurator):
621
+ class DataFrameCatManager(CatManager):
169
622
  """Curation flow for a DataFrame object.
170
623
 
171
624
  See also :class:`~lamindb.Curator`.
@@ -174,7 +627,6 @@ class DataFrameCurator(BaseCurator):
174
627
  df: The DataFrame object to curate.
175
628
  columns: The field attribute for the feature column.
176
629
  categoricals: A dictionary mapping column names to registry_field.
177
- using_key: The reference instance containing registries to validate against.
178
630
  verbosity: The verbosity level.
179
631
  organism: The organism name.
180
632
  sources: A dictionary mapping column names to Source records.
@@ -191,165 +643,103 @@ class DataFrameCurator(BaseCurator):
191
643
  ... df,
192
644
  ... categoricals={
193
645
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
194
- ... "donor_id": ln.ULabel.name
646
+ ... "donor_id": ULabel.name
195
647
  ... }
196
648
  ... )
197
649
  """
198
650
 
199
651
  def __init__(
200
652
  self,
201
- df: pd.DataFrame,
653
+ df: pd.DataFrame | Artifact,
202
654
  columns: FieldAttr = Feature.name,
203
655
  categoricals: dict[str, FieldAttr] | None = None,
204
- using_key: str | None = None,
205
656
  verbosity: str = "hint",
206
657
  organism: str | None = None,
207
658
  sources: dict[str, Record] | None = None,
208
659
  exclude: dict | None = None,
209
- check_valid_keys: bool = True,
210
660
  ) -> None:
211
661
  from lamindb.core._settings import settings
212
662
 
213
663
  if organism is not None and not isinstance(organism, str):
214
664
  raise ValueError("organism must be a string such as 'human' or 'mouse'!")
215
665
 
216
- self._df = df
217
- self._fields = categoricals or {}
218
- self._columns_field = columns
219
- self._using_key = using_key
220
- # TODO: change verbosity back
221
666
  settings.verbosity = verbosity
222
- self._artifact = None
223
- self._collection = None
224
- self._validated = False
225
- self._kwargs = {"organism": organism} if organism else {}
226
- self._sources = sources or {}
227
- self._exclude = exclude or {}
228
667
  self._non_validated = None
229
- if check_valid_keys:
230
- self._check_valid_keys()
668
+ super().__init__(
669
+ dataset=df,
670
+ columns_field=columns,
671
+ organism=organism,
672
+ categoricals=categoricals,
673
+ sources=sources,
674
+ exclude=exclude,
675
+ )
231
676
  self._save_columns()
232
677
 
233
- @property
234
- def non_validated(self) -> dict[str, list[str]]:
235
- """Return the non-validated features and labels."""
236
- if self._non_validated is None:
237
- raise ValidationError("Please run validate() first!")
238
- return self._non_validated
239
-
240
- @property
241
- def fields(self) -> dict:
242
- """Return the columns fields to validate against."""
243
- return self._fields
244
-
245
- def lookup(
246
- self, using_key: str | None = None, public: bool = False
247
- ) -> CurateLookup:
678
+ def lookup(self, public: bool = False) -> CurateLookup:
248
679
  """Lookup categories.
249
680
 
250
681
  Args:
251
- using_key: The instance where the lookup is performed.
252
- if "public", the lookup is performed on the public reference.
682
+ public: If "public", the lookup is performed on the public reference.
253
683
  """
254
684
  return CurateLookup(
255
- categoricals=self._fields,
685
+ categoricals=self._categoricals,
256
686
  slots={"columns": self._columns_field},
257
- using_key=using_key or self._using_key,
258
687
  public=public,
259
688
  )
260
689
 
261
- def _check_valid_keys(self, extra: set | None = None) -> None:
262
- extra = extra or set()
263
- for name, d in {
264
- "categoricals": self._fields,
265
- "sources": self._sources,
266
- "exclude": self._exclude,
267
- }.items():
268
- if not isinstance(d, dict):
269
- raise TypeError(f"{name} must be a dictionary!")
270
- valid_keys = set(self._df.columns) | {"columns"} | extra
271
- nonval_keys = [key for key in d.keys() if key not in valid_keys]
272
- n = len(nonval_keys)
273
- s = "s" if n > 1 else ""
274
- are = "are" if n > 1 else "is"
275
- if len(nonval_keys) > 0:
276
- raise ValidationError(
277
- f"key{s} passed to {name} {are} not present in columns: {colors.yellow(_format_values(nonval_keys))}"
278
- )
279
-
280
690
  def _save_columns(self, validated_only: bool = True) -> None:
281
691
  """Save column name records."""
282
692
  # Always save features specified as the fields keys
283
693
  update_registry(
284
- values=list(self.fields.keys()),
694
+ values=list(self.categoricals.keys()),
285
695
  field=self._columns_field,
286
696
  key="columns",
287
- using_key=self._using_key,
288
697
  validated_only=False,
289
698
  source=self._sources.get("columns"),
290
699
  exclude=self._exclude.get("columns"),
291
- **self._kwargs, # type: ignore
292
700
  )
293
701
 
294
702
  # Save the rest of the columns based on validated_only
295
- additional_columns = set(self._df.columns) - set(self.fields.keys())
703
+ additional_columns = set(self._dataset.columns) - set(self.categoricals.keys())
296
704
  if additional_columns:
297
705
  update_registry(
298
706
  values=list(additional_columns),
299
707
  field=self._columns_field,
300
708
  key="columns",
301
- using_key=self._using_key,
302
709
  validated_only=validated_only,
303
- df=self._df, # Get the Feature type from df
710
+ df=self._dataset, # Get the Feature type from df
304
711
  source=self._sources.get("columns"),
305
712
  exclude=self._exclude.get("columns"),
306
- **self._kwargs, # type: ignore
307
713
  )
308
714
 
309
- def add_new_from(self, key: str, organism: str | None = None, **kwargs):
310
- """Add validated & new categories.
715
+ @deprecated(new_name="is run by default")
716
+ def add_new_from_columns(self, organism: str | None = None, **kwargs):
717
+ pass
718
+
719
+ def validate(self) -> bool:
720
+ """Validate variables and categorical observations.
721
+
722
+ This method also registers the validated records in the current instance:
723
+ - from public sources
311
724
 
312
725
  Args:
313
- key: The key referencing the slot in the DataFrame from which to draw terms.
314
726
  organism: The organism name.
315
- **kwargs: Additional keyword arguments to pass to create new records
727
+
728
+ Returns:
729
+ Whether the DataFrame is validated.
316
730
  """
317
- if len(kwargs) > 0 and key == "all":
318
- raise ValueError("Cannot pass additional arguments to 'all' key!")
319
- self._kwargs.update({"organism": organism} if organism else {})
320
- self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
321
-
322
- def _add_new_from_columns(self, organism: str | None = None, **kwargs):
323
- """Deprecated to run by default during init."""
324
- warnings.warn(
325
- "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
326
- DeprecationWarning,
327
- stacklevel=2,
731
+ # add all validated records to the current instance
732
+ self._update_registry_all()
733
+ self._validate_category_error_messages = "" # reset the error messages
734
+ self._is_validated, self._non_validated = validate_categories_in_df( # type: ignore
735
+ self._dataset,
736
+ fields=self.categoricals,
737
+ sources=self._sources,
738
+ exclude=self._exclude,
739
+ curator=self,
740
+ organism=self._organism,
328
741
  )
329
- pass
330
-
331
- def _replace_synonyms(
332
- self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
333
- ):
334
- # replace the values in df
335
- std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
336
- # remove the standardized values from self.non_validated
337
- non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
338
- if len(non_validated) == 0:
339
- self._non_validated.pop(key, None) # type: ignore
340
- else:
341
- self._non_validated[key] = non_validated # type: ignore
342
- # logging
343
- n = len(syn_mapper)
344
- if n > 0:
345
- syn_mapper_print = _format_values(
346
- [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
347
- )
348
- s = "s" if n > 1 else ""
349
- logger.success(
350
- f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
351
- )
352
- return std_values
742
+ return self._is_validated
353
743
 
354
744
  def standardize(self, key: str) -> None:
355
745
  """Replace synonyms with standardized values.
@@ -359,6 +749,8 @@ class DataFrameCurator(BaseCurator):
359
749
  Args:
360
750
  key: The key referencing the column in the DataFrame to standardize.
361
751
  """
752
+ if self._artifact is not None:
753
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
362
754
  # list is needed to avoid RuntimeError: dictionary changed size during iteration
363
755
  avail_keys = list(self.non_validated.keys())
364
756
  if len(avail_keys) == 0:
@@ -367,137 +759,74 @@ class DataFrameCurator(BaseCurator):
367
759
 
368
760
  if key == "all":
369
761
  for k in avail_keys:
370
- if k in self._fields: # needed to exclude var_index
762
+ if k in self._categoricals: # needed to exclude var_index
371
763
  syn_mapper = standardize_categories(
372
764
  self.non_validated[k],
373
- field=self._fields[k],
374
- using_key=self._using_key,
765
+ field=self._categoricals[k],
375
766
  source=self._sources.get(k),
376
- **self._kwargs,
377
767
  )
378
- self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k])
768
+ self._dataset[k] = self._replace_synonyms(
769
+ k, syn_mapper, self._dataset[k]
770
+ )
379
771
  else:
380
772
  if key not in avail_keys:
381
- if key in self._fields:
773
+ if key in self._categoricals:
382
774
  logger.info(f"No unstandardized values found for {key!r}")
383
775
  else:
384
776
  raise KeyError(
385
777
  f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
386
778
  )
387
779
  else:
388
- if key in self._fields: # needed to exclude var_index
780
+ if key in self._categoricals: # needed to exclude var_index
389
781
  syn_mapper = standardize_categories(
390
782
  self.non_validated[key],
391
- field=self._fields[key],
392
- using_key=self._using_key,
783
+ field=self._categoricals[key],
393
784
  source=self._sources.get(key),
394
- **self._kwargs,
785
+ organism=self._organism,
395
786
  )
396
- self._df[key] = self._replace_synonyms(
397
- key, syn_mapper, self._df[key]
787
+ self._dataset[key] = self._replace_synonyms(
788
+ key, syn_mapper, self._dataset[key]
398
789
  )
399
790
 
791
+ def _update_registry_all(self, validated_only: bool = True, **kwargs):
792
+ """Save labels for all features."""
793
+ for name in self.categoricals.keys():
794
+ self._update_registry(name, validated_only=validated_only, **kwargs)
795
+
400
796
  def _update_registry(
401
797
  self, categorical: str, validated_only: bool = True, **kwargs
402
798
  ) -> None:
403
799
  if categorical == "all":
404
800
  self._update_registry_all(validated_only=validated_only, **kwargs)
405
801
  else:
406
- if categorical not in self.fields:
802
+ if categorical not in self.categoricals:
407
803
  raise ValidationError(
408
804
  f"Feature {categorical} is not part of the fields!"
409
805
  )
410
806
  update_registry(
411
- values=_flatten_unique(self._df[categorical]),
412
- field=self.fields[categorical],
807
+ values=_flatten_unique(self._dataset[categorical]),
808
+ field=self.categoricals[categorical],
413
809
  key=categorical,
414
- using_key=self._using_key,
415
810
  validated_only=validated_only,
416
811
  source=self._sources.get(categorical),
417
812
  exclude=self._exclude.get(categorical),
418
- **kwargs,
813
+ organism=self._organism,
419
814
  )
420
815
  # adding new records removes them from non_validated
421
816
  if not validated_only and self._non_validated:
422
817
  self._non_validated.pop(categorical, None) # type: ignore
423
818
 
424
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
425
- """Save labels for all features."""
426
- for name in self.fields.keys():
427
- self._update_registry(name, validated_only=validated_only, **kwargs)
428
-
429
- def validate(self, organism: str | None = None) -> bool:
430
- """Validate variables and categorical observations.
431
-
432
- This method also registers the validated records in the current instance:
433
- - from public sources
434
- - from the using_key instance
819
+ def add_new_from(self, key: str, **kwargs):
820
+ """Add validated & new categories.
435
821
 
436
822
  Args:
823
+ key: The key referencing the slot in the DataFrame from which to draw terms.
437
824
  organism: The organism name.
438
-
439
- Returns:
440
- Whether the DataFrame is validated.
441
- """
442
- self._kwargs.update({"organism": organism} if organism else {})
443
-
444
- # add all validated records to the current instance
445
- self._update_registry_all()
446
-
447
- self._validated, self._non_validated = validate_categories_in_df( # type: ignore
448
- self._df,
449
- fields=self.fields,
450
- using_key=self._using_key,
451
- sources=self._sources,
452
- exclude=self._exclude,
453
- **self._kwargs,
454
- )
455
- return self._validated
456
-
457
- def save_artifact(
458
- self,
459
- description: str | None = None,
460
- key: str | None = None,
461
- revises: Artifact | None = None,
462
- run: Run | None = None,
463
- ) -> Artifact:
464
- """Save the validated DataFrame and metadata.
465
-
466
- Args:
467
- description: Description of the DataFrame object.
468
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
469
- Artifacts with the same key form a revision family.
470
- revises: Previous version of the artifact. Triggers a revision.
471
- run: The run that creates the artifact.
472
-
473
- Returns:
474
- A saved artifact record.
825
+ **kwargs: Additional keyword arguments to pass to create new records
475
826
  """
476
- from lamindb.core._settings import settings
477
-
478
- if not self._validated:
479
- self.validate()
480
- if not self._validated:
481
- raise ValidationError("Dataset does not validate. Please curate.")
482
-
483
- # Make sure all labels are saved in the current instance
484
- verbosity = settings.verbosity
485
- try:
486
- settings.verbosity = "warning"
487
- self._artifact = save_artifact(
488
- self._df,
489
- description=description,
490
- fields=self.fields,
491
- columns_field=self._columns_field,
492
- key=key,
493
- revises=revises,
494
- run=run,
495
- **self._kwargs,
496
- )
497
- finally:
498
- settings.verbosity = verbosity
499
-
500
- return self._artifact
827
+ if len(kwargs) > 0 and key == "all":
828
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
829
+ self._update_registry(key, validated_only=False, **kwargs)
501
830
 
502
831
  def clean_up_failed_runs(self):
503
832
  """Clean up previous failed runs that don't save any outputs."""
@@ -509,21 +838,14 @@ class DataFrameCurator(BaseCurator):
509
838
  ).delete()
510
839
 
511
840
 
512
- class AnnDataCurator(DataFrameCurator):
513
- """Curation flow for ``AnnData``.
514
-
515
- See also :class:`~lamindb.Curator`.
516
-
517
- Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.
518
-
519
- See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.
841
+ class AnnDataCatManager(CatManager):
842
+ """Manage categorical curation.
520
843
 
521
844
  Args:
522
845
  data: The AnnData object or an AnnData-like path.
523
846
  var_index: The registry field for mapping the ``.var`` index.
524
847
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
525
848
  obs_columns: The registry field for mapping the ``.obs.columns``.
526
- using_key: A reference LaminDB instance.
527
849
  verbosity: The verbosity level.
528
850
  organism: The organism name.
529
851
  sources: A dictionary mapping ``.obs.columns`` to Source records.
@@ -538,7 +860,7 @@ class AnnDataCurator(DataFrameCurator):
538
860
  ... var_index=bt.Gene.ensembl_gene_id,
539
861
  ... categoricals={
540
862
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
541
- ... "donor_id": ln.ULabel.name
863
+ ... "donor_id": ULabel.name
542
864
  ... },
543
865
  ... organism="human",
544
866
  ... )
@@ -546,56 +868,48 @@ class AnnDataCurator(DataFrameCurator):
546
868
 
547
869
  def __init__(
548
870
  self,
549
- data: ad.AnnData | UPathStr,
871
+ data: ad.AnnData | Artifact,
550
872
  var_index: FieldAttr,
551
873
  categoricals: dict[str, FieldAttr] | None = None,
552
874
  obs_columns: FieldAttr = Feature.name,
553
- using_key: str | None = None,
554
875
  verbosity: str = "hint",
555
876
  organism: str | None = None,
556
877
  sources: dict[str, Record] | None = None,
557
878
  exclude: dict | None = None,
558
879
  ) -> None:
559
- from lamindb_setup.core import upath
560
-
561
880
  if isinstance(var_index, str):
562
881
  raise TypeError("var_index parameter has to be a bionty field")
563
882
 
564
- from .._artifact import data_is_anndata
565
-
566
883
  if sources is None:
567
884
  sources = {}
568
885
  if not data_is_anndata(data):
569
- raise TypeError(
570
- "data has to be an AnnData object or a path to AnnData-like"
571
- )
572
- if isinstance(data, ad.AnnData):
573
- self._adata = data
574
- else: # pragma: no cover
575
- from lamindb.core.storage._backed_access import backed_access
576
-
577
- self._adata = backed_access(upath.create_path(data))
886
+ raise TypeError("data has to be an AnnData object")
578
887
 
579
888
  if "symbol" in str(var_index):
580
889
  logger.warning(
581
890
  "indexing datasets with gene symbols can be problematic: https://docs.lamin.ai/faq/symbol-mapping"
582
891
  )
583
892
 
584
- self._data = data
893
+ self._obs_fields = categoricals or {}
585
894
  self._var_field = var_index
586
895
  super().__init__(
587
- df=self._adata.obs,
896
+ dataset=data,
588
897
  categoricals=categoricals,
898
+ sources=sources,
899
+ organism=organism,
900
+ exclude=exclude,
901
+ columns_field=var_index,
902
+ )
903
+ self._adata = self._dataset
904
+ self._obs_df_curator = DataFrameCatManager(
905
+ df=self._adata.obs,
906
+ categoricals=self.categoricals,
589
907
  columns=obs_columns,
590
- using_key=using_key,
591
908
  verbosity=verbosity,
592
- organism=organism,
909
+ organism=None,
593
910
  sources=sources,
594
911
  exclude=exclude,
595
- check_valid_keys=False,
596
912
  )
597
- self._obs_fields = categoricals or {}
598
- self._check_valid_keys(extra={"var_index"})
599
913
 
600
914
  @property
601
915
  def var_index(self) -> FieldAttr:
@@ -607,54 +921,53 @@ class AnnDataCurator(DataFrameCurator):
607
921
  """Return the obs fields to validate against."""
608
922
  return self._obs_fields
609
923
 
610
- def lookup(
611
- self, using_key: str | None = None, public: bool = False
612
- ) -> CurateLookup:
924
+ def lookup(self, public: bool = False) -> CurateLookup:
613
925
  """Lookup categories.
614
926
 
615
927
  Args:
616
- using_key: The instance where the lookup is performed.
617
- if "public", the lookup is performed on the public reference.
928
+ public: If "public", the lookup is performed on the public reference.
618
929
  """
619
930
  return CurateLookup(
620
931
  categoricals=self._obs_fields,
621
932
  slots={"columns": self._columns_field, "var_index": self._var_field},
622
- using_key=using_key or self._using_key,
623
933
  public=public,
624
934
  )
625
935
 
626
936
  def _save_from_var_index(
627
- self, validated_only: bool = True, organism: str | None = None
937
+ self,
938
+ validated_only: bool = True,
628
939
  ):
629
940
  """Save variable records."""
630
941
  update_registry(
631
942
  values=list(self._adata.var.index),
632
943
  field=self.var_index,
633
944
  key="var_index",
634
- using_key=self._using_key,
635
945
  validated_only=validated_only,
636
- organism=organism,
946
+ organism=self._organism,
637
947
  source=self._sources.get("var_index"),
638
948
  exclude=self._exclude.get("var_index"),
639
949
  )
640
950
 
641
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
642
- """Save labels for all features."""
643
- self._save_from_var_index(validated_only=validated_only, **self._kwargs)
644
- for name in self._obs_fields.keys():
645
- self._update_registry(name, validated_only=validated_only, **self._kwargs)
951
+ def add_new_from(self, key: str, **kwargs):
952
+ """Add validated & new categories.
646
953
 
647
- def add_new_from_var_index(self, organism: str | None = None, **kwargs):
954
+ Args:
955
+ key: The key referencing the slot in the DataFrame from which to draw terms.
956
+ organism: The organism name.
957
+ **kwargs: Additional keyword arguments to pass to create new records
958
+ """
959
+ self._obs_df_curator.add_new_from(key, **kwargs)
960
+
961
+ def add_new_from_var_index(self, **kwargs):
648
962
  """Update variable records.
649
963
 
650
964
  Args:
651
965
  organism: The organism name.
652
966
  **kwargs: Additional keyword arguments to pass to create new records.
653
967
  """
654
- self._kwargs.update({"organism": organism} if organism else {})
655
- self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
968
+ self._save_from_var_index(validated_only=False, **kwargs)
656
969
 
657
- def validate(self, organism: str | None = None) -> bool:
970
+ def validate(self) -> bool:
658
971
  """Validate categories.
659
972
 
660
973
  This method also registers the validated records in the current instance.
@@ -665,38 +978,25 @@ class AnnDataCurator(DataFrameCurator):
665
978
  Returns:
666
979
  Whether the AnnData object is validated.
667
980
  """
668
- self._kwargs.update({"organism": organism} if organism else {})
669
- if self._using_key is not None and self._using_key != "default":
670
- logger.important(
671
- f"validating metadata using registries of instance {colors.italic(self._using_key)}"
672
- )
981
+ self._validate_category_error_messages = "" # reset the error messages
673
982
 
674
983
  # add all validated records to the current instance
675
- self._update_registry_all()
676
-
984
+ self._save_from_var_index(validated_only=True)
677
985
  validated_var, non_validated_var = validate_categories(
678
986
  self._adata.var.index,
679
987
  field=self._var_field,
680
988
  key="var_index",
681
- using_key=self._using_key,
682
989
  source=self._sources.get("var_index"),
683
990
  hint_print=".add_new_from_var_index()",
684
991
  exclude=self._exclude.get("var_index"),
685
- **self._kwargs, # type: ignore
686
- )
687
- validated_obs, non_validated_obs = validate_categories_in_df(
688
- self._adata.obs,
689
- fields=self.categoricals,
690
- using_key=self._using_key,
691
- sources=self._sources,
692
- exclude=self._exclude,
693
- **self._kwargs,
992
+ organism=self._organism, # type: ignore
694
993
  )
695
- self._non_validated = non_validated_obs # type: ignore
994
+ validated_obs = self._obs_df_curator.validate()
995
+ self._non_validated = self._obs_df_curator._non_validated # type: ignore
696
996
  if len(non_validated_var) > 0:
697
997
  self._non_validated["var_index"] = non_validated_var # type: ignore
698
- self._validated = validated_var and validated_obs
699
- return self._validated
998
+ self._is_validated = validated_var and validated_obs
999
+ return self._is_validated
700
1000
 
701
1001
  def standardize(self, key: str):
702
1002
  """Replace synonyms with standardized values.
@@ -709,68 +1009,26 @@ class AnnDataCurator(DataFrameCurator):
709
1009
 
710
1010
  Inplace modification of the dataset.
711
1011
  """
1012
+ if self._artifact is not None:
1013
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
712
1014
  if key in self._adata.obs.columns or key == "all":
713
1015
  # standardize obs columns
714
- super().standardize(key)
1016
+ self._obs_df_curator.standardize(key)
715
1017
  # in addition to the obs columns, standardize the var.index
716
1018
  if key == "var_index" or key == "all":
717
1019
  syn_mapper = standardize_categories(
718
1020
  self._adata.var.index,
719
1021
  field=self.var_index,
720
- using_key=self._using_key,
721
1022
  source=self._sources.get("var_index"),
722
- **self._kwargs,
1023
+ organism=self._organism,
723
1024
  )
724
1025
  if "var_index" in self._non_validated: # type: ignore
725
1026
  self._adata.var.index = self._replace_synonyms(
726
1027
  "var_index", syn_mapper, self._adata.var.index
727
1028
  )
728
1029
 
729
- def save_artifact(
730
- self,
731
- description: str | None = None,
732
- key: str | None = None,
733
- revises: Artifact | None = None,
734
- run: Run | None = None,
735
- ) -> Artifact:
736
- """Save the validated ``AnnData`` and metadata.
737
-
738
- Args:
739
- description: A description of the ``AnnData`` object.
740
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
741
- Artifacts with the same key form a revision family.
742
- revises: Previous version of the artifact. Triggers a revision.
743
- run: The run that creates the artifact.
744
-
745
- Returns:
746
- A saved artifact record.
747
- """
748
- from lamindb.core._settings import settings
749
-
750
- if not self._validated:
751
- self.validate()
752
- if not self._validated:
753
- raise ValidationError("Dataset does not validate. Please curate.")
754
- verbosity = settings.verbosity
755
- try:
756
- settings.verbosity = "warning"
757
- self._artifact = save_artifact(
758
- self._data,
759
- adata=self._adata,
760
- description=description,
761
- columns_field=self.var_index,
762
- fields=self.categoricals,
763
- key=key,
764
- revises=revises,
765
- run=run,
766
- **self._kwargs,
767
- )
768
- finally:
769
- settings.verbosity = verbosity
770
- return self._artifact
771
-
772
1030
 
773
- class MuDataCurator:
1031
+ class MuDataCatManager(CatManager):
774
1032
  """Curation flow for a ``MuData`` object.
775
1033
 
776
1034
  See also :class:`~lamindb.Curator`.
@@ -782,10 +1040,9 @@ class MuDataCurator:
782
1040
  mdata: The MuData object to curate.
783
1041
  var_index: The registry field for mapping the ``.var`` index for each modality.
784
1042
  For example:
785
- ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
1043
+ ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
786
1044
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
787
1045
  Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
788
- using_key: A reference LaminDB instance.
789
1046
  verbosity: The verbosity level.
790
1047
  organism: The organism name.
791
1048
  sources: A dictionary mapping ``.obs.columns`` to Source records.
@@ -799,11 +1056,11 @@ class MuDataCurator:
799
1056
  ... mdata,
800
1057
  ... var_index={
801
1058
  ... "rna": bt.Gene.ensembl_gene_id,
802
- ... "adt": ln.CellMarker.name
1059
+ ... "adt": CellMarker.name
803
1060
  ... },
804
1061
  ... categoricals={
805
1062
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
806
- ... "donor_id": ln.ULabel.name
1063
+ ... "donor_id": ULabel.name
807
1064
  ... },
808
1065
  ... organism="human",
809
1066
  ... )
@@ -811,52 +1068,47 @@ class MuDataCurator:
811
1068
 
812
1069
  def __init__(
813
1070
  self,
814
- mdata: MuData,
1071
+ mdata: MuData | Artifact,
815
1072
  var_index: dict[str, FieldAttr],
816
1073
  categoricals: dict[str, FieldAttr] | None = None,
817
- using_key: str | None = None,
818
1074
  verbosity: str = "hint",
819
1075
  organism: str | None = None,
820
1076
  sources: dict[str, Record] | None = None,
821
1077
  exclude: dict | None = None, # {modality: {field: [values]}}
822
1078
  ) -> None:
823
- if sources is None:
824
- sources = {}
825
- self._sources = sources
826
- if exclude is None:
827
- exclude = {}
828
- self._exclude = exclude
829
- self._mdata = mdata
830
- self._kwargs = {"organism": organism} if organism else {}
1079
+ super().__init__(
1080
+ dataset=mdata,
1081
+ categoricals={},
1082
+ sources=sources,
1083
+ organism=organism,
1084
+ exclude=exclude,
1085
+ )
1086
+ self._columns_field = var_index # this is for consistency with BaseCatManager
831
1087
  self._var_fields = var_index
832
1088
  self._verify_modality(self._var_fields.keys())
833
1089
  self._obs_fields = self._parse_categoricals(categoricals)
834
1090
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
835
- self._using_key = using_key
836
1091
  self._verbosity = verbosity
837
1092
  self._obs_df_curator = None
838
1093
  if "obs" in self._modalities:
839
- self._obs_df_curator = DataFrameCurator(
840
- df=mdata.obs,
1094
+ self._obs_df_curator = DataFrameCatManager(
1095
+ df=self._dataset.obs,
841
1096
  columns=Feature.name,
842
1097
  categoricals=self._obs_fields.get("obs", {}),
843
- using_key=using_key,
844
1098
  verbosity=verbosity,
845
1099
  sources=self._sources.get("obs"),
846
1100
  exclude=self._exclude.get("obs"),
847
- check_valid_keys=False,
848
- **self._kwargs,
1101
+ organism=organism,
849
1102
  )
850
1103
  self._mod_adata_curators = {
851
- modality: AnnDataCurator(
852
- data=mdata[modality],
1104
+ modality: AnnDataCatManager(
1105
+ data=self._dataset[modality],
853
1106
  var_index=var_index.get(modality),
854
1107
  categoricals=self._obs_fields.get(modality),
855
- using_key=using_key,
856
1108
  verbosity=verbosity,
857
1109
  sources=self._sources.get(modality),
858
1110
  exclude=self._exclude.get(modality),
859
- **self._kwargs,
1111
+ organism=organism,
860
1112
  )
861
1113
  for modality in self._modalities
862
1114
  if modality != "obs"
@@ -874,7 +1126,7 @@ class MuDataCurator:
874
1126
  return self._obs_fields
875
1127
 
876
1128
  @property
877
- def non_validated(self) -> dict[str, dict[str, list[str]]]:
1129
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
878
1130
  """Return the non-validated features and labels."""
879
1131
  if self._non_validated is None:
880
1132
  raise ValidationError("Please run validate() first!")
@@ -883,15 +1135,15 @@ class MuDataCurator:
883
1135
  def _verify_modality(self, modalities: Iterable[str]):
884
1136
  """Verify the modality exists."""
885
1137
  for modality in modalities:
886
- if modality not in self._mdata.mod.keys():
1138
+ if modality not in self._dataset.mod.keys():
887
1139
  raise ValidationError(f"modality '{modality}' does not exist!")
888
1140
 
889
1141
  def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
890
1142
  """Parse the categorical fields."""
891
- prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
1143
+ prefixes = {f"{k}:" for k in self._dataset.mod.keys()}
892
1144
  obs_fields: dict[str, dict[str, FieldAttr]] = {}
893
1145
  for k, v in categoricals.items():
894
- if k not in self._mdata.obs.columns:
1146
+ if k not in self._dataset.obs.columns:
895
1147
  raise ValidationError(f"column '{k}' does not exist in mdata.obs!")
896
1148
  if any(k.startswith(prefix) for prefix in prefixes):
897
1149
  modality, col = k.split(":")[0], k.split(":")[1]
@@ -904,14 +1156,11 @@ class MuDataCurator:
904
1156
  obs_fields["obs"][k] = v
905
1157
  return obs_fields
906
1158
 
907
- def lookup(
908
- self, using_key: str | None = None, public: bool = False
909
- ) -> CurateLookup:
1159
+ def lookup(self, public: bool = False) -> CurateLookup:
910
1160
  """Lookup categories.
911
1161
 
912
1162
  Args:
913
- using_key: The instance where the lookup is performed.
914
- if "public", the lookup is performed on the public reference.
1163
+ public: Perform lookup on public source ontologies.
915
1164
  """
916
1165
  obs_fields = {}
917
1166
  for mod, fields in self._obs_fields.items():
@@ -925,27 +1174,19 @@ class MuDataCurator:
925
1174
  slots={
926
1175
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
927
1176
  },
928
- using_key=using_key or self._using_key,
929
1177
  public=public,
930
1178
  )
931
1179
 
1180
+ @deprecated(new_name="is run by default")
932
1181
  def add_new_from_columns(
933
1182
  self,
934
1183
  modality: str,
935
1184
  column_names: list[str] | None = None,
936
- organism: str | None = None,
937
1185
  **kwargs,
938
1186
  ):
939
- """Update columns records."""
940
- warnings.warn(
941
- "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
942
- DeprecationWarning,
943
- stacklevel=2,
944
- )
1187
+ pass
945
1188
 
946
- def add_new_from_var_index(
947
- self, modality: str, organism: str | None = None, **kwargs
948
- ):
1189
+ def add_new_from_var_index(self, modality: str, **kwargs):
949
1190
  """Update variable records.
950
1191
 
951
1192
  Args:
@@ -953,25 +1194,19 @@ class MuDataCurator:
953
1194
  organism: The organism name.
954
1195
  **kwargs: Additional keyword arguments to pass to create new records.
955
1196
  """
956
- self._kwargs.update({"organism": organism} if organism else {})
957
- self._mod_adata_curators[modality].add_new_from_var_index(
958
- **self._kwargs, **kwargs
959
- )
1197
+ self._mod_adata_curators[modality].add_new_from_var_index(**kwargs)
960
1198
 
961
1199
  def _update_registry_all(self):
962
1200
  """Update all registries."""
963
1201
  if self._obs_df_curator is not None:
964
- self._obs_df_curator._update_registry_all(
965
- validated_only=True, **self._kwargs
966
- )
1202
+ self._obs_df_curator._update_registry_all(validated_only=True)
967
1203
  for _, adata_curator in self._mod_adata_curators.items():
968
- adata_curator._update_registry_all(validated_only=True, **self._kwargs)
1204
+ adata_curator._obs_df_curator._update_registry_all(validated_only=True)
969
1205
 
970
1206
  def add_new_from(
971
1207
  self,
972
1208
  key: str,
973
1209
  modality: str | None = None,
974
- organism: str | None = None,
975
1210
  **kwargs,
976
1211
  ):
977
1212
  """Add validated & new categories.
@@ -984,24 +1219,17 @@ class MuDataCurator:
984
1219
  """
985
1220
  if len(kwargs) > 0 and key == "all":
986
1221
  raise ValueError("Cannot pass additional arguments to 'all' key!")
987
- self._kwargs.update({"organism": organism} if organism else {})
988
1222
  modality = modality or "obs"
989
1223
  if modality in self._mod_adata_curators:
990
1224
  adata_curator = self._mod_adata_curators[modality]
991
- adata_curator.add_new_from(key=key, **self._kwargs, **kwargs)
1225
+ adata_curator.add_new_from(key=key, **kwargs)
992
1226
  if modality == "obs":
993
- self._obs_df_curator.add_new_from(key=key, **self._kwargs, **kwargs)
1227
+ self._obs_df_curator.add_new_from(key=key, **kwargs)
994
1228
 
995
- def validate(self, organism: str | None = None) -> bool:
1229
+ def validate(self) -> bool:
996
1230
  """Validate categories."""
997
1231
  from lamindb.core._settings import settings
998
1232
 
999
- self._kwargs.update({"organism": organism} if organism else {})
1000
- if self._using_key is not None and self._using_key != "default":
1001
- logger.important(
1002
- f"validating using registries of instance {colors.italic(self._using_key)}"
1003
- )
1004
-
1005
1233
  # add all validated records to the current instance
1006
1234
  verbosity = settings.verbosity
1007
1235
  try:
@@ -1015,20 +1243,20 @@ class MuDataCurator:
1015
1243
  obs_validated = True
1016
1244
  if "obs" in self._modalities:
1017
1245
  logger.info('validating categoricals in "obs"...')
1018
- obs_validated &= self._obs_df_curator.validate(**self._kwargs)
1246
+ obs_validated &= self._obs_df_curator.validate()
1019
1247
  self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore
1020
1248
  logger.print("")
1021
1249
 
1022
1250
  mods_validated = True
1023
1251
  for modality, adata_curator in self._mod_adata_curators.items():
1024
1252
  logger.info(f'validating categoricals in modality "{modality}"...')
1025
- mods_validated &= adata_curator.validate(**self._kwargs)
1253
+ mods_validated &= adata_curator.validate()
1026
1254
  if len(adata_curator.non_validated) > 0:
1027
1255
  self._non_validated[modality] = adata_curator.non_validated # type: ignore
1028
1256
  logger.print("")
1029
1257
 
1030
- self._validated = obs_validated & mods_validated
1031
- return self._validated
1258
+ self._is_validated = obs_validated & mods_validated
1259
+ return self._is_validated
1032
1260
 
1033
1261
  def standardize(self, key: str, modality: str | None = None):
1034
1262
  """Replace synonyms with standardized values.
@@ -1039,6 +1267,8 @@ class MuDataCurator:
1039
1267
 
1040
1268
  Inplace modification of the dataset.
1041
1269
  """
1270
+ if self._artifact is not None:
1271
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1042
1272
  modality = modality or "obs"
1043
1273
  if modality in self._mod_adata_curators:
1044
1274
  adata_curator = self._mod_adata_curators[modality]
@@ -1046,47 +1276,6 @@ class MuDataCurator:
1046
1276
  if modality == "obs":
1047
1277
  self._obs_df_curator.standardize(key=key)
1048
1278
 
1049
- def save_artifact(
1050
- self,
1051
- description: str | None = None,
1052
- key: str | None = None,
1053
- revises: Artifact | None = None,
1054
- run: Run | None = None,
1055
- ) -> Artifact:
1056
- """Save the validated ``MuData`` and metadata.
1057
-
1058
- Args:
1059
- description: A description of the ``MuData`` object.
1060
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
1061
- revises: Previous version of the artifact. Triggers a revision.
1062
- run: The run that creates the artifact.
1063
-
1064
- Returns:
1065
- A saved artifact record.
1066
- """
1067
- from lamindb.core._settings import settings
1068
-
1069
- if not self._validated:
1070
- self.validate()
1071
- if not self._validated:
1072
- raise ValidationError("Dataset does not validate. Please curate.")
1073
- verbosity = settings.verbosity
1074
- try:
1075
- settings.verbosity = "warning"
1076
- self._artifact = save_artifact(
1077
- self._mdata,
1078
- description=description,
1079
- columns_field=self.var_index,
1080
- fields=self.categoricals,
1081
- key=key,
1082
- revises=revises,
1083
- run=run,
1084
- **self._kwargs,
1085
- )
1086
- finally:
1087
- settings.verbosity = verbosity
1088
- return self._artifact
1089
-
1090
1279
 
1091
1280
  def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1092
1281
  if (n := len(nonval_keys)) > 0:
@@ -1097,8 +1286,8 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1097
1286
  )
1098
1287
 
1099
1288
 
1100
- class SOMACurator(BaseCurator):
1101
- """Curation flow for ``tiledbsoma``.
1289
+ class TiledbsomaCatManager(CatManager):
1290
+ """Curation flow for `tiledbsoma.Experiment`.
1102
1291
 
1103
1292
  See also :class:`~lamindb.Curator`.
1104
1293
 
@@ -1123,7 +1312,7 @@ class SOMACurator(BaseCurator):
1123
1312
  ... var_index={"RNA": ("var_id", bt.Gene.symbol)},
1124
1313
  ... categoricals={
1125
1314
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
1126
- ... "donor_id": ln.ULabel.name
1315
+ ... "donor_id": ULabel.name
1127
1316
  ... },
1128
1317
  ... organism="human",
1129
1318
  ... )
@@ -1138,23 +1327,21 @@ class SOMACurator(BaseCurator):
1138
1327
  organism: str | None = None,
1139
1328
  sources: dict[str, Record] | None = None,
1140
1329
  exclude: dict[str, str | list[str]] | None = None,
1141
- using_key: str | None = None,
1142
1330
  ):
1143
1331
  self._obs_fields = categoricals or {}
1144
1332
  self._var_fields = var_index
1145
1333
  self._columns_field = obs_columns
1146
1334
  if isinstance(experiment_uri, Artifact):
1147
- self._experiment_uri = experiment_uri.path
1335
+ self._dataset = experiment_uri.path
1148
1336
  self._artifact = experiment_uri
1149
1337
  else:
1150
- self._experiment_uri = UPath(experiment_uri)
1338
+ self._dataset = UPath(experiment_uri)
1151
1339
  self._artifact = None
1152
1340
  self._organism = organism
1153
- self._using_key = using_key
1154
1341
  self._sources = sources or {}
1155
1342
  self._exclude = exclude or {}
1156
1343
 
1157
- self._validated: bool | None = False
1344
+ self._is_validated: bool | None = False
1158
1345
  self._non_validated_values: dict[str, list] | None = None
1159
1346
  self._validated_values: dict[str, list] = {}
1160
1347
  # filled by _check_save_keys
@@ -1172,7 +1359,7 @@ class SOMACurator(BaseCurator):
1172
1359
  def _check_save_keys(self):
1173
1360
  from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1174
1361
 
1175
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1362
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1176
1363
  experiment_obs = experiment.obs
1177
1364
  self._n_obs = len(experiment_obs)
1178
1365
  self._obs_pa_schema = experiment_obs.schema
@@ -1228,7 +1415,6 @@ class SOMACurator(BaseCurator):
1228
1415
  values=register_columns,
1229
1416
  field=self._columns_field,
1230
1417
  key="columns",
1231
- using_key=self._using_key,
1232
1418
  validated_only=False,
1233
1419
  organism=organism,
1234
1420
  source=self._sources.get("columns"),
@@ -1244,7 +1430,6 @@ class SOMACurator(BaseCurator):
1244
1430
  values=additional_columns,
1245
1431
  field=self._columns_field,
1246
1432
  key="columns",
1247
- using_key=self._using_key,
1248
1433
  validated_only=True,
1249
1434
  organism=organism,
1250
1435
  source=self._sources.get("columns"),
@@ -1257,7 +1442,7 @@ class SOMACurator(BaseCurator):
1257
1442
 
1258
1443
  validated = True
1259
1444
  self._non_validated_values = {}
1260
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1445
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1261
1446
  for ms, (key, field) in self._var_fields.items():
1262
1447
  var_ms = experiment.ms[ms].var
1263
1448
  var_ms_key = f"{ms}__{key}"
@@ -1274,7 +1459,6 @@ class SOMACurator(BaseCurator):
1274
1459
  values=var_ms_values,
1275
1460
  field=field,
1276
1461
  key=var_ms_key,
1277
- using_key=self._using_key,
1278
1462
  validated_only=True,
1279
1463
  organism=organism,
1280
1464
  source=self._sources.get(var_ms_key),
@@ -1284,7 +1468,6 @@ class SOMACurator(BaseCurator):
1284
1468
  values=var_ms_values,
1285
1469
  field=field,
1286
1470
  key=var_ms_key,
1287
- using_key=self._using_key,
1288
1471
  organism=organism,
1289
1472
  source=self._sources.get(var_ms_key),
1290
1473
  exclude=self._exclude.get(var_ms_key),
@@ -1310,7 +1493,6 @@ class SOMACurator(BaseCurator):
1310
1493
  values=values,
1311
1494
  field=field,
1312
1495
  key=key,
1313
- using_key=self._using_key,
1314
1496
  validated_only=True,
1315
1497
  organism=organism,
1316
1498
  source=self._sources.get(key),
@@ -1320,7 +1502,6 @@ class SOMACurator(BaseCurator):
1320
1502
  values=values,
1321
1503
  field=field,
1322
1504
  key=key,
1323
- using_key=self._using_key,
1324
1505
  organism=organism,
1325
1506
  source=self._sources.get(key),
1326
1507
  exclude=self._exclude.get(key),
@@ -1330,8 +1511,8 @@ class SOMACurator(BaseCurator):
1330
1511
  self._non_validated_values[key] = non_val
1331
1512
  else:
1332
1513
  self._validated_values[key] = values
1333
- self._validated = validated
1334
- return self._validated
1514
+ self._is_validated = validated
1515
+ return self._is_validated
1335
1516
 
1336
1517
  def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
1337
1518
  assert self._non_validated_values is not None # noqa: S101
@@ -1346,7 +1527,7 @@ class SOMACurator(BaseCurator):
1346
1527
  values = self._non_validated_values.get(key, [])
1347
1528
  return values, field
1348
1529
 
1349
- def add_new_from(self, key: str) -> None:
1530
+ def add_new_from(self, key: str, **kwargs) -> None:
1350
1531
  """Add validated & new categories.
1351
1532
 
1352
1533
  Args:
@@ -1378,11 +1559,11 @@ class SOMACurator(BaseCurator):
1378
1559
  values=values,
1379
1560
  field=field,
1380
1561
  key=k,
1381
- using_key=self._using_key,
1382
1562
  validated_only=False,
1383
1563
  organism=organism,
1384
1564
  source=self._sources.get(k),
1385
1565
  exclude=self._exclude.get(k),
1566
+ **kwargs,
1386
1567
  )
1387
1568
  # update non-validated values list but keep the key there
1388
1569
  # it will be removed by .validate()
@@ -1405,19 +1586,15 @@ class SOMACurator(BaseCurator):
1405
1586
  """Return the obs fields to validate against."""
1406
1587
  return self._obs_fields
1407
1588
 
1408
- def lookup(
1409
- self, using_key: str | None = None, public: bool = False
1410
- ) -> CurateLookup:
1589
+ def lookup(self, public: bool = False) -> CurateLookup:
1411
1590
  """Lookup categories.
1412
1591
 
1413
1592
  Args:
1414
- using_key: The instance where the lookup is performed.
1415
- if "public", the lookup is performed on the public reference.
1593
+ public: If "public", the lookup is performed on the public reference.
1416
1594
  """
1417
1595
  return CurateLookup(
1418
1596
  categoricals=self._obs_fields,
1419
1597
  slots={"columns": self._columns_field, **self._var_fields_flat},
1420
- using_key=using_key or self._using_key,
1421
1598
  public=public,
1422
1599
  )
1423
1600
 
@@ -1462,7 +1639,6 @@ class SOMACurator(BaseCurator):
1462
1639
  syn_mapper = standardize_categories(
1463
1640
  values=values,
1464
1641
  field=field,
1465
- using_key=self._using_key,
1466
1642
  source=self._sources.get(k),
1467
1643
  organism=organism,
1468
1644
  )
@@ -1471,7 +1647,7 @@ class SOMACurator(BaseCurator):
1471
1647
 
1472
1648
  from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1473
1649
 
1474
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1650
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1475
1651
  value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
1476
1652
  table = slot(experiment).read(value_filter=value_filter).concat()
1477
1653
 
@@ -1484,7 +1660,7 @@ class SOMACurator(BaseCurator):
1484
1660
  lambda val: syn_mapper.get(val, val) # noqa
1485
1661
  )
1486
1662
  # write the mapped values
1487
- with _open_tiledbsoma(self._experiment_uri, mode="w") as experiment:
1663
+ with _open_tiledbsoma(self._dataset, mode="w") as experiment:
1488
1664
  slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
1489
1665
  # update non_validated dict
1490
1666
  non_val_k = [
@@ -1502,8 +1678,9 @@ class SOMACurator(BaseCurator):
1502
1678
 
1503
1679
  def save_artifact(
1504
1680
  self,
1505
- description: str | None = None,
1681
+ *,
1506
1682
  key: str | None = None,
1683
+ description: str | None = None,
1507
1684
  revises: Artifact | None = None,
1508
1685
  run: Run | None = None,
1509
1686
  ) -> Artifact:
@@ -1512,7 +1689,7 @@ class SOMACurator(BaseCurator):
1512
1689
  Args:
1513
1690
  description: A description of the ``tiledbsoma`` store.
1514
1691
  key: A path-like key to reference artifact in default storage,
1515
- e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a revision family.
1692
+ e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
1516
1693
  revises: Previous version of the artifact. Triggers a revision.
1517
1694
  run: The run that creates the artifact.
1518
1695
 
@@ -1521,14 +1698,14 @@ class SOMACurator(BaseCurator):
1521
1698
  """
1522
1699
  from lamindb.core._data import add_labels
1523
1700
 
1524
- if not self._validated:
1701
+ if not self._is_validated:
1525
1702
  self.validate()
1526
- if not self._validated:
1703
+ if not self._is_validated:
1527
1704
  raise ValidationError("Dataset does not validate. Please curate.")
1528
1705
 
1529
1706
  if self._artifact is None:
1530
1707
  artifact = Artifact(
1531
- self._experiment_uri,
1708
+ self._dataset,
1532
1709
  description=description,
1533
1710
  key=key,
1534
1711
  revises=revises,
@@ -1540,7 +1717,7 @@ class SOMACurator(BaseCurator):
1540
1717
  else:
1541
1718
  artifact = self._artifact
1542
1719
 
1543
- _schemas_m2m = {}
1720
+ feature_sets = {}
1544
1721
  if len(self._obs_fields) > 0:
1545
1722
  organism = check_registry_organism(
1546
1723
  self._columns_field.field.model, self._organism
@@ -1550,7 +1727,7 @@ class SOMACurator(BaseCurator):
1550
1727
  empty_dict, schema=self._obs_pa_schema
1551
1728
  ).to_pandas()
1552
1729
  # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
1553
- _schemas_m2m["obs"] = Schema.from_df(
1730
+ feature_sets["obs"] = Schema.from_df(
1554
1731
  df=mock_df,
1555
1732
  field=self._columns_field,
1556
1733
  mute=True,
@@ -1561,238 +1738,1370 @@ class SOMACurator(BaseCurator):
1561
1738
  organism = check_registry_organism(
1562
1739
  var_field.field.model, self._organism
1563
1740
  ).get("organism")
1564
- _schemas_m2m[f"{ms}__var"] = Schema.from_values(
1741
+ feature_sets[f"{ms}__var"] = Schema.from_values(
1565
1742
  values=self._validated_values[f"{ms}__{var_key}"],
1566
1743
  field=var_field,
1567
1744
  organism=organism,
1568
1745
  raise_validation_error=False,
1569
1746
  )
1570
- artifact._staged__schemas_m2m = _schemas_m2m
1747
+ artifact._staged_feature_sets = feature_sets
1748
+
1749
+ feature_ref_is_name = _ref_is_name(self._columns_field)
1750
+ features = Feature.lookup().dict()
1751
+ for key, field in self._obs_fields.items():
1752
+ feature = features.get(key)
1753
+ registry = field.field.model
1754
+ organism = check_registry_organism(field.field.model, self._organism).get(
1755
+ "organism"
1756
+ )
1757
+ labels = registry.from_values(
1758
+ values=self._validated_values[key], field=field, organism=organism
1759
+ )
1760
+ if len(labels) == 0:
1761
+ continue
1762
+ if hasattr(registry, "_name_field"):
1763
+ label_ref_is_name = field.field.name == registry._name_field
1764
+ add_labels(
1765
+ artifact,
1766
+ records=labels,
1767
+ feature=feature,
1768
+ feature_ref_is_name=feature_ref_is_name,
1769
+ label_ref_is_name=label_ref_is_name,
1770
+ from_curator=True,
1771
+ )
1772
+
1773
+ return artifact.save()
1774
+
1775
+
1776
+ class SpatialDataCatManager(CatManager):
1777
+ """Curation flow for a ``Spatialdata`` object.
1778
+
1779
+ See also :class:`~lamindb.Curator`.
1780
+
1781
+ Note that if genes or other measurements are removed from the SpatialData object,
1782
+ the object should be recreated.
1783
+
1784
+ In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
1785
+
1786
+ Args:
1787
+ sdata: The SpatialData object to curate.
1788
+ var_index: A dictionary mapping table keys to the ``.var`` indices.
1789
+ categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1790
+
1791
+ organism: The organism name.
1792
+ sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1793
+ exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1794
+ When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1795
+ using the exclude parameter ensures they are not validated.
1796
+ verbosity: The verbosity level of the logger.
1797
+ sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1798
+
1799
+ Examples:
1800
+ >>> import bionty as bt
1801
+ >>> curator = SpatialDataCatManager(
1802
+ ... sdata,
1803
+ ... var_index={
1804
+ ... "table_1": bt.Gene.ensembl_gene_id,
1805
+ ... },
1806
+ ... categoricals={
1807
+ ... "table1":
1808
+ ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
1809
+ ... "sample":
1810
+ ... {"experimental_factor": bt.ExperimentalFactor.name},
1811
+ ... },
1812
+ ... organism="human",
1813
+ ... )
1814
+ """
1815
+
1816
+ def __init__(
1817
+ self,
1818
+ sdata: Any,
1819
+ var_index: dict[str, FieldAttr],
1820
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1821
+ verbosity: str = "hint",
1822
+ organism: str | None = None,
1823
+ sources: dict[str, dict[str, Record]] | None = None,
1824
+ exclude: dict[str, dict] | None = None,
1825
+ *,
1826
+ sample_metadata_key: str | None = "sample",
1827
+ ) -> None:
1828
+ super().__init__(
1829
+ dataset=sdata,
1830
+ categoricals={},
1831
+ sources=sources,
1832
+ organism=organism,
1833
+ exclude=exclude,
1834
+ )
1835
+ if isinstance(sdata, Artifact):
1836
+ # TODO: load() doesn't yet work
1837
+ self._sdata = sdata.load()
1838
+ else:
1839
+ self._sdata = self._dataset
1840
+ self._sample_metadata_key = sample_metadata_key
1841
+ self._var_fields = var_index
1842
+ self._verify_accessor_exists(self._var_fields.keys())
1843
+ self._categoricals = categoricals
1844
+ self._table_keys = set(self._var_fields.keys()) | set(
1845
+ self._categoricals.keys() - {self._sample_metadata_key}
1846
+ )
1847
+ self._verbosity = verbosity
1848
+ self._sample_df_curator = None
1849
+ if self._sample_metadata_key is not None:
1850
+ self._sample_metadata = self._sdata.get_attrs(
1851
+ key=self._sample_metadata_key, return_as="df", flatten=True
1852
+ )
1853
+ self._is_validated = False
1854
+
1855
+ # Check validity of keys in categoricals
1856
+ nonval_keys = []
1857
+ for accessor, accessor_categoricals in self._categoricals.items():
1858
+ if (
1859
+ accessor == self._sample_metadata_key
1860
+ and self._sample_metadata is not None
1861
+ ):
1862
+ for key in accessor_categoricals.keys():
1863
+ if key not in self._sample_metadata.columns:
1864
+ nonval_keys.append(key)
1865
+ else:
1866
+ for key in accessor_categoricals.keys():
1867
+ if key not in self._sdata[accessor].obs.columns:
1868
+ nonval_keys.append(key)
1869
+
1870
+ _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1871
+
1872
+ # check validity of keys in sources and exclude
1873
+ for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1874
+ nonval_keys = []
1875
+ for accessor, accessor_sources in dct.items():
1876
+ if (
1877
+ accessor == self._sample_metadata_key
1878
+ and self._sample_metadata is not None
1879
+ ):
1880
+ columns = self._sample_metadata.columns
1881
+ elif accessor != self._sample_metadata_key:
1882
+ columns = self._sdata[accessor].obs.columns
1883
+ else:
1884
+ continue
1885
+ for key in accessor_sources:
1886
+ if key not in columns:
1887
+ nonval_keys.append(key)
1888
+ _maybe_curation_keys_not_present(nonval_keys, name)
1889
+
1890
+ # Set up sample level metadata and table Curator objects
1891
+ if (
1892
+ self._sample_metadata_key is not None
1893
+ and self._sample_metadata_key in self._categoricals
1894
+ ):
1895
+ self._sample_df_curator = DataFrameCatManager(
1896
+ df=self._sample_metadata,
1897
+ columns=Feature.name,
1898
+ categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1899
+ verbosity=verbosity,
1900
+ sources=self._sources.get(self._sample_metadata_key),
1901
+ exclude=self._exclude.get(self._sample_metadata_key),
1902
+ organism=organism,
1903
+ )
1904
+ self._table_adata_curators = {
1905
+ table: AnnDataCatManager(
1906
+ data=self._sdata[table],
1907
+ var_index=var_index.get(table),
1908
+ categoricals=self._categoricals.get(table),
1909
+ verbosity=verbosity,
1910
+ sources=self._sources.get(table),
1911
+ exclude=self._exclude.get(table),
1912
+ organism=organism,
1913
+ )
1914
+ for table in self._table_keys
1915
+ }
1916
+
1917
+ self._non_validated = None
1918
+
1919
+ @property
1920
+ def var_index(self) -> FieldAttr:
1921
+ """Return the registry fields to validate variables indices against."""
1922
+ return self._var_fields
1923
+
1924
+ @property
1925
+ def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1926
+ """Return the categorical keys and fields to validate against."""
1927
+ return self._categoricals
1928
+
1929
+ @property
1930
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1931
+ """Return the non-validated features and labels."""
1932
+ if self._non_validated is None:
1933
+ raise ValidationError("Please run validate() first!")
1934
+ return self._non_validated
1935
+
1936
+ def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1937
+ """Verify that the accessors exist (either a valid table or in attrs)."""
1938
+ for acc in accessors:
1939
+ is_present = False
1940
+ try:
1941
+ self._sdata.get_attrs(key=acc)
1942
+ is_present = True
1943
+ except KeyError:
1944
+ if acc in self._sdata.tables.keys():
1945
+ is_present = True
1946
+ if not is_present:
1947
+ raise ValidationError(f"Accessor '{acc}' does not exist!")
1948
+
1949
+ def lookup(self, public: bool = False) -> CurateLookup:
1950
+ """Look up categories.
1951
+
1952
+ Args:
1953
+ public: Whether the lookup is performed on the public reference.
1954
+ """
1955
+ cat_values_dict = list(self.categoricals.values())[0]
1956
+ return CurateLookup(
1957
+ categoricals=cat_values_dict,
1958
+ slots={"accessors": cat_values_dict.keys()},
1959
+ public=public,
1960
+ )
1961
+
1962
+ def _update_registry_all(self) -> None:
1963
+ """Saves labels of all features for sample and table metadata."""
1964
+ if self._sample_df_curator is not None:
1965
+ self._sample_df_curator._update_registry_all(
1966
+ validated_only=True,
1967
+ )
1968
+ for _, adata_curator in self._table_adata_curators.items():
1969
+ adata_curator._obs_df_curator._update_registry_all(
1970
+ validated_only=True,
1971
+ )
1972
+
1973
+ def add_new_from_var_index(self, table: str, **kwargs) -> None:
1974
+ """Save new values from ``.var.index`` of table.
1975
+
1976
+ Args:
1977
+ table: The table key.
1978
+ organism: The organism name.
1979
+ **kwargs: Additional keyword arguments to pass to create new records.
1980
+ """
1981
+ if self._non_validated is None:
1982
+ raise ValidationError("Run .validate() first.")
1983
+ self._table_adata_curators[table].add_new_from_var_index(**kwargs)
1984
+ if table in self.non_validated.keys():
1985
+ if "var_index" in self._non_validated[table]:
1986
+ self._non_validated[table].pop("var_index")
1987
+
1988
+ if len(self.non_validated[table].values()) == 0:
1989
+ self.non_validated.pop(table)
1990
+
1991
+ def add_new_from(
1992
+ self,
1993
+ key: str,
1994
+ accessor: str | None = None,
1995
+ **kwargs,
1996
+ ) -> None:
1997
+ """Save new values of categorical from sample level metadata or table.
1998
+
1999
+ Args:
2000
+ key: The key referencing the slot in the DataFrame.
2001
+ accessor: The accessor key such as 'sample' or 'table x'.
2002
+ organism: The organism name.
2003
+ **kwargs: Additional keyword arguments to pass to create new records.
2004
+ """
2005
+ if self._non_validated is None:
2006
+ raise ValidationError("Run .validate() first.")
2007
+
2008
+ if len(kwargs) > 0 and key == "all":
2009
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
2010
+
2011
+ if accessor not in self.categoricals:
2012
+ raise ValueError(
2013
+ f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
2014
+ )
2015
+
2016
+ if accessor in self._table_adata_curators:
2017
+ adata_curator = self._table_adata_curators[accessor]
2018
+ adata_curator.add_new_from(key=key, **kwargs)
2019
+ if accessor == self._sample_metadata_key:
2020
+ self._sample_df_curator.add_new_from(key=key, **kwargs)
2021
+
2022
+ if accessor in self.non_validated.keys():
2023
+ if len(self.non_validated[accessor].values()) == 0:
2024
+ self.non_validated.pop(accessor)
2025
+
2026
+ def standardize(self, key: str, accessor: str | None = None) -> None:
2027
+ """Replace synonyms with canonical values.
2028
+
2029
+ Modifies the dataset inplace.
2030
+
2031
+ Args:
2032
+ key: The key referencing the slot in the table or sample metadata.
2033
+ accessor: The accessor key such as 'sample_key' or 'table_key'.
2034
+ """
2035
+ if len(self.non_validated) == 0:
2036
+ logger.warning("values are already standardized")
2037
+ return
2038
+ if self._artifact is not None:
2039
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
2040
+
2041
+ if accessor == self._sample_metadata_key:
2042
+ if key not in self._sample_metadata.columns:
2043
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
2044
+ else:
2045
+ if (
2046
+ key == "var_index" and self._sdata.tables[accessor].var.index is None
2047
+ ) or (
2048
+ key != "var_index"
2049
+ and key not in self._sdata.tables[accessor].obs.columns
2050
+ ):
2051
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
2052
+
2053
+ if accessor in self._table_adata_curators.keys():
2054
+ adata_curator = self._table_adata_curators[accessor]
2055
+ adata_curator.standardize(key)
2056
+ if accessor == self._sample_metadata_key:
2057
+ self._sample_df_curator.standardize(key)
2058
+
2059
+ if len(self.non_validated[accessor].values()) == 0:
2060
+ self.non_validated.pop(accessor)
2061
+
2062
+ def validate(self) -> bool:
2063
+ """Validate variables and categorical observations.
2064
+
2065
+ This method also registers the validated records in the current instance:
2066
+ - from public sources
2067
+
2068
+ Args:
2069
+ organism: The organism name.
2070
+
2071
+ Returns:
2072
+ Whether the SpatialData object is validated.
2073
+ """
2074
+ from lamindb.core._settings import settings
2075
+
2076
+ # add all validated records to the current instance
2077
+ verbosity = settings.verbosity
2078
+ try:
2079
+ settings.verbosity = "error"
2080
+ self._update_registry_all()
2081
+ finally:
2082
+ settings.verbosity = verbosity
2083
+
2084
+ self._non_validated = {} # type: ignore
2085
+
2086
+ sample_validated = True
2087
+ if self._sample_df_curator:
2088
+ logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
2089
+ sample_validated &= self._sample_df_curator.validate()
2090
+ if len(self._sample_df_curator.non_validated) > 0:
2091
+ self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
2092
+ logger.print("")
2093
+
2094
+ mods_validated = True
2095
+ for table, adata_curator in self._table_adata_curators.items():
2096
+ logger.info(f"validating categoricals of table '{table}' ...")
2097
+ mods_validated &= adata_curator.validate()
2098
+ if len(adata_curator.non_validated) > 0:
2099
+ self._non_validated[table] = adata_curator.non_validated # type: ignore
2100
+ logger.print("")
2101
+
2102
+ self._is_validated = sample_validated & mods_validated
2103
+ return self._is_validated
2104
+
2105
+ def save_artifact(
2106
+ self,
2107
+ *,
2108
+ key: str | None = None,
2109
+ description: str | None = None,
2110
+ revises: Artifact | None = None,
2111
+ run: Run | None = None,
2112
+ ) -> Artifact:
2113
+ if not self._is_validated:
2114
+ self.validate()
2115
+ if not self._is_validated:
2116
+ raise ValidationError("Dataset does not validate. Please curate.")
2117
+
2118
+ verbosity = settings.verbosity
2119
+ try:
2120
+ settings.verbosity = "warning"
2121
+
2122
+ if self._artifact is None:
2123
+ # Write the SpatialData object to a random path in tmp directory
2124
+ # The Artifact constructor will move it to the cache
2125
+ write_path = (
2126
+ f"{settings.cache_dir}/{random.randint(10**7, 10**8 - 1)}.zarr"
2127
+ )
2128
+ self._sdata.write(write_path)
2129
+
2130
+ # Create the Artifact and associate Artifact metadata
2131
+ self._artifact = Artifact(
2132
+ write_path,
2133
+ description=description,
2134
+ key=key,
2135
+ revises=revises,
2136
+ run=run,
2137
+ )
2138
+ # According to Tim it is not easy to calculate the number of observations.
2139
+ # We would have to write custom code to iterate over labels (which might not even exist at that point)
2140
+ self._artifact.otype = "spatialdata"
2141
+ self._artifact.save()
2142
+
2143
+ # Link schemas
2144
+ feature_kwargs = check_registry_organism(
2145
+ (list(self._var_fields.values())[0].field.model),
2146
+ self._organism,
2147
+ )
2148
+
2149
+ def _add_set_from_spatialdata(
2150
+ host: Artifact | Collection | Run,
2151
+ var_fields: dict[str, FieldAttr],
2152
+ obs_fields: dict[str, FieldAttr] = None,
2153
+ mute: bool = False,
2154
+ organism: str | Record | None = None,
2155
+ ):
2156
+ """Add Schemas from SpatialData."""
2157
+ if obs_fields is None:
2158
+ obs_fields = {}
2159
+ assert host.otype == "spatialdata" # noqa: S101
2160
+
2161
+ feature_sets = {}
2162
+
2163
+ # sample features
2164
+ sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
2165
+ if len(sample_features) > 0:
2166
+ feature_sets[self._sample_metadata_key] = Schema(
2167
+ features=sample_features
2168
+ )
2169
+
2170
+ # table features
2171
+ for table, field in var_fields.items():
2172
+ table_fs = parse_staged_feature_sets_from_anndata(
2173
+ self._sdata[table],
2174
+ var_field=field,
2175
+ obs_field=obs_fields.get(table, Feature.name),
2176
+ mute=mute,
2177
+ organism=organism,
2178
+ )
2179
+ for k, v in table_fs.items():
2180
+ feature_sets[f"['{table}'].{k}"] = v
2181
+
2182
+ def _unify_staged_feature_sets_by_hash(
2183
+ feature_sets: MutableMapping[str, Schema],
2184
+ ):
2185
+ unique_values: dict[str, Any] = {}
2186
+
2187
+ for key, value in feature_sets.items():
2188
+ value_hash = (
2189
+ value.hash
2190
+ ) # Assuming each value has a .hash attribute
2191
+ if value_hash in unique_values:
2192
+ feature_sets[key] = unique_values[value_hash]
2193
+ else:
2194
+ unique_values[value_hash] = value
2195
+
2196
+ return feature_sets
2197
+
2198
+ # link feature sets
2199
+ host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
2200
+ feature_sets
2201
+ )
2202
+ host.save()
2203
+
2204
+ _add_set_from_spatialdata(
2205
+ self._artifact, var_fields=self._var_fields, **feature_kwargs
2206
+ )
2207
+
2208
+ # Link labels
2209
+ def _add_labels_from_spatialdata(
2210
+ data,
2211
+ artifact: Artifact,
2212
+ fields: dict[str, FieldAttr],
2213
+ feature_ref_is_name: bool | None = None,
2214
+ ):
2215
+ """Add Labels from SpatialData."""
2216
+ features = Feature.lookup().dict()
2217
+ for key, field in fields.items():
2218
+ feature = features.get(key)
2219
+ registry = field.field.model
2220
+ filter_kwargs = check_registry_organism(registry, self._organism)
2221
+ filter_kwargs_current = get_current_filter_kwargs(
2222
+ registry, filter_kwargs
2223
+ )
2224
+ df = data if isinstance(data, pd.DataFrame) else data.obs
2225
+ labels = registry.from_values(
2226
+ df[key],
2227
+ field=field,
2228
+ **filter_kwargs_current,
2229
+ )
2230
+ if len(labels) == 0:
2231
+ continue
2232
+
2233
+ label_ref_is_name = None
2234
+ if hasattr(registry, "_name_field"):
2235
+ label_ref_is_name = field.field.name == registry._name_field
2236
+ add_labels(
2237
+ artifact,
2238
+ records=labels,
2239
+ feature=feature,
2240
+ feature_ref_is_name=feature_ref_is_name,
2241
+ label_ref_is_name=label_ref_is_name,
2242
+ from_curator=True,
2243
+ )
2244
+
2245
+ for accessor, accessor_fields in self._categoricals.items():
2246
+ column_field = self._var_fields.get(accessor)
2247
+ if accessor == self._sample_metadata_key:
2248
+ _add_labels_from_spatialdata(
2249
+ self._sample_metadata,
2250
+ self._artifact,
2251
+ accessor_fields,
2252
+ feature_ref_is_name=(
2253
+ None if column_field is None else _ref_is_name(column_field)
2254
+ ),
2255
+ )
2256
+ else:
2257
+ _add_labels_from_spatialdata(
2258
+ self._sdata.tables[accessor],
2259
+ self._artifact,
2260
+ accessor_fields,
2261
+ feature_ref_is_name=(
2262
+ None if column_field is None else _ref_is_name(column_field)
2263
+ ),
2264
+ )
2265
+
2266
+ finally:
2267
+ settings.verbosity = verbosity
2268
+
2269
+ slug = ln_setup.settings.instance.slug
2270
+ if ln_setup.settings.instance.is_remote: # pragma: no cover
2271
+ logger.important(
2272
+ f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
2273
+ )
2274
+
2275
+ return self._artifact
2276
+
2277
+
2278
+ def _restrict_obs_fields(
2279
+ obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
2280
+ ) -> dict[str, str]:
2281
+ """Restrict the obs fields to name return only available obs fields.
2282
+
2283
+ To simplify the curation, we only validate against either name or ontology_id.
2284
+ If both are available, we validate against ontology_id.
2285
+ If none are available, we validate against name.
2286
+ """
2287
+ obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
2288
+ for name, field in obs_fields.items():
2289
+ if name.endswith("_ontology_term_id"):
2290
+ continue
2291
+ # if both the ontology id and the name are present, only validate on the ontology_id
2292
+ if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
2293
+ obs_fields_unique.pop(name)
2294
+ # if the neither name nor ontology id are present, validate on the name
2295
+ # this will raise error downstream, we just use name to be more readable
2296
+ if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2297
+ obs_fields_unique[name] = field
2298
+
2299
+ # Only retain obs_fields_unique that have keys in adata.obs.columns
2300
+ available_obs_fields = {
2301
+ k: v for k, v in obs_fields_unique.items() if k in obs.columns
2302
+ }
2303
+
2304
+ return available_obs_fields
2305
+
2306
+
2307
+ def _add_defaults_to_obs(
2308
+ obs: pd.DataFrame,
2309
+ defaults: dict[str, str],
2310
+ ) -> None:
2311
+ """Add default columns and values to obs DataFrame."""
2312
+ added_defaults: dict = {}
2313
+ for name, default in defaults.items():
2314
+ if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2315
+ obs[name] = default
2316
+ added_defaults[name] = default
2317
+ logger.important(
2318
+ f"added default value '{default}' to the adata.obs['{name}']"
2319
+ )
2320
+
2321
+
2322
+ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2323
+ """Annotation flow of AnnData based on CELLxGENE schema."""
2324
+
2325
+ _controls_were_created: bool | None = None
2326
+
2327
+ def __init__(
2328
+ self,
2329
+ adata: ad.AnnData | UPathStr,
2330
+ categoricals: dict[str, FieldAttr] | None = None,
2331
+ organism: Literal["human", "mouse"] = "human",
2332
+ *,
2333
+ defaults: dict[str, str] = None,
2334
+ extra_sources: dict[str, Record] = None,
2335
+ schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
2336
+ verbosity: str = "hint",
2337
+ ) -> None:
2338
+ """CELLxGENE schema curator.
2339
+
2340
+ Args:
2341
+ adata: Path to or AnnData object to curate against the CELLxGENE schema.
2342
+ categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
2343
+ The CELLxGENE Curator maps against the required CELLxGENE fields by default.
2344
+ organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
2345
+ defaults: Default values that are set if columns or column values are missing.
2346
+ extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
2347
+ These extra sources are joined with the CELLxGENE fixed sources.
2348
+ Use this parameter when subclassing.
2349
+ exclude: A dictionary mapping column names to values to exclude.
2350
+ schema_version: The CELLxGENE schema version to curate against.
2351
+ verbosity: The verbosity level.
2352
+
2353
+ """
2354
+ import bionty as bt
2355
+
2356
+ CellxGeneAnnDataCatManager._init_categoricals_additional_values()
2357
+
2358
+ var_index: FieldAttr = bt.Gene.ensembl_gene_id
2359
+
2360
+ if categoricals is None:
2361
+ categoricals = CellxGeneAnnDataCatManager._get_categoricals()
2362
+
2363
+ self.organism = organism
2364
+
2365
+ VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
2366
+ if schema_version not in VALID_SCHEMA_VERSIONS:
2367
+ valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
2368
+ raise ValueError(
2369
+ f"Invalid schema_version: {schema_version}. "
2370
+ f"Valid versions are: {valid_versions}"
2371
+ )
2372
+ self.schema_version = schema_version
2373
+ self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
2374
+ with resources.path(
2375
+ "lamindb.curators._cellxgene_schemas", "schema_versions.yml"
2376
+ ) as schema_versions_path:
2377
+ self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
2378
+ self.schema_version
2379
+ ]
2380
+
2381
+ # Fetch AnnData obs to be able to set defaults and get sources
2382
+ if isinstance(adata, ad.AnnData):
2383
+ self._adata_obs = adata.obs
2384
+ else:
2385
+ self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
2386
+
2387
+ # Add defaults first to ensure that we fetch valid sources
2388
+ if defaults:
2389
+ _add_defaults_to_obs(self._adata_obs, defaults)
2390
+
2391
+ self.sources = self._create_sources(self._adata_obs)
2392
+ self.sources = {
2393
+ entity: source
2394
+ for entity, source in self.sources.items()
2395
+ if source is not None
2396
+ }
2397
+
2398
+ # These sources are not a part of the cellxgene schema but rather passed through.
2399
+ # This is useful when other Curators extend the CELLxGENE curator
2400
+ if extra_sources:
2401
+ self.sources = self.sources | extra_sources
2402
+
2403
+ # Exclude default values from validation because they are not available in the pinned sources
2404
+ exclude_keys = {
2405
+ entity: default
2406
+ for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
2407
+ if entity in self._adata_obs.columns # type: ignore
2408
+ }
2409
+
2410
+ super().__init__(
2411
+ data=adata,
2412
+ var_index=var_index,
2413
+ categoricals=_restrict_obs_fields(self._adata_obs, categoricals),
2414
+ verbosity=verbosity,
2415
+ organism=organism,
2416
+ sources=self.sources,
2417
+ exclude=exclude_keys,
2418
+ )
2419
+
2420
+ @classmethod
2421
+ def _init_categoricals_additional_values(cls) -> None:
2422
+ import bionty as bt
2423
+
2424
+ import lamindb as ln
2425
+
2426
+ # Note: if you add another control below, be mindful to change the if condition that
2427
+ # triggers whether creating these records is re-considered
2428
+ if cls._controls_were_created is None:
2429
+ cls._controls_were_created = (
2430
+ ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
2431
+ is not None
2432
+ )
2433
+ if not cls._controls_were_created:
2434
+ logger.important("Creating control labels in the CellxGene schema.")
2435
+ bt.CellType(
2436
+ ontology_id="unknown",
2437
+ name="unknown",
2438
+ description="From CellxGene schema.",
2439
+ ).save()
2440
+ pato = bt.Source.filter(name="pato", version="2024-03-28").one()
2441
+ normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
2442
+ bt.Disease(
2443
+ uid=normal.uid,
2444
+ name=normal.name,
2445
+ ontology_id=normal.ontology_id,
2446
+ description=normal.description,
2447
+ source=normal.source,
2448
+ ).save()
2449
+ bt.Ethnicity(
2450
+ ontology_id="na", name="na", description="From CellxGene schema."
2451
+ ).save()
2452
+ bt.Ethnicity(
2453
+ ontology_id="unknown",
2454
+ name="unknown",
2455
+ description="From CellxGene schema.",
2456
+ ).save()
2457
+ bt.DevelopmentalStage(
2458
+ ontology_id="unknown",
2459
+ name="unknown",
2460
+ description="From CellxGene schema.",
2461
+ ).save()
2462
+ bt.Phenotype(
2463
+ ontology_id="unknown",
2464
+ name="unknown",
2465
+ description="From CellxGene schema.",
2466
+ ).save()
2467
+
2468
+ tissue_type = ln.ULabel(
2469
+ name="TissueType",
2470
+ is_type=True,
2471
+ description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
2472
+ ).save()
2473
+ ln.ULabel(
2474
+ name="tissue", type=tissue_type, description="From CellxGene schema."
2475
+ ).save()
2476
+ ln.ULabel(
2477
+ name="organoid", type=tissue_type, description="From CellxGene schema."
2478
+ ).save()
2479
+ ln.ULabel(
2480
+ name="cell culture",
2481
+ type=tissue_type,
2482
+ description="From CellxGene schema.",
2483
+ ).save()
2484
+
2485
+ suspension_type = ln.ULabel(
2486
+ name="SuspensionType",
2487
+ is_type=True,
2488
+ description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
2489
+ ).save()
2490
+ ln.ULabel(
2491
+ name="cell", type=suspension_type, description="From CellxGene schema."
2492
+ ).save()
2493
+ ln.ULabel(
2494
+ name="nucleus",
2495
+ type=suspension_type,
2496
+ description="From CellxGene schema.",
2497
+ ).save()
2498
+ ln.ULabel(name="na", type=suspension_type).save()
2499
+
2500
+ @classmethod
2501
+ def _get_categoricals(cls) -> dict[str, FieldAttr]:
2502
+ import bionty as bt
2503
+
2504
+ return {
2505
+ "assay": bt.ExperimentalFactor.name,
2506
+ "assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
2507
+ "cell_type": bt.CellType.name,
2508
+ "cell_type_ontology_term_id": bt.CellType.ontology_id,
2509
+ "development_stage": bt.DevelopmentalStage.name,
2510
+ "development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
2511
+ "disease": bt.Disease.name,
2512
+ "disease_ontology_term_id": bt.Disease.ontology_id,
2513
+ # "donor_id": "str", via pandera
2514
+ "self_reported_ethnicity": bt.Ethnicity.name,
2515
+ "self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
2516
+ "sex": bt.Phenotype.name,
2517
+ "sex_ontology_term_id": bt.Phenotype.ontology_id,
2518
+ "suspension_type": ULabel.name,
2519
+ "tissue": bt.Tissue.name,
2520
+ "tissue_ontology_term_id": bt.Tissue.ontology_id,
2521
+ "tissue_type": ULabel.name,
2522
+ "organism": bt.Organism.name,
2523
+ "organism_ontology_term_id": bt.Organism.ontology_id,
2524
+ }
2525
+
2526
+ @classmethod
2527
+ def _get_categoricals_defaults(cls) -> dict[str, str]:
2528
+ return {
2529
+ "cell_type": "unknown",
2530
+ "development_stage": "unknown",
2531
+ "disease": "normal",
2532
+ "donor_id": "unknown",
2533
+ "self_reported_ethnicity": "unknown",
2534
+ "sex": "unknown",
2535
+ "suspension_type": "cell",
2536
+ "tissue_type": "tissue",
2537
+ }
2538
+
2539
+ @property
2540
+ def pinned_ontologies(self) -> pd.DataFrame:
2541
+ return self._pinned_ontologies
2542
+
2543
+ @property
2544
+ def adata(self) -> AnnData:
2545
+ return self._adata
2546
+
2547
+ def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
2548
+ """Creates a sources dictionary that can be passed to AnnDataCatManager."""
2549
+ import bionty as bt
2550
+
2551
+ # fmt: off
2552
+ def _fetch_bionty_source(
2553
+ entity: str, organism: str, source: str
2554
+ ) -> bt.Source | None:
2555
+ """Fetch the Bionty source of the pinned ontology.
2556
+
2557
+ Returns None if the source does not exist.
2558
+ """
2559
+ version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
2560
+ (self._pinned_ontologies["organism"] == organism) &
2561
+ (self._pinned_ontologies["source"] == source), "version"].iloc[0]
2562
+ return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
2563
+
2564
+ entity_mapping = {
2565
+ "var_index": ("Gene", self.organism, "ensembl"),
2566
+ "cell_type": ("CellType", "all", "cl"),
2567
+ "assay": ("ExperimentalFactor", "all", "efo"),
2568
+ "self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
2569
+ "development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
2570
+ "disease": ("Disease", "all", "mondo"),
2571
+ # "organism": ("Organism", "vertebrates", "ensembl"),
2572
+ "sex": ("Phenotype", "all", "pato"),
2573
+ "tissue": ("Tissue", "all", "uberon"),
2574
+ }
2575
+ # fmt: on
2576
+
2577
+ # Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
2578
+ entity_to_sources = {
2579
+ entity: _fetch_bionty_source(*params)
2580
+ for entity, params in entity_mapping.items()
2581
+ if entity in obs.columns
2582
+ or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
2583
+ or entity == "var_index"
2584
+ }
2585
+
2586
+ return entity_to_sources
2587
+
2588
+ def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
2589
+ """Converts a column that stores a name into a column that stores the ontology id.
2590
+
2591
+ cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2592
+ """
2593
+ field_name = field.field.name
2594
+ assert field_name == "name" # noqa: S101
2595
+ cols = ["name", "ontology_id"]
2596
+ registry = field.field.model
2597
+
2598
+ if hasattr(registry, "ontology_id"):
2599
+ validated_records = registry.filter(**{f"{field_name}__in": values})
2600
+ mapper = (
2601
+ pd.DataFrame(validated_records.values_list(*cols))
2602
+ .set_index(0)
2603
+ .to_dict()[1]
2604
+ )
2605
+ return values.map(mapper)
2606
+
2607
+ def validate(self) -> bool: # type: ignore
2608
+ """Validates the AnnData object against most cellxgene requirements."""
2609
+ # Verify that all required obs columns are present
2610
+ missing_obs_fields = [
2611
+ name
2612
+ for name in CellxGeneAnnDataCatManager._get_categoricals_defaults().keys()
2613
+ if name not in self._adata.obs.columns
2614
+ and f"{name}_ontology_term_id" not in self._adata.obs.columns
2615
+ ]
2616
+ if len(missing_obs_fields) > 0:
2617
+ missing_obs_fields_str = ", ".join(list(missing_obs_fields))
2618
+ logger.error(f"missing required obs columns {missing_obs_fields_str}")
2619
+ logger.info(
2620
+ "consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
2621
+ "to automatically add these columns with default values."
2622
+ )
2623
+ return False
2624
+
2625
+ # Verify that no cellxgene reserved names are present
2626
+ reserved_names = {
2627
+ "ethnicity",
2628
+ "ethnicity_ontology_term_id",
2629
+ "X_normalization",
2630
+ "default_field",
2631
+ "layer_descriptions",
2632
+ "tags",
2633
+ "versions",
2634
+ "contributors",
2635
+ "preprint_doi",
2636
+ "project_description",
2637
+ "project_links",
2638
+ "project_name",
2639
+ "publication_doi",
2640
+ }
2641
+ matched_columns = [
2642
+ column for column in self._adata.obs.columns if column in reserved_names
2643
+ ]
2644
+ if len(matched_columns) > 0:
2645
+ raise ValueError(
2646
+ f"AnnData object must not contain obs columns {matched_columns} which are"
2647
+ " reserved from previous schema versions."
2648
+ )
1571
2649
 
1572
- feature_ref_is_name = _ref_is_name(self._columns_field)
1573
- features = Feature.lookup().dict()
1574
- for key, field in self._obs_fields.items():
1575
- feature = features.get(key)
1576
- registry = field.field.model
1577
- organism = check_registry_organism(field.field.model, self._organism).get(
1578
- "organism"
2650
+ return super().validate()
2651
+
2652
+ def to_cellxgene_anndata(
2653
+ self, is_primary_data: bool, title: str | None = None
2654
+ ) -> ad.AnnData:
2655
+ """Converts the AnnData object to the cellxgene-schema input format.
2656
+
2657
+ cellxgene expects the obs fields to be {entity}_ontology_id fields and has many further requirements which are
2658
+ documented here: https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema.
2659
+ This function checks for most but not all requirements of the CELLxGENE schema.
2660
+ If you want to ensure that it fully adheres to the CELLxGENE schema, run `cellxgene-schema` on the AnnData object.
2661
+
2662
+ Args:
2663
+ is_primary_data: Whether the measured data is primary data or not.
2664
+ title: Title of the AnnData object. Commonly the name of the publication.
2665
+
2666
+ Returns:
2667
+ An AnnData object which adheres to the cellxgene-schema.
2668
+ """
2669
+ # Create a copy since we modify the AnnData object extensively
2670
+ adata_cxg = self._adata.copy()
2671
+
2672
+ # cellxgene requires an embedding
2673
+ embedding_pattern = r"^[a-zA-Z][a-zA-Z0-9_.-]*$"
2674
+ exclude_key = "spatial"
2675
+ matching_keys = [
2676
+ key
2677
+ for key in adata_cxg.obsm.keys()
2678
+ if re.match(embedding_pattern, key) and key != exclude_key
2679
+ ]
2680
+ if len(matching_keys) == 0:
2681
+ raise ValueError(
2682
+ "Unable to find an embedding key. Please calculate an embedding."
1579
2683
  )
1580
- labels = registry.from_values(
1581
- values=self._validated_values[key], field=field, organism=organism
2684
+
2685
+ # convert name column to ontology_term_id column
2686
+ for column in adata_cxg.obs.columns:
2687
+ if column in self.categoricals and not column.endswith("_ontology_term_id"):
2688
+ mapped_column = self._convert_name_to_ontology_id(
2689
+ adata_cxg.obs[column], field=self.categoricals.get(column)
2690
+ )
2691
+ if mapped_column is not None:
2692
+ adata_cxg.obs[f"{column}_ontology_term_id"] = mapped_column
2693
+
2694
+ # drop the name columns for ontologies. cellxgene does not allow them.
2695
+ drop_columns = [
2696
+ i
2697
+ for i in adata_cxg.obs.columns
2698
+ if f"{i}_ontology_term_id" in adata_cxg.obs.columns
2699
+ ]
2700
+ adata_cxg.obs.drop(columns=drop_columns, inplace=True)
2701
+
2702
+ # Add cellxgene metadata to AnnData object
2703
+ if "is_primary_data" not in adata_cxg.obs.columns:
2704
+ adata_cxg.obs["is_primary_data"] = is_primary_data
2705
+ if "feature_is_filtered" not in adata_cxg.var.columns:
2706
+ logger.warn(
2707
+ "column 'feature_is_filtered' not present in var. Setting to default"
2708
+ " value of False."
1582
2709
  )
1583
- if len(labels) == 0:
1584
- continue
1585
- if hasattr(registry, "_name_field"):
1586
- label_ref_is_name = field.field.name == registry._name_field
1587
- add_labels(
1588
- artifact,
1589
- records=labels,
1590
- feature=feature,
1591
- feature_ref_is_name=feature_ref_is_name,
1592
- label_ref_is_name=label_ref_is_name,
1593
- from_curator=True,
2710
+ adata_cxg.var["feature_is_filtered"] = False
2711
+ if title is None:
2712
+ raise ValueError("please pass a title!")
2713
+ else:
2714
+ adata_cxg.uns["title"] = title
2715
+ adata_cxg.uns["cxg_lamin_schema_reference"] = self.schema_reference
2716
+ adata_cxg.uns["cxg_lamin_schema_version"] = self.schema_version
2717
+
2718
+ return adata_cxg
2719
+
2720
+
2721
+ class ValueUnit:
2722
+ """Base class for handling value-unit combinations."""
2723
+
2724
+ @staticmethod
2725
+ def parse_value_unit(value: str, is_dose: bool = True) -> tuple[str, str] | None:
2726
+ """Parse a string containing a value and unit into a tuple."""
2727
+ if not isinstance(value, str) or not value.strip():
2728
+ return None
2729
+
2730
+ value = str(value).strip()
2731
+ match = re.match(r"^(\d*\.?\d{0,1})\s*([a-zA-ZμµΜ]+)$", value)
2732
+
2733
+ if not match:
2734
+ raise ValueError(
2735
+ f"Invalid format: {value}. Expected format: number with max 1 decimal place + unit"
2736
+ )
2737
+
2738
+ number, unit = match.groups()
2739
+ formatted_number = f"{float(number):.1f}"
2740
+
2741
+ if is_dose:
2742
+ standardized_unit = DoseHandler.standardize_unit(unit)
2743
+ if not DoseHandler.validate_unit(standardized_unit):
2744
+ raise ValueError(
2745
+ f"Invalid dose unit: {unit}. Must be convertible to one of: nM, μM, mM, M"
2746
+ )
2747
+ else:
2748
+ standardized_unit = TimeHandler.standardize_unit(unit)
2749
+ if not TimeHandler.validate_unit(standardized_unit):
2750
+ raise ValueError(
2751
+ f"Invalid time unit: {unit}. Must be convertible to one of: h, m, s, d, y"
1594
2752
  )
1595
2753
 
1596
- return artifact.save()
2754
+ return formatted_number, standardized_unit
1597
2755
 
1598
2756
 
1599
- class Curator(BaseCurator):
1600
- """Dataset curator.
2757
+ class DoseHandler:
2758
+ """Handler for dose-related operations."""
1601
2759
 
1602
- A `Curator` object makes it easy to save validated & annotated artifacts.
2760
+ VALID_UNITS = {"nM", "μM", "µM", "mM", "M"}
2761
+ UNIT_MAP = {
2762
+ "nm": "nM",
2763
+ "NM": "nM",
2764
+ "um": "μM",
2765
+ "UM": "μM",
2766
+ "μm": "μM",
2767
+ "μM": "μM",
2768
+ "µm": "μM",
2769
+ "µM": "μM",
2770
+ "mm": "mM",
2771
+ "MM": "mM",
2772
+ "m": "M",
2773
+ "M": "M",
2774
+ }
1603
2775
 
1604
- Example:
2776
+ @classmethod
2777
+ def validate_unit(cls, unit: str) -> bool:
2778
+ """Validate if the dose unit is acceptable."""
2779
+ return unit in cls.VALID_UNITS
1605
2780
 
1606
- >>> curator = ln.Curator.from_df(
1607
- >>> df,
1608
- >>> # define validation criteria as mappings
1609
- >>> columns=ln.Feature.name, # map column names
1610
- >>> categoricals={"perturbation": ln.ULabel.name}, # map categories
1611
- >>> )
1612
- >>> curator.validate() # validate the data in df
1613
- >>> artifact = curator.save_artifact(description="my RNA-seq")
1614
- >>> artifact.describe() # see annotations
2781
+ @classmethod
2782
+ def standardize_unit(cls, unit: str) -> str:
2783
+ """Standardize dose unit to standard formats."""
2784
+ return cls.UNIT_MAP.get(unit, unit)
1615
2785
 
1616
- `curator.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
2786
+ @classmethod
2787
+ def validate_values(cls, values: pd.Series) -> list:
2788
+ """Validate pert_dose values with strict case checking."""
2789
+ errors = []
1617
2790
 
1618
- If you find non-validated values, you have several options:
2791
+ for idx, value in values.items():
2792
+ if pd.isna(value):
2793
+ continue
1619
2794
 
1620
- - new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`
1621
- - non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and addressed manually
1622
- """
2795
+ if isinstance(value, (int, float)):
2796
+ errors.append(
2797
+ f"Row {idx} - Missing unit for dose: {value}. Must include a unit (nM, μM, mM, M)"
2798
+ )
2799
+ continue
2800
+
2801
+ try:
2802
+ ValueUnit.parse_value_unit(value, is_dose=True)
2803
+ except ValueError as e:
2804
+ errors.append(f"Row {idx} - {str(e)}")
2805
+
2806
+ return errors
2807
+
2808
+
2809
+ class TimeHandler:
2810
+ """Handler for time-related operations."""
2811
+
2812
+ VALID_UNITS = {"h", "m", "s", "d", "y"}
1623
2813
 
1624
2814
  @classmethod
1625
- @doc_args(DataFrameCurator.__doc__)
1626
- def from_df(
1627
- cls,
1628
- df: pd.DataFrame,
1629
- categoricals: dict[str, FieldAttr] | None = None,
1630
- columns: FieldAttr = Feature.name,
1631
- using_key: str | None = None,
1632
- verbosity: str = "hint",
1633
- organism: str | None = None,
1634
- ) -> DataFrameCurator:
1635
- """{}""" # noqa: D415
1636
- return DataFrameCurator(
1637
- df=df,
1638
- categoricals=categoricals,
1639
- columns=columns,
1640
- using_key=using_key,
1641
- verbosity=verbosity,
1642
- organism=organism,
1643
- )
2815
+ def validate_unit(cls, unit: str) -> bool:
2816
+ """Validate if the time unit is acceptable."""
2817
+ return unit == unit.lower() and unit in cls.VALID_UNITS
1644
2818
 
1645
2819
  @classmethod
1646
- @doc_args(AnnDataCurator.__doc__)
1647
- def from_anndata(
1648
- cls,
1649
- data: ad.AnnData | UPathStr,
1650
- var_index: FieldAttr,
1651
- categoricals: dict[str, FieldAttr] | None = None,
1652
- obs_columns: FieldAttr = Feature.name,
1653
- using_key: str | None = None,
1654
- verbosity: str = "hint",
1655
- organism: str | None = None,
1656
- sources: dict[str, Record] | None = None,
1657
- ) -> AnnDataCurator:
1658
- """{}""" # noqa: D415
1659
- return AnnDataCurator(
1660
- data=data,
1661
- var_index=var_index,
1662
- categoricals=categoricals,
1663
- obs_columns=obs_columns,
1664
- using_key=using_key,
1665
- verbosity=verbosity,
1666
- organism=organism,
1667
- sources=sources,
1668
- )
2820
+ def standardize_unit(cls, unit: str) -> str:
2821
+ """Standardize time unit to standard formats."""
2822
+ if unit.startswith("hr"):
2823
+ return "h"
2824
+ elif unit.startswith("min"):
2825
+ return "m"
2826
+ elif unit.startswith("sec"):
2827
+ return "s"
2828
+ return unit[0].lower()
1669
2829
 
1670
2830
  @classmethod
1671
- @doc_args(MuDataCurator.__doc__)
1672
- def from_mudata(
1673
- cls,
1674
- mdata: MuData,
1675
- var_index: dict[str, dict[str, FieldAttr]],
1676
- categoricals: dict[str, FieldAttr] | None = None,
1677
- using_key: str | None = None,
2831
+ def validate_values(cls, values: pd.Series) -> list:
2832
+ """Validate pert_time values."""
2833
+ errors = []
2834
+
2835
+ for idx, value in values.items():
2836
+ if pd.isna(value):
2837
+ continue
2838
+
2839
+ if isinstance(value, (int, float)):
2840
+ errors.append(
2841
+ f"Row {idx} - Missing unit for time: {value}. Must include a unit (h, m, s, d, y)"
2842
+ )
2843
+ continue
2844
+
2845
+ try:
2846
+ ValueUnit.parse_value_unit(value, is_dose=False)
2847
+ except ValueError as e:
2848
+ errors.append(f"Row {idx} - {str(e)}")
2849
+
2850
+ return errors
2851
+
2852
+
2853
+ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2854
+ """Curator flow for Perturbation data."""
2855
+
2856
+ PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
2857
+
2858
+ def __init__(
2859
+ self,
2860
+ adata: ad.AnnData,
2861
+ organism: Literal["human", "mouse"] = "human",
2862
+ pert_dose: bool = True,
2863
+ pert_time: bool = True,
2864
+ *,
1678
2865
  verbosity: str = "hint",
1679
- organism: str | None = None,
1680
- ) -> MuDataCurator:
1681
- """{}""" # noqa: D415
1682
- return MuDataCurator(
1683
- mdata=mdata,
1684
- var_index=var_index,
1685
- categoricals=categoricals,
1686
- using_key=using_key,
2866
+ cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
2867
+ ):
2868
+ """Initialize the curator with configuration and validation settings."""
2869
+ import bionty as bt
2870
+
2871
+ self._pert_time = pert_time
2872
+ self._pert_dose = pert_dose
2873
+
2874
+ self._validate_initial_data(adata)
2875
+ self._setup_configuration(adata)
2876
+
2877
+ self._setup_sources(adata)
2878
+ self._setup_compound_source()
2879
+
2880
+ super().__init__(
2881
+ adata=adata,
2882
+ categoricals=self.PT_CATEGORICALS,
2883
+ defaults=self.PT_DEFAULT_VALUES,
1687
2884
  verbosity=verbosity,
1688
2885
  organism=organism,
2886
+ extra_sources=self.PT_SOURCES,
2887
+ schema_version=cxg_schema_version,
1689
2888
  )
1690
2889
 
1691
- @classmethod
1692
- @doc_args(SOMACurator.__doc__)
1693
- def from_tiledbsoma(
1694
- cls,
1695
- experiment_uri: UPathStr,
1696
- var_index: dict[str, tuple[str, FieldAttr]],
1697
- categoricals: dict[str, FieldAttr] | None = None,
1698
- obs_columns: FieldAttr = Feature.name,
1699
- using_key: str | None = None,
1700
- organism: str | None = None,
1701
- sources: dict[str, Record] | None = None,
1702
- exclude: dict[str, str | list[str]] | None = None,
1703
- ) -> SOMACurator:
1704
- """{}""" # noqa: D415
1705
- return SOMACurator(
1706
- experiment_uri=experiment_uri,
1707
- var_index=var_index,
1708
- categoricals=categoricals,
1709
- obs_columns=obs_columns,
1710
- using_key=using_key,
1711
- organism=organism,
1712
- sources=sources,
1713
- exclude=exclude,
2890
+ def _setup_configuration(self, adata: ad.AnnData):
2891
+ """Set up default configuration values."""
2892
+ import bionty as bt
2893
+ import wetlab as wl
2894
+
2895
+ self.PT_DEFAULT_VALUES = (
2896
+ CellxGeneAnnDataCatManager._get_categoricals_defaults()
2897
+ | {
2898
+ "cell_line": "unknown",
2899
+ "pert_target": "unknown",
2900
+ }
1714
2901
  )
1715
2902
 
1716
- @classmethod
1717
- def from_spatialdata(
1718
- cls,
1719
- sdata,
1720
- var_index: dict[str, FieldAttr],
1721
- categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1722
- using_key: str | None = None,
1723
- organism: str | None = None,
1724
- sources: dict[str, dict[str, Record]] | None = None,
1725
- exclude: dict[str, dict] | None = None,
1726
- verbosity: str = "hint",
1727
- *,
1728
- sample_metadata_key: str = "sample",
1729
- ):
1730
- """Curation flow for a ``Spatialdata`` object.
2903
+ self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
2904
+ k: v
2905
+ for k, v in {
2906
+ "cell_line": bt.CellLine.name,
2907
+ "pert_target": wl.PerturbationTarget.name,
2908
+ "pert_genetic": wl.GeneticPerturbation.name,
2909
+ "pert_compound": wl.Compound.name,
2910
+ "pert_biologic": wl.Biologic.name,
2911
+ "pert_physical": wl.EnvironmentalPerturbation.name,
2912
+ }.items()
2913
+ if k in adata.obs.columns
2914
+ }
2915
+ # if "donor_id" in self.PT_CATEGORICALS:
2916
+ # self.PT_CATEGORICALS["donor_id"] = Donor.name
2917
+
2918
+ def _setup_sources(self, adata: ad.AnnData):
2919
+ """Set up data sources."""
2920
+ self.PT_SOURCES = {}
2921
+ # if "cell_line" in adata.obs.columns:
2922
+ # self.PT_SOURCES["cell_line"] = (
2923
+ # bt.Source.filter(name="depmap").first()
2924
+ # )
2925
+ if "pert_compound" in adata.obs.columns:
2926
+ import bionty as bt
2927
+
2928
+ self.PT_SOURCES["pert_compound"] = bt.Source.filter(
2929
+ entity="wetlab.Compound", name="chebi"
2930
+ ).first()
2931
+
2932
+ def _validate_initial_data(self, adata: ad.AnnData):
2933
+ """Validate the initial data structure."""
2934
+ self._validate_required_columns(adata)
2935
+ self._validate_perturbation_types(adata)
2936
+
2937
+ def _validate_required_columns(self, adata: ad.AnnData):
2938
+ """Validate required columns are present."""
2939
+ if "pert_target" not in adata.obs.columns:
2940
+ if (
2941
+ "pert_name" not in adata.obs.columns
2942
+ or "pert_type" not in adata.obs.columns
2943
+ ):
2944
+ raise ValidationError(
2945
+ "either 'pert_target' or both 'pert_name' and 'pert_type' must be present"
2946
+ )
2947
+ else:
2948
+ if "pert_name" not in adata.obs.columns:
2949
+ logger.warning(
2950
+ "no 'pert' column found in adata.obs, will only curate 'pert_target'"
2951
+ )
2952
+ elif "pert_type" not in adata.obs.columns:
2953
+ raise ValidationError("both 'pert' and 'pert_type' must be present")
2954
+
2955
+ def _validate_perturbation_types(self, adata: ad.AnnData):
2956
+ """Validate perturbation types."""
2957
+ if "pert_type" in adata.obs.columns:
2958
+ data_pert_types = set(adata.obs["pert_type"].unique())
2959
+ invalid_pert_types = data_pert_types - self.PERT_COLUMNS
2960
+ if invalid_pert_types:
2961
+ raise ValidationError(
2962
+ f"invalid pert_type found: {invalid_pert_types}!\n"
2963
+ f" → allowed values: {self.PERT_COLUMNS}"
2964
+ )
2965
+ self._process_perturbation_types(adata, data_pert_types)
2966
+
2967
+ def _process_perturbation_types(self, adata: ad.AnnData, pert_types: set):
2968
+ """Process and map perturbation types."""
2969
+ for pert_type in pert_types:
2970
+ col_name = "pert_" + pert_type
2971
+ adata.obs[col_name] = adata.obs["pert_name"].where(
2972
+ adata.obs["pert_type"] == pert_type, None
2973
+ )
2974
+ if adata.obs[col_name].dtype.name == "category":
2975
+ adata.obs[col_name].cat.remove_unused_categories()
2976
+ logger.important(f"mapped 'pert_name' to '{col_name}'")
1731
2977
 
1732
- See also :class:`~lamindb.Curator`.
2978
+ def _setup_compound_source(self):
2979
+ """Set up the compound source with muted logging."""
2980
+ import bionty as bt
2981
+ import wetlab as wl
2982
+
2983
+ with logger.mute():
2984
+ chebi_source = bt.Source.filter(
2985
+ entity="wetlab.Compound", name="chebi"
2986
+ ).first()
2987
+ if not chebi_source:
2988
+ wl.Compound.add_source(
2989
+ bt.Source.filter(entity="Drug", name="chebi").first()
2990
+ )
1733
2991
 
1734
- Note that if genes or other measurements are removed from the SpatialData object,
1735
- the object should be recreated.
2992
+ def validate(self) -> bool: # type: ignore
2993
+ """Validate the AnnData object."""
2994
+ validated = super().validate()
2995
+
2996
+ if self._pert_dose:
2997
+ validated &= self._validate_dose_column()
2998
+ if self._pert_time:
2999
+ validated &= self._validate_time_column()
3000
+
3001
+ self._is_validated = validated
3002
+
3003
+ # sort columns
3004
+ first_columns = [
3005
+ "pert_target",
3006
+ "pert_genetic",
3007
+ "pert_compound",
3008
+ "pert_biologic",
3009
+ "pert_physical",
3010
+ "pert_dose",
3011
+ "pert_time",
3012
+ "organism",
3013
+ "cell_line",
3014
+ "cell_type",
3015
+ "disease",
3016
+ "tissue_type",
3017
+ "tissue",
3018
+ "assay",
3019
+ "suspension_type",
3020
+ "donor_id",
3021
+ "sex",
3022
+ "self_reported_ethnicity",
3023
+ "development_stage",
3024
+ "pert_name",
3025
+ "pert_type",
3026
+ ]
3027
+ sorted_columns = [
3028
+ col for col in first_columns if col in self._adata.obs.columns
3029
+ ] + [col for col in self._adata.obs.columns if col not in first_columns]
3030
+ # must assign to self._df to ensure .standardize works correctly
3031
+ self._obs_df = self._adata.obs[sorted_columns]
3032
+ self._adata.obs = self._obs_df
3033
+ return validated
3034
+
3035
+ def standardize(self, key: str) -> pd.DataFrame:
3036
+ """Standardize the AnnData object."""
3037
+ super().standardize(key)
3038
+ self._adata.obs = self._obs_df
3039
+
3040
+ def _validate_dose_column(self) -> bool:
3041
+ """Validate the dose column."""
3042
+ if not Feature.filter(name="pert_dose").exists():
3043
+ Feature(name="pert_dose", dtype="str").save() # type: ignore
3044
+
3045
+ dose_errors = DoseHandler.validate_values(self._adata.obs["pert_dose"])
3046
+ if dose_errors:
3047
+ self._log_validation_errors("pert_dose", dose_errors)
3048
+ return False
3049
+ return True
3050
+
3051
+ def _validate_time_column(self) -> bool:
3052
+ """Validate the time column."""
3053
+ if not Feature.filter(name="pert_time").exists():
3054
+ Feature(name="pert_time", dtype="str").save() # type: ignore
3055
+
3056
+ time_errors = TimeHandler.validate_values(self._adata.obs["pert_time"])
3057
+ if time_errors:
3058
+ self._log_validation_errors("pert_time", time_errors)
3059
+ return False
3060
+ return True
3061
+
3062
+ def _log_validation_errors(self, column: str, errors: list):
3063
+ """Log validation errors with formatting."""
3064
+ errors_print = "\n ".join(errors)
3065
+ logger.warning(
3066
+ f"invalid {column} values found!\n {errors_print}\n"
3067
+ f" → run {colors.cyan('standardize_dose_time()')}"
3068
+ )
1736
3069
 
1737
- In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
3070
+ def standardize_dose_time(self) -> pd.DataFrame:
3071
+ """Standardize dose and time values."""
3072
+ standardized_df = self._adata.obs.copy()
1738
3073
 
1739
- Args:
1740
- sdata: The SpatialData object to curate.
1741
- var_index: A dictionary mapping table keys to the ``.var`` indices.
1742
- categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1743
- using_key: A reference LaminDB instance.
1744
- organism: The organism name.
1745
- sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1746
- exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1747
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1748
- using the exclude parameter ensures they are not validated.
1749
- verbosity: The verbosity level of the logger.
1750
- sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1751
-
1752
- Examples:
1753
- >>> import lamindb as ln
1754
- >>> import bionty as bt
1755
- >>> curator = ln.Curator.from_spatialdata(
1756
- ... sdata,
1757
- ... var_index={
1758
- ... "table_1": bt.Gene.ensembl_gene_id,
1759
- ... },
1760
- ... categoricals={
1761
- ... "table1":
1762
- ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name},
1763
- ... "sample":
1764
- ... {"experimental_factor": bt.ExperimentalFactor.name},
1765
- ... },
1766
- ... organism="human",
1767
- ... )
1768
- """
1769
- try:
1770
- import spatialdata
1771
- except ImportError as e:
1772
- raise ImportError(
1773
- "Please install spatialdata: pip install spatialdata"
1774
- ) from e
3074
+ if "pert_dose" in self._adata.obs.columns:
3075
+ standardized_df = self._standardize_column(
3076
+ standardized_df, "pert_dose", is_dose=True
3077
+ )
1775
3078
 
1776
- from ._spatial import SpatialDataCurator
3079
+ if "pert_time" in self._adata.obs.columns:
3080
+ standardized_df = self._standardize_column(
3081
+ standardized_df, "pert_time", is_dose=False
3082
+ )
1777
3083
 
1778
- return SpatialDataCurator(
1779
- sdata=sdata,
1780
- var_index=var_index,
1781
- categoricals=categoricals,
1782
- using_key=using_key,
1783
- verbosity=verbosity,
1784
- organism=organism,
1785
- sources=sources,
1786
- exclude=exclude,
1787
- sample_metadata_key=sample_metadata_key,
1788
- )
3084
+ self._adata.obs = standardized_df
3085
+ return standardized_df
3086
+
3087
+ def _standardize_column(
3088
+ self, df: pd.DataFrame, column: str, is_dose: bool
3089
+ ) -> pd.DataFrame:
3090
+ """Standardize values in a specific column."""
3091
+ for idx, value in self._adata.obs[column].items():
3092
+ if pd.isna(value) or (
3093
+ isinstance(value, str) and (not value.strip() or value.lower() == "nan")
3094
+ ):
3095
+ df.at[idx, column] = None
3096
+ continue
1789
3097
 
3098
+ try:
3099
+ num, unit = ValueUnit.parse_value_unit(value, is_dose=is_dose)
3100
+ df.at[idx, column] = f"{num}{unit}"
3101
+ except ValueError:
3102
+ continue
1790
3103
 
1791
- def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
1792
- """Get a registry instance using a specific instance."""
1793
- if using_key is not None and using_key != "default":
1794
- return registry.using(using_key)
1795
- return registry
3104
+ return df
1796
3105
 
1797
3106
 
1798
3107
  def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
@@ -1871,11 +3180,11 @@ def validate_categories(
1871
3180
  values: Iterable[str],
1872
3181
  field: FieldAttr,
1873
3182
  key: str,
1874
- using_key: str | None = None,
1875
3183
  organism: str | None = None,
1876
3184
  source: Record | None = None,
1877
3185
  exclude: str | list | None = None,
1878
3186
  hint_print: str | None = None,
3187
+ curator: CatManager | None = None,
1879
3188
  ) -> tuple[bool, list]:
1880
3189
  """Validate ontology terms in a pandas series using LaminDB registries.
1881
3190
 
@@ -1883,7 +3192,6 @@ def validate_categories(
1883
3192
  values: The values to validate.
1884
3193
  field: The field attribute.
1885
3194
  key: The key referencing the slot in the DataFrame.
1886
- using_key: A reference LaminDB instance.
1887
3195
  organism: The organism name.
1888
3196
  source: The source record.
1889
3197
  exclude: Exclude specific values from validation.
@@ -1918,22 +3226,8 @@ def validate_categories(
1918
3226
  non_validated = inspect_result.non_validated
1919
3227
  syn_mapper = inspect_result.synonyms_mapper
1920
3228
 
1921
- # inspect the non-validated values from the using_key instance
1922
- values_validated = []
1923
- if using_key is not None and using_key != "default" and non_validated:
1924
- registry_using = get_registry_instance(registry, using_key)
1925
- inspect_result = inspect_instance(
1926
- values=non_validated,
1927
- field=field,
1928
- registry=registry_using,
1929
- exclude=exclude,
1930
- **kwargs,
1931
- )
1932
- non_validated = inspect_result.non_validated
1933
- values_validated += inspect_result.validated
1934
- syn_mapper.update(inspect_result.synonyms_mapper)
1935
-
1936
3229
  # inspect the non-validated values from public (bionty only)
3230
+ values_validated = []
1937
3231
  if hasattr(registry, "public"):
1938
3232
  verbosity = settings.verbosity
1939
3233
  try:
@@ -1975,6 +3269,10 @@ def validate_categories(
1975
3269
  if logger.indent == "":
1976
3270
  _log_mapping_info()
1977
3271
  logger.warning(warning_message)
3272
+ if curator is not None:
3273
+ curator._validate_category_error_messages = strip_ansi_codes(
3274
+ warning_message
3275
+ )
1978
3276
  logger.indent = ""
1979
3277
  return False, non_validated
1980
3278
 
@@ -1982,7 +3280,6 @@ def validate_categories(
1982
3280
  def standardize_categories(
1983
3281
  values: Iterable[str],
1984
3282
  field: FieldAttr,
1985
- using_key: str | None = None,
1986
3283
  organism: str | None = None,
1987
3284
  source: Record | None = None,
1988
3285
  ) -> dict:
@@ -1999,30 +3296,15 @@ def standardize_categories(
1999
3296
  mute=True,
2000
3297
  return_mapper=True,
2001
3298
  )
2002
-
2003
- if len(values) > len(syn_mapper): # type: ignore
2004
- # standardize values using the using_key instance
2005
- if using_key is not None and using_key != "default":
2006
- registry_using = get_registry_instance(registry, using_key)
2007
- syn_mapper.update(
2008
- registry_using.standardize(
2009
- [v for v in values if v not in syn_mapper],
2010
- field=field.field.name,
2011
- organism=organism,
2012
- source=source,
2013
- mute=True,
2014
- return_mapper=True,
2015
- )
2016
- )
2017
3299
  return syn_mapper
2018
3300
 
2019
3301
 
2020
3302
  def validate_categories_in_df(
2021
3303
  df: pd.DataFrame,
2022
3304
  fields: dict[str, FieldAttr],
2023
- using_key: str | None = None,
2024
3305
  sources: dict[str, Record] = None,
2025
3306
  exclude: dict | None = None,
3307
+ curator: CatManager | None = None,
2026
3308
  **kwargs,
2027
3309
  ) -> tuple[bool, dict]:
2028
3310
  """Validate categories in DataFrame columns using LaminDB registries."""
@@ -2038,9 +3320,9 @@ def validate_categories_in_df(
2038
3320
  df[key],
2039
3321
  field=field,
2040
3322
  key=key,
2041
- using_key=using_key,
2042
3323
  source=sources.get(key),
2043
3324
  exclude=exclude.get(key) if exclude else None,
3325
+ curator=curator,
2044
3326
  **kwargs,
2045
3327
  )
2046
3328
  validated &= is_val
@@ -2055,80 +3337,72 @@ def save_artifact(
2055
3337
  columns_field: FieldAttr | dict[str, FieldAttr],
2056
3338
  description: str | None = None,
2057
3339
  organism: str | None = None,
2058
- adata: ad.AnnData | None = None,
2059
3340
  key: str | None = None,
3341
+ artifact: Artifact | None = None,
2060
3342
  revises: Artifact | None = None,
2061
3343
  run: Run | None = None,
3344
+ schema: Schema | None = None,
2062
3345
  ) -> Artifact:
2063
3346
  """Save all metadata with an Artifact.
2064
3347
 
2065
3348
  Args:
2066
- data: The DataFrame or AnnData object to save.
3349
+ data: The DataFrame/AnnData/MuData object to save.
2067
3350
  fields: A dictionary mapping obs_column to registry_field.
2068
3351
  columns_field: The registry field to validate variables index against.
2069
3352
  description: A description of the artifact.
2070
3353
  organism: The organism name.
2071
- adata: The AnnData object to save and get n_observations, must be provided if data is a path.
2072
3354
  type: The artifact type.
2073
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
3355
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
3356
+ artifact: An already registered artifact. Passing this will not save a new artifact from data.
2074
3357
  revises: Previous version of the artifact. Triggers a revision.
2075
3358
  run: The run that creates the artifact.
2076
3359
 
2077
3360
  Returns:
2078
3361
  The saved Artifact.
2079
3362
  """
2080
- from .._artifact import data_is_anndata
3363
+ from .._artifact import data_is_anndata, data_is_mudata
2081
3364
  from ..core._data import add_labels
2082
3365
 
2083
- artifact = None
2084
- if data_is_anndata(data):
2085
- assert adata is not None # noqa: S101
2086
- artifact = Artifact.from_anndata(
2087
- data, description=description, key=key, revises=revises, run=run
2088
- )
2089
- artifact.n_observations = adata.shape[0]
2090
- data = adata
2091
-
2092
- elif isinstance(data, pd.DataFrame):
2093
- artifact = Artifact.from_df(
2094
- data, description=description, key=key, revises=revises, run=run
2095
- )
2096
- else:
2097
- try:
2098
- from mudata import MuData
2099
-
2100
- if isinstance(data, MuData):
2101
- artifact = Artifact.from_mudata(
2102
- data,
2103
- description=description,
2104
- key=key,
2105
- revises=revises,
2106
- run=run,
2107
- )
2108
- artifact.n_observations = data.n_obs
2109
- except ImportError:
2110
- pass
2111
3366
  if artifact is None:
2112
- raise ValueError("data must be a DataFrame, AnnData or MuData object.")
3367
+ if data_is_anndata(data):
3368
+ artifact = Artifact.from_anndata(
3369
+ data, description=description, key=key, revises=revises, run=run
3370
+ )
3371
+ elif isinstance(data, pd.DataFrame):
3372
+ artifact = Artifact.from_df(
3373
+ data, description=description, key=key, revises=revises, run=run
3374
+ )
3375
+ elif data_is_mudata(data):
3376
+ artifact = Artifact.from_mudata(
3377
+ data,
3378
+ description=description,
3379
+ key=key,
3380
+ revises=revises,
3381
+ run=run,
3382
+ )
3383
+ artifact.schema = schema
2113
3384
  artifact.save()
2114
3385
 
2115
- feature_kwargs = check_registry_organism(
2116
- (
2117
- list(columns_field.values())[0].field.model
2118
- if isinstance(columns_field, dict)
2119
- else columns_field.field.model
2120
- ),
2121
- organism,
2122
- )
3386
+ if organism is not None:
3387
+ feature_kwargs = check_registry_organism(
3388
+ (
3389
+ list(columns_field.values())[0].field.model
3390
+ if isinstance(columns_field, dict)
3391
+ else columns_field.field.model
3392
+ ),
3393
+ organism,
3394
+ )
3395
+ else:
3396
+ feature_kwargs = {}
2123
3397
 
2124
3398
  if artifact.otype == "DataFrame":
2125
- artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
3399
+ artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
2126
3400
  elif artifact.otype == "AnnData":
2127
- artifact.features._add_set_from_anndata(
3401
+ artifact.features._add_set_from_anndata( # type: ignore
2128
3402
  var_field=columns_field, **feature_kwargs
2129
3403
  )
2130
3404
  elif artifact.otype == "MuData":
2131
- artifact.features._add_set_from_mudata(
3405
+ artifact.features._add_set_from_mudata( # type: ignore
2132
3406
  var_fields=columns_field, **feature_kwargs
2133
3407
  )
2134
3408
  else:
@@ -2202,7 +3476,7 @@ def save_artifact(
2202
3476
  )
2203
3477
 
2204
3478
  slug = ln_setup.settings.instance.slug
2205
- if ln_setup.settings.instance.is_remote: # pragma: no cover
3479
+ if ln_setup.settings.instance.is_remote: # pragma: no cover
2206
3480
  logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
2207
3481
  return artifact
2208
3482
 
@@ -2224,7 +3498,6 @@ def update_registry(
2224
3498
  values: list[str],
2225
3499
  field: FieldAttr,
2226
3500
  key: str,
2227
- using_key: str | None = None,
2228
3501
  validated_only: bool = True,
2229
3502
  df: pd.DataFrame | None = None,
2230
3503
  organism: str | None = None,
@@ -2233,13 +3506,12 @@ def update_registry(
2233
3506
  exclude: str | list | None = None,
2234
3507
  **kwargs,
2235
3508
  ) -> None:
2236
- """Save features or labels records in the default instance from the using_key instance.
3509
+ """Save features or labels records in the default instance.
2237
3510
 
2238
3511
  Args:
2239
3512
  values: A list of values to be saved as labels.
2240
3513
  field: The FieldAttr object representing the field for which labels are being saved.
2241
3514
  key: The name of the feature to save.
2242
- using_key: The name of the instance from which to transfer labels (if applicable).
2243
3515
  validated_only: If True, only save validated labels.
2244
3516
  df: A DataFrame to save labels from.
2245
3517
  organism: The organism name.
@@ -2290,22 +3562,10 @@ def update_registry(
2290
3562
  i for i in values if i not in existing_and_public_labels
2291
3563
  ]
2292
3564
 
2293
- # inspect and save validated records the using_key instance
2294
- (
2295
- labels_saved[f"from {using_key}"],
2296
- non_validated_labels,
2297
- ) = update_registry_from_using_instance(
2298
- non_validated_labels,
2299
- field=field,
2300
- using_key=using_key,
2301
- exclude=exclude,
2302
- **filter_kwargs,
2303
- )
2304
-
2305
3565
  # save non-validated/new records
2306
3566
  labels_saved["new"] = non_validated_labels
2307
3567
  if not validated_only:
2308
- non_validated_records = []
3568
+ non_validated_records: RecordList[Any] = [] # type: ignore
2309
3569
  if df is not None and registry == Feature:
2310
3570
  nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
2311
3571
  non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
@@ -2379,48 +3639,6 @@ def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
2379
3639
  is_feature.children.add(*all_records)
2380
3640
 
2381
3641
 
2382
- def update_registry_from_using_instance(
2383
- values: list[str],
2384
- field: FieldAttr,
2385
- using_key: str | None = None,
2386
- exclude: str | list | None = None,
2387
- **kwargs,
2388
- ) -> tuple[list[str], list[str]]:
2389
- """Save features or labels records from the using_key instance.
2390
-
2391
- Args:
2392
- values: A list of values to be saved as labels.
2393
- field: The FieldAttr object representing the field for which labels are being saved.
2394
- using_key: The name of the instance from which to transfer labels (if applicable).
2395
- kwargs: Additional keyword arguments to pass to the registry model.
2396
-
2397
- Returns:
2398
- A tuple containing the list of saved labels and the list of non-saved labels.
2399
- """
2400
- labels_saved = []
2401
- not_saved = values
2402
-
2403
- if using_key is not None and using_key != "default":
2404
- registry_using = get_registry_instance(field.field.model, using_key)
2405
-
2406
- inspect_result_using = inspect_instance(
2407
- values=values,
2408
- field=field,
2409
- registry=registry_using,
2410
- exclude=exclude,
2411
- **kwargs,
2412
- )
2413
- labels_using = registry_using.filter(
2414
- **{f"{field.field.name}__in": inspect_result_using.validated}
2415
- ).all()
2416
- for label_using in labels_using:
2417
- label_using.save()
2418
- labels_saved.append(getattr(label_using, field.field.name))
2419
- not_saved = inspect_result_using.non_validated
2420
-
2421
- return labels_saved, not_saved
2422
-
2423
-
2424
3642
  def _save_organism(name: str):
2425
3643
  """Save an organism record."""
2426
3644
  import bionty as bt
@@ -2445,4 +3663,121 @@ def _ref_is_name(field: FieldAttr) -> bool | None:
2445
3663
  return field.field.name == name_field
2446
3664
 
2447
3665
 
2448
- Curate = Curator # backward compat
3666
+ # backward compat constructors ------------------
3667
+
3668
+
3669
+ @classmethod # type: ignore
3670
+ def from_df(
3671
+ cls,
3672
+ df: pd.DataFrame,
3673
+ categoricals: dict[str, FieldAttr] | None = None,
3674
+ columns: FieldAttr = Feature.name,
3675
+ verbosity: str = "hint",
3676
+ organism: str | None = None,
3677
+ ) -> DataFrameCatManager:
3678
+ return DataFrameCatManager(
3679
+ df=df,
3680
+ categoricals=categoricals,
3681
+ columns=columns,
3682
+ verbosity=verbosity,
3683
+ organism=organism,
3684
+ )
3685
+
3686
+
3687
+ @classmethod # type: ignore
3688
+ def from_anndata(
3689
+ cls,
3690
+ data: ad.AnnData | UPathStr,
3691
+ var_index: FieldAttr,
3692
+ categoricals: dict[str, FieldAttr] | None = None,
3693
+ obs_columns: FieldAttr = Feature.name,
3694
+ verbosity: str = "hint",
3695
+ organism: str | None = None,
3696
+ sources: dict[str, Record] | None = None,
3697
+ ) -> AnnDataCatManager:
3698
+ return AnnDataCatManager(
3699
+ data=data,
3700
+ var_index=var_index,
3701
+ categoricals=categoricals,
3702
+ obs_columns=obs_columns,
3703
+ verbosity=verbosity,
3704
+ organism=organism,
3705
+ sources=sources,
3706
+ )
3707
+
3708
+
3709
+ @classmethod # type: ignore
3710
+ def from_mudata(
3711
+ cls,
3712
+ mdata: MuData,
3713
+ var_index: dict[str, dict[str, FieldAttr]],
3714
+ categoricals: dict[str, FieldAttr] | None = None,
3715
+ verbosity: str = "hint",
3716
+ organism: str | None = None,
3717
+ ) -> MuDataCatManager:
3718
+ return MuDataCatManager(
3719
+ mdata=mdata,
3720
+ var_index=var_index,
3721
+ categoricals=categoricals,
3722
+ verbosity=verbosity,
3723
+ organism=organism,
3724
+ )
3725
+
3726
+
3727
+ @classmethod # type: ignore
3728
+ def from_tiledbsoma(
3729
+ cls,
3730
+ experiment_uri: UPathStr,
3731
+ var_index: dict[str, tuple[str, FieldAttr]],
3732
+ categoricals: dict[str, FieldAttr] | None = None,
3733
+ obs_columns: FieldAttr = Feature.name,
3734
+ organism: str | None = None,
3735
+ sources: dict[str, Record] | None = None,
3736
+ exclude: dict[str, str | list[str]] | None = None,
3737
+ ) -> TiledbsomaCatManager:
3738
+ return TiledbsomaCatManager(
3739
+ experiment_uri=experiment_uri,
3740
+ var_index=var_index,
3741
+ categoricals=categoricals,
3742
+ obs_columns=obs_columns,
3743
+ organism=organism,
3744
+ sources=sources,
3745
+ exclude=exclude,
3746
+ )
3747
+
3748
+
3749
+ @classmethod # type: ignore
3750
+ def from_spatialdata(
3751
+ cls,
3752
+ sdata,
3753
+ var_index: dict[str, FieldAttr],
3754
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
3755
+ organism: str | None = None,
3756
+ sources: dict[str, dict[str, Record]] | None = None,
3757
+ exclude: dict[str, dict] | None = None,
3758
+ verbosity: str = "hint",
3759
+ *,
3760
+ sample_metadata_key: str = "sample",
3761
+ ):
3762
+ try:
3763
+ import spatialdata
3764
+ except ImportError as e:
3765
+ raise ImportError("Please install spatialdata: pip install spatialdata") from e
3766
+
3767
+ return SpatialDataCatManager(
3768
+ sdata=sdata,
3769
+ var_index=var_index,
3770
+ categoricals=categoricals,
3771
+ verbosity=verbosity,
3772
+ organism=organism,
3773
+ sources=sources,
3774
+ exclude=exclude,
3775
+ sample_metadata_key=sample_metadata_key,
3776
+ )
3777
+
3778
+
3779
+ CatManager.from_df = from_df # type: ignore
3780
+ CatManager.from_anndata = from_anndata # type: ignore
3781
+ CatManager.from_mudata = from_mudata # type: ignore
3782
+ CatManager.from_spatialdata = from_spatialdata # type: ignore
3783
+ CatManager.from_tiledbsoma = from_tiledbsoma # type: ignore