lamindb 1.0.5__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. lamindb/__init__.py +17 -6
  2. lamindb/_artifact.py +202 -87
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +86 -52
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +21 -7
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +78 -18
  10. lamindb/_record.py +170 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +42 -11
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +129 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/users.py +1 -4
  19. lamindb/base/validation.py +2 -6
  20. lamindb/core/__init__.py +13 -14
  21. lamindb/core/_context.py +14 -9
  22. lamindb/core/_data.py +29 -25
  23. lamindb/core/_describe.py +1 -1
  24. lamindb/core/_django.py +1 -1
  25. lamindb/core/_feature_manager.py +53 -43
  26. lamindb/core/_label_manager.py +4 -4
  27. lamindb/core/_mapped_collection.py +24 -9
  28. lamindb/core/_track_environment.py +2 -1
  29. lamindb/core/datasets/__init__.py +6 -1
  30. lamindb/core/datasets/_core.py +12 -11
  31. lamindb/core/datasets/_small.py +67 -21
  32. lamindb/core/exceptions.py +1 -90
  33. lamindb/core/loaders.py +21 -15
  34. lamindb/core/relations.py +6 -4
  35. lamindb/core/storage/_anndata_accessor.py +49 -3
  36. lamindb/core/storage/_backed_access.py +12 -7
  37. lamindb/core/storage/_pyarrow_dataset.py +40 -15
  38. lamindb/core/storage/_tiledbsoma.py +56 -12
  39. lamindb/core/storage/paths.py +30 -24
  40. lamindb/core/subsettings/_creation_settings.py +4 -16
  41. lamindb/curators/__init__.py +2193 -846
  42. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  43. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  44. lamindb/errors.py +96 -0
  45. lamindb/integrations/_vitessce.py +3 -3
  46. lamindb/migrations/0069_squashed.py +76 -75
  47. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  48. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  49. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  50. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  51. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  52. lamindb/migrations/0086_various.py +95 -0
  53. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  54. lamindb/migrations/0088_schema_components.py +273 -0
  55. lamindb/migrations/0088_squashed.py +4372 -0
  56. lamindb/models.py +475 -168
  57. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/METADATA +9 -7
  58. lamindb-1.1.1.dist-info/RECORD +95 -0
  59. lamindb/curators/_spatial.py +0 -528
  60. lamindb/migrations/0052_squashed.py +0 -1261
  61. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  62. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  63. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  64. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  65. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  66. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  67. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  68. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  69. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  70. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  71. lamindb/migrations/0063_populate_latest_field.py +0 -45
  72. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  73. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  74. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  75. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  76. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  77. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  78. lamindb-1.0.5.dist-info/RECORD +0 -102
  79. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
  80. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
@@ -1,21 +1,54 @@
1
+ """Curators.
2
+
3
+ .. versionadded:: 1.1.0
4
+
5
+ .. autosummary::
6
+ :toctree: .
7
+
8
+ Curator
9
+ DataFrameCurator
10
+ AnnDataCurator
11
+
12
+ """
13
+
1
14
  from __future__ import annotations
2
15
 
3
16
  import copy
4
- import warnings
17
+ import random
18
+ import re
19
+ from importlib import resources
5
20
  from itertools import chain
6
- from typing import TYPE_CHECKING
21
+ from typing import TYPE_CHECKING, Any, Literal
7
22
 
8
23
  import anndata as ad
9
24
  import lamindb_setup as ln_setup
10
25
  import pandas as pd
26
+ import pandera
11
27
  import pyarrow as pa
12
28
  from lamin_utils import colors, logger
29
+ from lamindb_setup.core import deprecated, upath
13
30
  from lamindb_setup.core._docs import doc_args
14
31
  from lamindb_setup.core.upath import UPath
15
32
 
33
+ from lamindb.core.storage._backed_access import backed_access
34
+
35
+ from ._cellxgene_schemas import _read_schema_versions
36
+
37
+ if TYPE_CHECKING:
38
+ from anndata import AnnData
39
+ from lamindb_setup.core.types import UPathStr
40
+
41
+ from lamindb.base.types import FieldAttr
42
+ from lamindb.models import Record
43
+ from lamindb._feature import parse_dtype, parse_dtype_single_cat
16
44
  from lamindb.base.types import FieldAttr # noqa
45
+ from lamindb.core._data import add_labels
46
+ from lamindb.core._feature_manager import parse_staged_feature_sets_from_anndata
47
+ from lamindb.core._settings import settings
17
48
  from lamindb.models import (
18
49
  Artifact,
50
+ CanCurate,
51
+ Collection,
19
52
  Feature,
20
53
  Record,
21
54
  Run,
@@ -23,15 +56,25 @@ from lamindb.models import (
23
56
  ULabel,
24
57
  )
25
58
 
59
+ from .._artifact import data_is_anndata
26
60
  from .._from_values import _format_values
27
- from ..core.exceptions import ValidationError
61
+ from ..errors import InvalidArgument, ValidationError
28
62
 
29
63
  if TYPE_CHECKING:
30
- from collections.abc import Iterable
64
+ from collections.abc import Iterable, MutableMapping
31
65
  from typing import Any
32
66
 
33
67
  from lamindb_setup.core.types import UPathStr
34
68
  from mudata import MuData
69
+ from spatialdata import SpatialData
70
+
71
+ from lamindb._query_set import RecordList
72
+
73
+
74
+ def strip_ansi_codes(text):
75
+ # This pattern matches ANSI escape sequences
76
+ ansi_pattern = re.compile(r"\x1b\[[0-9;]*m")
77
+ return ansi_pattern.sub("", text)
35
78
 
36
79
 
37
80
  class CurateLookup:
@@ -40,8 +83,6 @@ class CurateLookup:
40
83
  Args:
41
84
  categoricals: A dictionary of categorical fields to lookup.
42
85
  slots: A dictionary of slot fields to lookup.
43
- using_key: The key of the instance to lookup from. Defaults to the
44
- current instance if not specified.
45
86
  public: Whether to lookup from the public instance. Defaults to False.
46
87
 
47
88
  Example:
@@ -55,48 +96,43 @@ class CurateLookup:
55
96
  self,
56
97
  categoricals: dict[str, FieldAttr],
57
98
  slots: dict[str, FieldAttr] = None,
58
- using_key: str | None = None,
59
99
  public: bool = False,
60
100
  ) -> None:
61
101
  slots = slots or {}
62
- self._fields = {**categoricals, **slots}
63
- self._using_key = None if using_key == "default" else using_key
64
- self._using_key_name = self._using_key or ln_setup.settings.instance.slug
102
+ self._categoricals = {**categoricals, **slots}
65
103
  self._public = public
66
- debug_message = f"Lookup objects from {colors.italic(self._using_key_name)}"
67
- logger.debug(debug_message)
68
104
 
69
105
  def __getattr__(self, name):
70
- if name in self._fields:
71
- registry = self._fields[name].field.model
106
+ if name in self._categoricals:
107
+ registry = self._categoricals[name].field.model
72
108
  if self._public and hasattr(registry, "public"):
73
109
  return registry.public().lookup()
74
110
  else:
75
- return get_registry_instance(registry, self._using_key).lookup()
111
+ return registry.lookup()
76
112
  raise AttributeError(
77
113
  f'"{self.__class__.__name__}" object has no attribute "{name}"'
78
114
  )
79
115
 
80
116
  def __getitem__(self, name):
81
- if name in self._fields:
82
- registry = self._fields[name].field.model
117
+ if name in self._categoricals:
118
+ registry = self._categoricals[name].field.model
83
119
  if self._public and hasattr(registry, "public"):
84
120
  return registry.public().lookup()
85
121
  else:
86
- return get_registry_instance(registry, self._using_key).lookup()
122
+ return registry.lookup()
87
123
  raise AttributeError(
88
124
  f'"{self.__class__.__name__}" object has no attribute "{name}"'
89
125
  )
90
126
 
91
127
  def __repr__(self) -> str:
92
- if len(self._fields) > 0:
128
+ if len(self._categoricals) > 0:
93
129
  getattr_keys = "\n ".join(
94
- [f".{key}" for key in self._fields if key.isidentifier()]
130
+ [f".{key}" for key in self._categoricals if key.isidentifier()]
95
131
  )
96
132
  getitem_keys = "\n ".join(
97
- [str([key]) for key in self._fields if not key.isidentifier()]
133
+ [str([key]) for key in self._categoricals if not key.isidentifier()]
98
134
  )
99
- ref = "public" if self._public else self._using_key_name
135
+ ref = "public" if self._public else "registries"
100
136
  return (
101
137
  f"Lookup objects from the {colors.italic(ref)}:\n "
102
138
  f"{colors.green(getattr_keys)}\n "
@@ -105,21 +141,442 @@ class CurateLookup:
105
141
  " → categories.alveolar_type_1_fibroblast_cell\n\n"
106
142
  "To look up public ontologies, use .lookup(public=True)"
107
143
  )
108
- else: # pragma: no cover
144
+ else:  # pragma: no cover
109
145
  return colors.warning("No fields are found!")
110
146
 
111
147
 
112
- class BaseCurator:
113
- """Curate a dataset."""
148
+ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
149
+
150
+
151
+ SLOTS_DOCSTRING = """Curator objects by slot.
152
+
153
+ .. versionadded:: 1.1.1
154
+ """
155
+
156
+
157
+ VALIDATE_DOCSTRING = """Validate dataset.
158
+
159
+ Raises:
160
+ lamindb.errors.ValidationError: If validation fails.
161
+ """
162
+
163
+ SAVE_ARTIFACT_DOCSTRING = """Save an annotated artifact.
164
+
165
+ Args:
166
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
167
+ description: A description.
168
+ revises: Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
169
+ run: The run that creates the artifact.
170
+
171
+ Returns:
172
+ A saved artifact record.
173
+ """
174
+
175
+
176
+ class Curator:
177
+ """Dataset curator.
178
+
179
+ A `Curator` object makes it easy to validate, standardize & annotate datasets.
180
+
181
+ .. versionadded:: 1.1.0
182
+
183
+ See:
184
+ - :class:`~lamindb.curators.DataFrameCurator`
185
+ - :class:`~lamindb.curators.AnnDataCurator`
186
+ """
187
+
188
+ def __init__(self, dataset: Any, schema: Schema | None = None):
189
+ self._artifact: Artifact = None # pass the dataset as an artifact
190
+ self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
191
+ if isinstance(self._dataset, Artifact):
192
+ self._artifact = self._dataset
193
+ if self._artifact.otype in {"DataFrame", "AnnData"}:
194
+ self._dataset = self._dataset.load()
195
+ self._schema: Schema | None = schema
196
+ self._is_validated: bool = False
197
+ self._cat_manager: CatManager = None # is None for CatManager curators
198
+
199
+ @doc_args(VALIDATE_DOCSTRING)
200
+ def validate(self) -> bool | str:
201
+ """{}""" # noqa: D415
202
+ pass  # pragma: no cover
203
+
204
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
205
+ def save_artifact(
206
+ self,
207
+ *,
208
+ key: str | None = None,
209
+ description: str | None = None,
210
+ revises: Artifact | None = None,
211
+ run: Run | None = None,
212
+ ) -> Artifact:
213
+ """{}""" # noqa: D415
214
+ # Note that this docstring has to be consistent with the Artifact()
215
+ # constructor signature
216
+ pass
217
+
218
+
219
+ class DataFrameCurator(Curator):
220
+ # the example in the docstring is tested in test_curators_quickstart_example
221
+ """Curator for a DataFrame object.
222
+
223
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
224
+
225
+ .. versionadded:: 1.1.0
226
+
227
+ Args:
228
+ dataset: The DataFrame-like object to validate & annotate.
229
+ schema: A `Schema` object that defines the validation constraints.
230
+
231
+ Example::
232
+
233
+ import lamindb as ln
234
+ import bionty as bt
235
+
236
+ # define valid labels
237
+ perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
238
+ ln.ULabel(name="DMSO", type=perturbation).save()
239
+ ln.ULabel(name="IFNG", type=perturbation).save()
240
+ bt.CellType.from_source(name="B cell").save()
241
+ bt.CellType.from_source(name="T cell").save()
242
+
243
+ # define schema
244
+ schema = ln.Schema(
245
+ name="small_dataset1_obs_level_metadata",
246
+ features=[
247
+ ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
248
+ ln.Feature(name="sample_note", dtype=str).save(),
249
+ ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
250
+ ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
251
+ ],
252
+ ).save()
253
+
254
+ # curate a DataFrame
255
+ df = ln.core.datasets.small_dataset1(otype="DataFrame")
256
+ curator = ln.curators.DataFrameCurator(df, schema)
257
+ artifact = curator.save_artifact(key="example_datasets/dataset1.parquet")
258
+ assert artifact.schema == schema
259
+ """
260
+
261
+ def __init__(
262
+ self,
263
+ dataset: pd.DataFrame | Artifact,
264
+ schema: Schema,
265
+ ) -> None:
266
+ super().__init__(dataset=dataset, schema=schema)
267
+ categoricals = {}
268
+ if schema.n > 0:
269
+ # populate features
270
+ pandera_columns = {}
271
+ for feature in schema.features.all():
272
+ pandera_dtype = (
273
+ feature.dtype if not feature.dtype.startswith("cat") else "category"
274
+ )
275
+ pandera_columns[feature.name] = pandera.Column(
276
+ pandera_dtype, nullable=feature.nullable
277
+ )
278
+ if feature.dtype.startswith("cat"):
279
+ categoricals[feature.name] = parse_dtype(feature.dtype)[0]["field"]
280
+ self._pandera_schema = pandera.DataFrameSchema(
281
+ pandera_columns, coerce=schema.coerce_dtype
282
+ )
283
+ else:
284
+ assert schema.itype is not None # noqa: S101
285
+ self._cat_manager = DataFrameCatManager(
286
+ self._dataset,
287
+ columns=parse_dtype_single_cat(schema.itype, is_itype=True)["field"],
288
+ categoricals=categoricals,
289
+ )
290
+
291
+ @property
292
+ @doc_args(CAT_MANAGER_DOCSTRING)
293
+ def cat(self) -> CatManager:
294
+ """{}""" # noqa: D415
295
+ return self._cat_manager
296
+
297
+ def standardize(self) -> None:
298
+ """Standardize the dataset.
299
+
300
+ - Adds missing columns for features
301
+ - Fills missing values for features with default values
302
+ """
303
+ for feature in self._schema.members:
304
+ if feature.name not in self._dataset.columns:
305
+ if feature.default_value is not None or feature.nullable:
306
+ fill_value = (
307
+ feature.default_value
308
+ if feature.default_value is not None
309
+ else pd.NA
310
+ )
311
+ if feature.dtype.startswith("cat"):
312
+ self._dataset[feature.name] = pd.Categorical(
313
+ [fill_value] * len(self._dataset)
314
+ )
315
+ else:
316
+ self._dataset[feature.name] = fill_value
317
+ logger.important(
318
+ f"added column {feature.name} with fill value {fill_value}"
319
+ )
320
+ else:
321
+ raise ValidationError(
322
+ f"Missing column {feature.name} cannot be added because is not nullable and has no default value"
323
+ )
324
+ else:
325
+ if feature.default_value is not None:
326
+ if isinstance(
327
+ self._dataset[feature.name].dtype, pd.CategoricalDtype
328
+ ):
329
+ if (
330
+ feature.default_value
331
+ not in self._dataset[feature.name].cat.categories
332
+ ):
333
+ self._dataset[feature.name] = self._dataset[
334
+ feature.name
335
+ ].cat.add_categories(feature.default_value)
336
+ self._dataset[feature.name] = self._dataset[feature.name].fillna(
337
+ feature.default_value
338
+ )
339
+
340
+ def _cat_manager_validate(self) -> None:
341
+ self._cat_manager.validate()
342
+ if self._cat_manager._is_validated:
343
+ self._is_validated = True
344
+ else:
345
+ self._is_validated = False
346
+ raise ValidationError(self._cat_manager._validate_category_error_messages)
347
+
348
+ @doc_args(VALIDATE_DOCSTRING)
349
+ def validate(self) -> None:
350
+ """{}""" # noqa: D415
351
+ if self._schema.n > 0:
352
+ try:
353
+ # first validate through pandera
354
+ self._pandera_schema.validate(self._dataset)
355
+ # then validate lamindb categoricals
356
+ self._cat_manager_validate()
357
+ except pandera.errors.SchemaError as err:
358
+ self._is_validated = False
359
+ # .exconly() doesn't exist on SchemaError
360
+ raise ValidationError(str(err)) from err
361
+ else:
362
+ self._cat_manager_validate()
363
+
364
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
365
+ def save_artifact(
366
+ self,
367
+ *,
368
+ key: str | None = None,
369
+ description: str | None = None,
370
+ revises: Artifact | None = None,
371
+ run: Run | None = None,
372
+ ):
373
+ """{}""" # noqa: D415
374
+ if not self._is_validated:
375
+ self.validate() # raises ValidationError if doesn't validate
376
+ result = parse_dtype_single_cat(self._schema.itype, is_itype=True)
377
+ return save_artifact( # type: ignore
378
+ self._dataset,
379
+ description=description,
380
+ fields=self._cat_manager.categoricals,
381
+ columns_field=result["field"],
382
+ key=key,
383
+ artifact=self._artifact,
384
+ revises=revises,
385
+ run=run,
386
+ schema=self._schema,
387
+ )
388
+
389
+
390
+ class AnnDataCurator(Curator):
391
+ # the example in the docstring is tested in test_curators_quickstart_example
392
+ """Curator for an AnnData object.
393
+
394
+ See also :class:`~lamindb.Curator` and :class:`~lamindb.Schema`.
395
+
396
+ .. versionadded:: 1.1.0
397
+
398
+ Args:
399
+ dataset: The AnnData-like object to validate & annotate.
400
+ schema: A `Schema` object that defines the validation constraints.
401
+
402
+ Example::
403
+
404
+ import lamindb as ln
405
+ import bionty as bt
406
+
407
+ # define valid labels
408
+ perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
409
+ ln.ULabel(name="DMSO", type=perturbation).save()
410
+ ln.ULabel(name="IFNG", type=perturbation).save()
411
+ bt.CellType.from_source(name="B cell").save()
412
+ bt.CellType.from_source(name="T cell").save()
413
+
414
+ # define obs schema
415
+ obs_schema = ln.Schema(
416
+ name="small_dataset1_obs_level_metadata",
417
+ features=[
418
+ ln.Feature(name="perturbation", dtype="cat[ULabel[Perturbation]]").save(),
419
+ ln.Feature(name="sample_note", dtype=str).save(),
420
+ ln.Feature(name="cell_type_by_expert", dtype=bt.CellType).save(),
421
+ ln.Feature(name="cell_type_by_model", dtype=bt.CellType).save(),
422
+ ],
423
+ ).save()
424
+
425
+ # define var schema
426
+ var_schema = ln.Schema(
427
+ name="scRNA_seq_var_schema",
428
+ itype=bt.Gene.ensembl_gene_id,
429
+ dtype=int,
430
+ ).save()
431
+
432
+ # define composite schema
433
+ anndata_schema = ln.Schema(
434
+ name="small_dataset1_anndata_schema",
435
+ otype="AnnData",
436
+ components={"obs": obs_schema, "var": var_schema},
437
+ ).save()
438
+
439
+ # curate an AnnData
440
+ adata = ln.core.datasets.small_dataset1(otype="AnnData")
441
+ curator = ln.curators.AnnDataCurator(adata, anndata_schema)
442
+ artifact = curator.save_artifact(key="example_datasets/dataset1.h5ad")
443
+ assert artifact.schema == anndata_schema
444
+ """
445
+
446
+ def __init__(
447
+ self,
448
+ dataset: AnnData | Artifact,
449
+ schema: Schema,
450
+ ) -> None:
451
+ super().__init__(dataset=dataset, schema=schema)
452
+ if not data_is_anndata(self._dataset):
453
+ raise InvalidArgument("dataset must be AnnData-like.")
454
+ if schema.otype != "AnnData":
455
+ raise InvalidArgument("Schema otype must be 'AnnData'.")
456
+ self._obs_curator = DataFrameCurator(
457
+ self._dataset.obs, schema._get_component("obs")
458
+ )
459
+ self._var_curator = DataFrameCurator(
460
+ self._dataset.var.T, schema._get_component("var")
461
+ )
462
+
463
+ @property
464
+ @doc_args(SLOTS_DOCSTRING)
465
+ def slots(self) -> dict[str, DataFrameCurator]:
466
+ """{}""" # noqa: D415
467
+ return {"obs": self._obs_curator, "var": self._var_curator}
468
+
469
+ @doc_args(VALIDATE_DOCSTRING)
470
+ def validate(self) -> None:
471
+ """{}""" # noqa: D415
472
+ self._obs_curator.validate()
473
+ self._var_curator.validate()
474
+
475
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
476
+ def save_artifact(
477
+ self,
478
+ *,
479
+ key: str | None = None,
480
+ description: str | None = None,
481
+ revises: Artifact | None = None,
482
+ run: Run | None = None,
483
+ ):
484
+ """{}""" # noqa: D415
485
+ if not self._is_validated:
486
+ self.validate() # raises ValidationError if doesn't validate
487
+ result = parse_dtype_single_cat(self._var_curator._schema.itype, is_itype=True)
488
+ return save_artifact( # type: ignore
489
+ self._dataset,
490
+ description=description,
491
+ fields=self._obs_curator._cat_manager.categoricals,
492
+ columns_field=result["field"],
493
+ key=key,
494
+ artifact=self._artifact,
495
+ revises=revises,
496
+ run=run,
497
+ schema=self._schema,
498
+ )
499
+
500
+
501
+ class CatManager:
502
+ """Manage valid categoricals by updating registries.
503
+
504
+ A `CatManager` object makes it easy to validate, standardize & annotate datasets.
505
+
506
+ Example:
507
+
508
+ >>> cat_manager = ln.CatManager(
509
+ >>> dataset,
510
+ >>> # define validation criteria as mappings
511
+ >>> columns=Feature.name, # map column names
512
+ >>> categoricals={"perturbation": ULabel.name}, # map categories
513
+ >>> )
514
+ >>> cat_manager.validate() # validate the dataframe
515
+ >>> artifact = cat_manager.save_artifact(description="my RNA-seq")
516
+ >>> artifact.describe() # see annotations
517
+
518
+ `cat_manager.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
519
+
520
+ If you find non-validated values, you have several options:
521
+
522
+ - new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCatManager.add_new_from`
523
+ - non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCatManager.non_validated` and addressed manually
524
+ """
525
+
526
+ def __init__(
527
+ self, *, dataset, categoricals, sources, organism, exclude, columns_field=None
528
+ ):
529
+ # the below is shared with Curator
530
+ self._artifact: Artifact = None # pass the dataset as an artifact
531
+ self._dataset: Any = dataset # pass the dataset as a UPathStr or data object
532
+ if isinstance(self._dataset, Artifact):
533
+ self._artifact = self._dataset
534
+ if self._artifact.otype in {"DataFrame", "AnnData"}:
535
+ self._dataset = self._dataset.load()
536
+ self._is_validated: bool = False
537
+ # shared until here
538
+ self._categoricals = categoricals or {}
539
+ self._non_validated = None
540
+ self._organism = organism
541
+ self._sources = sources or {}
542
+ self._exclude = exclude or {}
543
+ self._columns_field = columns_field
544
+ self._validate_category_error_messages: str = ""
545
+
546
+ @property
547
+ def non_validated(self) -> dict[str, list[str]]:
548
+ """Return the non-validated features and labels."""
549
+ if self._non_validated is None:
550
+ raise ValidationError("Please run validate() first!")
551
+ return self._non_validated
114
552
 
115
- def __init_subclass__(cls, **kwargs):
116
- super().__init_subclass__(**kwargs)
117
- import sys
553
+ @property
554
+ def categoricals(self) -> dict:
555
+ """Return the columns fields to validate against."""
556
+ return self._categoricals
118
557
 
119
- # Deprecated methods
120
- if "sphinx" not in sys.modules:
121
- if hasattr(cls, "_add_new_from_columns"):
122
- cls.add_new_from_columns = cls._add_new_from_columns
558
+ def _replace_synonyms(
559
+ self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
560
+ ):
561
+ # replace the values in df
562
+ std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
563
+ # remove the standardized values from self.non_validated
564
+ non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
565
+ if len(non_validated) == 0:
566
+ self._non_validated.pop(key, None) # type: ignore
567
+ else:
568
+ self._non_validated[key] = non_validated # type: ignore
569
+ # logging
570
+ n = len(syn_mapper)
571
+ if n > 0:
572
+ syn_mapper_print = _format_values(
573
+ [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
574
+ )
575
+ s = "s" if n > 1 else ""
576
+ logger.success(
577
+ f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
578
+ )
579
+ return std_values
123
580
 
124
581
  def validate(self) -> bool:
125
582
  """Validate dataset.
@@ -127,9 +584,9 @@ class BaseCurator:
127
584
  This method also registers the validated records in the current instance.
128
585
 
129
586
  Returns:
130
- Boolean indicating whether the dataset is validated.
587
+ The boolean `True` if the dataset is validated. Otherwise, a string with the error message.
131
588
  """
132
- pass # pragma: no cover
589
+ pass
133
590
 
134
591
  def standardize(self, key: str) -> None:
135
592
  """Replace synonyms with standardized values.
@@ -142,30 +599,48 @@ class BaseCurator:
142
599
  Returns:
143
600
  None
144
601
  """
145
- pass # pragma: no cover
602
+ pass  # pragma: no cover
146
603
 
604
+ @doc_args(SAVE_ARTIFACT_DOCSTRING)
147
605
  def save_artifact(
148
606
  self,
149
- description: str | None = None,
607
+ *,
150
608
  key: str | None = None,
609
+ description: str | None = None,
151
610
  revises: Artifact | None = None,
152
611
  run: Run | None = None,
153
612
  ) -> Artifact:
154
- """Save the dataset as artifact.
613
+ """{}""" # noqa: D415
614
+ from lamindb.core._settings import settings
155
615
 
156
- Args:
157
- description: A description of the DataFrame object.
158
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
159
- revises: Previous version of the artifact. Triggers a revision.
160
- run: The run that creates the artifact.
616
+ if not self._is_validated:
617
+ self.validate() # returns True or False
618
+ if not self._is_validated: # need to raise error manually
619
+ raise ValidationError("Dataset does not validate. Please curate.")
161
620
 
162
- Returns:
163
- A saved artifact record.
164
- """
165
- pass # pragma: no cover
621
+ # Make sure all labels are saved in the current instance
622
+ verbosity = settings.verbosity
623
+ try:
624
+ settings.verbosity = "warning"
625
+ self._artifact = save_artifact( # type: ignore
626
+ self._dataset,
627
+ description=description,
628
+ fields=self.categoricals,
629
+ columns_field=self._columns_field,
630
+ key=key,
631
+ artifact=self._artifact,
632
+ revises=revises,
633
+ run=run,
634
+ schema=None,
635
+ organism=self._organism,
636
+ )
637
+ finally:
638
+ settings.verbosity = verbosity
639
+
640
+ return self._artifact
166
641
 
167
642
 
168
- class DataFrameCurator(BaseCurator):
643
+ class DataFrameCatManager(CatManager):
169
644
  """Curation flow for a DataFrame object.
170
645
 
171
646
  See also :class:`~lamindb.Curator`.
@@ -174,7 +649,6 @@ class DataFrameCurator(BaseCurator):
174
649
  df: The DataFrame object to curate.
175
650
  columns: The field attribute for the feature column.
176
651
  categoricals: A dictionary mapping column names to registry_field.
177
- using_key: The reference instance containing registries to validate against.
178
652
  verbosity: The verbosity level.
179
653
  organism: The organism name.
180
654
  sources: A dictionary mapping column names to Source records.
@@ -191,165 +665,103 @@ class DataFrameCurator(BaseCurator):
191
665
  ... df,
192
666
  ... categoricals={
193
667
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
194
- ... "donor_id": ln.ULabel.name
668
+ ... "donor_id": ULabel.name
195
669
  ... }
196
670
  ... )
197
671
  """
198
672
 
199
673
  def __init__(
200
674
  self,
201
- df: pd.DataFrame,
675
+ df: pd.DataFrame | Artifact,
202
676
  columns: FieldAttr = Feature.name,
203
677
  categoricals: dict[str, FieldAttr] | None = None,
204
- using_key: str | None = None,
205
678
  verbosity: str = "hint",
206
679
  organism: str | None = None,
207
680
  sources: dict[str, Record] | None = None,
208
681
  exclude: dict | None = None,
209
- check_valid_keys: bool = True,
210
682
  ) -> None:
211
683
  from lamindb.core._settings import settings
212
684
 
213
685
  if organism is not None and not isinstance(organism, str):
214
686
  raise ValueError("organism must be a string such as 'human' or 'mouse'!")
215
687
 
216
- self._df = df
217
- self._fields = categoricals or {}
218
- self._columns_field = columns
219
- self._using_key = using_key
220
- # TODO: change verbosity back
221
688
  settings.verbosity = verbosity
222
- self._artifact = None
223
- self._collection = None
224
- self._validated = False
225
- self._kwargs = {"organism": organism} if organism else {}
226
- self._sources = sources or {}
227
- self._exclude = exclude or {}
228
689
  self._non_validated = None
229
- if check_valid_keys:
230
- self._check_valid_keys()
690
+ super().__init__(
691
+ dataset=df,
692
+ columns_field=columns,
693
+ organism=organism,
694
+ categoricals=categoricals,
695
+ sources=sources,
696
+ exclude=exclude,
697
+ )
231
698
  self._save_columns()
232
699
 
233
- @property
234
- def non_validated(self) -> dict[str, list[str]]:
235
- """Return the non-validated features and labels."""
236
- if self._non_validated is None:
237
- raise ValidationError("Please run validate() first!")
238
- return self._non_validated
239
-
240
- @property
241
- def fields(self) -> dict:
242
- """Return the columns fields to validate against."""
243
- return self._fields
244
-
245
- def lookup(
246
- self, using_key: str | None = None, public: bool = False
247
- ) -> CurateLookup:
700
+ def lookup(self, public: bool = False) -> CurateLookup:
248
701
  """Lookup categories.
249
702
 
250
703
  Args:
251
- using_key: The instance where the lookup is performed.
252
- if "public", the lookup is performed on the public reference.
704
+ public: If "public", the lookup is performed on the public reference.
253
705
  """
254
706
  return CurateLookup(
255
- categoricals=self._fields,
707
+ categoricals=self._categoricals,
256
708
  slots={"columns": self._columns_field},
257
- using_key=using_key or self._using_key,
258
709
  public=public,
259
710
  )
260
711
 
261
- def _check_valid_keys(self, extra: set | None = None) -> None:
262
- extra = extra or set()
263
- for name, d in {
264
- "categoricals": self._fields,
265
- "sources": self._sources,
266
- "exclude": self._exclude,
267
- }.items():
268
- if not isinstance(d, dict):
269
- raise TypeError(f"{name} must be a dictionary!")
270
- valid_keys = set(self._df.columns) | {"columns"} | extra
271
- nonval_keys = [key for key in d.keys() if key not in valid_keys]
272
- n = len(nonval_keys)
273
- s = "s" if n > 1 else ""
274
- are = "are" if n > 1 else "is"
275
- if len(nonval_keys) > 0:
276
- raise ValidationError(
277
- f"key{s} passed to {name} {are} not present in columns: {colors.yellow(_format_values(nonval_keys))}"
278
- )
279
-
280
712
  def _save_columns(self, validated_only: bool = True) -> None:
281
713
  """Save column name records."""
282
714
  # Always save features specified as the fields keys
283
715
  update_registry(
284
- values=list(self.fields.keys()),
716
+ values=list(self.categoricals.keys()),
285
717
  field=self._columns_field,
286
718
  key="columns",
287
- using_key=self._using_key,
288
719
  validated_only=False,
289
720
  source=self._sources.get("columns"),
290
721
  exclude=self._exclude.get("columns"),
291
- **self._kwargs, # type: ignore
292
722
  )
293
723
 
294
724
  # Save the rest of the columns based on validated_only
295
- additional_columns = set(self._df.columns) - set(self.fields.keys())
725
+ additional_columns = set(self._dataset.columns) - set(self.categoricals.keys())
296
726
  if additional_columns:
297
727
  update_registry(
298
728
  values=list(additional_columns),
299
729
  field=self._columns_field,
300
730
  key="columns",
301
- using_key=self._using_key,
302
731
  validated_only=validated_only,
303
- df=self._df, # Get the Feature type from df
732
+ df=self._dataset, # Get the Feature type from df
304
733
  source=self._sources.get("columns"),
305
734
  exclude=self._exclude.get("columns"),
306
- **self._kwargs, # type: ignore
307
735
  )
308
736
 
309
- def add_new_from(self, key: str, organism: str | None = None, **kwargs):
310
- """Add validated & new categories.
737
+ @deprecated(new_name="is run by default")
738
+ def add_new_from_columns(self, organism: str | None = None, **kwargs):
739
+ pass
740
+
741
+ def validate(self) -> bool:
742
+ """Validate variables and categorical observations.
743
+
744
+ This method also registers the validated records in the current instance:
745
+ - from public sources
311
746
 
312
747
  Args:
313
- key: The key referencing the slot in the DataFrame from which to draw terms.
314
748
  organism: The organism name.
315
- **kwargs: Additional keyword arguments to pass to create new records
749
+
750
+ Returns:
751
+ Whether the DataFrame is validated.
316
752
  """
317
- if len(kwargs) > 0 and key == "all":
318
- raise ValueError("Cannot pass additional arguments to 'all' key!")
319
- self._kwargs.update({"organism": organism} if organism else {})
320
- self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
321
-
322
- def _add_new_from_columns(self, organism: str | None = None, **kwargs):
323
- """Deprecated to run by default during init."""
324
- warnings.warn(
325
- "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
326
- DeprecationWarning,
327
- stacklevel=2,
753
+ # add all validated records to the current instance
754
+ self._update_registry_all()
755
+ self._validate_category_error_messages = "" # reset the error messages
756
+ self._is_validated, self._non_validated = validate_categories_in_df( # type: ignore
757
+ self._dataset,
758
+ fields=self.categoricals,
759
+ sources=self._sources,
760
+ exclude=self._exclude,
761
+ curator=self,
762
+ organism=self._organism,
328
763
  )
329
- pass
330
-
331
- def _replace_synonyms(
332
- self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
333
- ):
334
- # replace the values in df
335
- std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
336
- # remove the standardized values from self.non_validated
337
- non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
338
- if len(non_validated) == 0:
339
- self._non_validated.pop(key, None) # type: ignore
340
- else:
341
- self._non_validated[key] = non_validated # type: ignore
342
- # logging
343
- n = len(syn_mapper)
344
- if n > 0:
345
- syn_mapper_print = _format_values(
346
- [f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
347
- )
348
- s = "s" if n > 1 else ""
349
- logger.success(
350
- f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
351
- )
352
- return std_values
764
+ return self._is_validated
353
765
 
354
766
  def standardize(self, key: str) -> None:
355
767
  """Replace synonyms with standardized values.
@@ -359,6 +771,8 @@ class DataFrameCurator(BaseCurator):
359
771
  Args:
360
772
  key: The key referencing the column in the DataFrame to standardize.
361
773
  """
774
+ if self._artifact is not None:
775
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
362
776
  # list is needed to avoid RuntimeError: dictionary changed size during iteration
363
777
  avail_keys = list(self.non_validated.keys())
364
778
  if len(avail_keys) == 0:
@@ -367,137 +781,74 @@ class DataFrameCurator(BaseCurator):
367
781
 
368
782
  if key == "all":
369
783
  for k in avail_keys:
370
- if k in self._fields: # needed to exclude var_index
784
+ if k in self._categoricals: # needed to exclude var_index
371
785
  syn_mapper = standardize_categories(
372
786
  self.non_validated[k],
373
- field=self._fields[k],
374
- using_key=self._using_key,
787
+ field=self._categoricals[k],
375
788
  source=self._sources.get(k),
376
- **self._kwargs,
377
789
  )
378
- self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k])
790
+ self._dataset[k] = self._replace_synonyms(
791
+ k, syn_mapper, self._dataset[k]
792
+ )
379
793
  else:
380
794
  if key not in avail_keys:
381
- if key in self._fields:
795
+ if key in self._categoricals:
382
796
  logger.info(f"No unstandardized values found for {key!r}")
383
797
  else:
384
798
  raise KeyError(
385
799
  f"{key!r} is not a valid key, available keys are: {_format_values(avail_keys)}!"
386
800
  )
387
801
  else:
388
- if key in self._fields: # needed to exclude var_index
802
+ if key in self._categoricals: # needed to exclude var_index
389
803
  syn_mapper = standardize_categories(
390
804
  self.non_validated[key],
391
- field=self._fields[key],
392
- using_key=self._using_key,
805
+ field=self._categoricals[key],
393
806
  source=self._sources.get(key),
394
- **self._kwargs,
807
+ organism=self._organism,
395
808
  )
396
- self._df[key] = self._replace_synonyms(
397
- key, syn_mapper, self._df[key]
809
+ self._dataset[key] = self._replace_synonyms(
810
+ key, syn_mapper, self._dataset[key]
398
811
  )
399
812
 
813
+ def _update_registry_all(self, validated_only: bool = True, **kwargs):
814
+ """Save labels for all features."""
815
+ for name in self.categoricals.keys():
816
+ self._update_registry(name, validated_only=validated_only, **kwargs)
817
+
400
818
  def _update_registry(
401
819
  self, categorical: str, validated_only: bool = True, **kwargs
402
820
  ) -> None:
403
821
  if categorical == "all":
404
822
  self._update_registry_all(validated_only=validated_only, **kwargs)
405
823
  else:
406
- if categorical not in self.fields:
824
+ if categorical not in self.categoricals:
407
825
  raise ValidationError(
408
826
  f"Feature {categorical} is not part of the fields!"
409
827
  )
410
828
  update_registry(
411
- values=_flatten_unique(self._df[categorical]),
412
- field=self.fields[categorical],
829
+ values=_flatten_unique(self._dataset[categorical]),
830
+ field=self.categoricals[categorical],
413
831
  key=categorical,
414
- using_key=self._using_key,
415
832
  validated_only=validated_only,
416
833
  source=self._sources.get(categorical),
417
834
  exclude=self._exclude.get(categorical),
418
- **kwargs,
835
+ organism=self._organism,
419
836
  )
420
837
  # adding new records removes them from non_validated
421
838
  if not validated_only and self._non_validated:
422
839
  self._non_validated.pop(categorical, None) # type: ignore
423
840
 
424
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
425
- """Save labels for all features."""
426
- for name in self.fields.keys():
427
- self._update_registry(name, validated_only=validated_only, **kwargs)
428
-
429
- def validate(self, organism: str | None = None) -> bool:
430
- """Validate variables and categorical observations.
431
-
432
- This method also registers the validated records in the current instance:
433
- - from public sources
434
- - from the using_key instance
841
+ def add_new_from(self, key: str, **kwargs):
842
+ """Add validated & new categories.
435
843
 
436
844
  Args:
845
+ key: The key referencing the slot in the DataFrame from which to draw terms.
437
846
  organism: The organism name.
438
-
439
- Returns:
440
- Whether the DataFrame is validated.
441
- """
442
- self._kwargs.update({"organism": organism} if organism else {})
443
-
444
- # add all validated records to the current instance
445
- self._update_registry_all()
446
-
447
- self._validated, self._non_validated = validate_categories_in_df( # type: ignore
448
- self._df,
449
- fields=self.fields,
450
- using_key=self._using_key,
451
- sources=self._sources,
452
- exclude=self._exclude,
453
- **self._kwargs,
454
- )
455
- return self._validated
456
-
457
- def save_artifact(
458
- self,
459
- description: str | None = None,
460
- key: str | None = None,
461
- revises: Artifact | None = None,
462
- run: Run | None = None,
463
- ) -> Artifact:
464
- """Save the validated DataFrame and metadata.
465
-
466
- Args:
467
- description: Description of the DataFrame object.
468
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
469
- Artifacts with the same key form a revision family.
470
- revises: Previous version of the artifact. Triggers a revision.
471
- run: The run that creates the artifact.
472
-
473
- Returns:
474
- A saved artifact record.
847
+ **kwargs: Additional keyword arguments to pass to create new records
475
848
  """
476
- from lamindb.core._settings import settings
477
-
478
- if not self._validated:
479
- self.validate()
480
- if not self._validated:
481
- raise ValidationError("Dataset does not validate. Please curate.")
482
-
483
- # Make sure all labels are saved in the current instance
484
- verbosity = settings.verbosity
485
- try:
486
- settings.verbosity = "warning"
487
- self._artifact = save_artifact(
488
- self._df,
489
- description=description,
490
- fields=self.fields,
491
- columns_field=self._columns_field,
492
- key=key,
493
- revises=revises,
494
- run=run,
495
- **self._kwargs,
496
- )
497
- finally:
498
- settings.verbosity = verbosity
499
-
500
- return self._artifact
849
+ if len(kwargs) > 0 and key == "all":
850
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
851
+ self._update_registry(key, validated_only=False, **kwargs)
501
852
 
502
853
  def clean_up_failed_runs(self):
503
854
  """Clean up previous failed runs that don't save any outputs."""
@@ -509,21 +860,14 @@ class DataFrameCurator(BaseCurator):
509
860
  ).delete()
510
861
 
511
862
 
512
- class AnnDataCurator(DataFrameCurator):
513
- """Curation flow for ``AnnData``.
514
-
515
- See also :class:`~lamindb.Curator`.
516
-
517
- Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.
518
-
519
- See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.
863
+ class AnnDataCatManager(CatManager):
864
+ """Manage categorical curation.
520
865
 
521
866
  Args:
522
867
  data: The AnnData object or an AnnData-like path.
523
868
  var_index: The registry field for mapping the ``.var`` index.
524
869
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
525
870
  obs_columns: The registry field for mapping the ``.obs.columns``.
526
- using_key: A reference LaminDB instance.
527
871
  verbosity: The verbosity level.
528
872
  organism: The organism name.
529
873
  sources: A dictionary mapping ``.obs.columns`` to Source records.
@@ -538,7 +882,7 @@ class AnnDataCurator(DataFrameCurator):
538
882
  ... var_index=bt.Gene.ensembl_gene_id,
539
883
  ... categoricals={
540
884
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
541
- ... "donor_id": ln.ULabel.name
885
+ ... "donor_id": ULabel.name
542
886
  ... },
543
887
  ... organism="human",
544
888
  ... )
@@ -546,56 +890,48 @@ class AnnDataCurator(DataFrameCurator):
546
890
 
547
891
  def __init__(
548
892
  self,
549
- data: ad.AnnData | UPathStr,
893
+ data: ad.AnnData | Artifact,
550
894
  var_index: FieldAttr,
551
895
  categoricals: dict[str, FieldAttr] | None = None,
552
896
  obs_columns: FieldAttr = Feature.name,
553
- using_key: str | None = None,
554
897
  verbosity: str = "hint",
555
898
  organism: str | None = None,
556
899
  sources: dict[str, Record] | None = None,
557
900
  exclude: dict | None = None,
558
901
  ) -> None:
559
- from lamindb_setup.core import upath
560
-
561
902
  if isinstance(var_index, str):
562
903
  raise TypeError("var_index parameter has to be a bionty field")
563
904
 
564
- from .._artifact import data_is_anndata
565
-
566
905
  if sources is None:
567
906
  sources = {}
568
907
  if not data_is_anndata(data):
569
- raise TypeError(
570
- "data has to be an AnnData object or a path to AnnData-like"
571
- )
572
- if isinstance(data, ad.AnnData):
573
- self._adata = data
574
- else: # pragma: no cover
575
- from lamindb.core.storage._backed_access import backed_access
576
-
577
- self._adata = backed_access(upath.create_path(data))
908
+ raise TypeError("data has to be an AnnData object")
578
909
 
579
910
  if "symbol" in str(var_index):
580
911
  logger.warning(
581
912
  "indexing datasets with gene symbols can be problematic: https://docs.lamin.ai/faq/symbol-mapping"
582
913
  )
583
914
 
584
- self._data = data
915
+ self._obs_fields = categoricals or {}
585
916
  self._var_field = var_index
586
917
  super().__init__(
587
- df=self._adata.obs,
918
+ dataset=data,
588
919
  categoricals=categoricals,
920
+ sources=sources,
921
+ organism=organism,
922
+ exclude=exclude,
923
+ columns_field=var_index,
924
+ )
925
+ self._adata = self._dataset
926
+ self._obs_df_curator = DataFrameCatManager(
927
+ df=self._adata.obs,
928
+ categoricals=self.categoricals,
589
929
  columns=obs_columns,
590
- using_key=using_key,
591
930
  verbosity=verbosity,
592
- organism=organism,
931
+ organism=None,
593
932
  sources=sources,
594
933
  exclude=exclude,
595
- check_valid_keys=False,
596
934
  )
597
- self._obs_fields = categoricals or {}
598
- self._check_valid_keys(extra={"var_index"})
599
935
 
600
936
  @property
601
937
  def var_index(self) -> FieldAttr:
@@ -607,54 +943,53 @@ class AnnDataCurator(DataFrameCurator):
607
943
  """Return the obs fields to validate against."""
608
944
  return self._obs_fields
609
945
 
610
- def lookup(
611
- self, using_key: str | None = None, public: bool = False
612
- ) -> CurateLookup:
946
+ def lookup(self, public: bool = False) -> CurateLookup:
613
947
  """Lookup categories.
614
948
 
615
949
  Args:
616
- using_key: The instance where the lookup is performed.
617
- if "public", the lookup is performed on the public reference.
950
+ public: If "public", the lookup is performed on the public reference.
618
951
  """
619
952
  return CurateLookup(
620
953
  categoricals=self._obs_fields,
621
954
  slots={"columns": self._columns_field, "var_index": self._var_field},
622
- using_key=using_key or self._using_key,
623
955
  public=public,
624
956
  )
625
957
 
626
958
  def _save_from_var_index(
627
- self, validated_only: bool = True, organism: str | None = None
959
+ self,
960
+ validated_only: bool = True,
628
961
  ):
629
962
  """Save variable records."""
630
963
  update_registry(
631
964
  values=list(self._adata.var.index),
632
965
  field=self.var_index,
633
966
  key="var_index",
634
- using_key=self._using_key,
635
967
  validated_only=validated_only,
636
- organism=organism,
968
+ organism=self._organism,
637
969
  source=self._sources.get("var_index"),
638
970
  exclude=self._exclude.get("var_index"),
639
971
  )
640
972
 
641
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
642
- """Save labels for all features."""
643
- self._save_from_var_index(validated_only=validated_only, **self._kwargs)
644
- for name in self._obs_fields.keys():
645
- self._update_registry(name, validated_only=validated_only, **self._kwargs)
973
+ def add_new_from(self, key: str, **kwargs):
974
+ """Add validated & new categories.
646
975
 
647
- def add_new_from_var_index(self, organism: str | None = None, **kwargs):
976
+ Args:
977
+ key: The key referencing the slot in the DataFrame from which to draw terms.
978
+ organism: The organism name.
979
+ **kwargs: Additional keyword arguments to pass to create new records
980
+ """
981
+ self._obs_df_curator.add_new_from(key, **kwargs)
982
+
983
+ def add_new_from_var_index(self, **kwargs):
648
984
  """Update variable records.
649
985
 
650
986
  Args:
651
987
  organism: The organism name.
652
988
  **kwargs: Additional keyword arguments to pass to create new records.
653
989
  """
654
- self._kwargs.update({"organism": organism} if organism else {})
655
- self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
990
+ self._save_from_var_index(validated_only=False, **kwargs)
656
991
 
657
- def validate(self, organism: str | None = None) -> bool:
992
+ def validate(self) -> bool:
658
993
  """Validate categories.
659
994
 
660
995
  This method also registers the validated records in the current instance.
@@ -665,38 +1000,25 @@ class AnnDataCurator(DataFrameCurator):
665
1000
  Returns:
666
1001
  Whether the AnnData object is validated.
667
1002
  """
668
- self._kwargs.update({"organism": organism} if organism else {})
669
- if self._using_key is not None and self._using_key != "default":
670
- logger.important(
671
- f"validating metadata using registries of instance {colors.italic(self._using_key)}"
672
- )
1003
+ self._validate_category_error_messages = "" # reset the error messages
673
1004
 
674
1005
  # add all validated records to the current instance
675
- self._update_registry_all()
676
-
1006
+ self._save_from_var_index(validated_only=True)
677
1007
  validated_var, non_validated_var = validate_categories(
678
1008
  self._adata.var.index,
679
1009
  field=self._var_field,
680
1010
  key="var_index",
681
- using_key=self._using_key,
682
1011
  source=self._sources.get("var_index"),
683
1012
  hint_print=".add_new_from_var_index()",
684
1013
  exclude=self._exclude.get("var_index"),
685
- **self._kwargs, # type: ignore
686
- )
687
- validated_obs, non_validated_obs = validate_categories_in_df(
688
- self._adata.obs,
689
- fields=self.categoricals,
690
- using_key=self._using_key,
691
- sources=self._sources,
692
- exclude=self._exclude,
693
- **self._kwargs,
1014
+ organism=self._organism, # type: ignore
694
1015
  )
695
- self._non_validated = non_validated_obs # type: ignore
1016
+ validated_obs = self._obs_df_curator.validate()
1017
+ self._non_validated = self._obs_df_curator._non_validated # type: ignore
696
1018
  if len(non_validated_var) > 0:
697
1019
  self._non_validated["var_index"] = non_validated_var # type: ignore
698
- self._validated = validated_var and validated_obs
699
- return self._validated
1020
+ self._is_validated = validated_var and validated_obs
1021
+ return self._is_validated
700
1022
 
701
1023
  def standardize(self, key: str):
702
1024
  """Replace synonyms with standardized values.
@@ -709,83 +1031,35 @@ class AnnDataCurator(DataFrameCurator):
709
1031
 
710
1032
  Inplace modification of the dataset.
711
1033
  """
1034
+ if self._artifact is not None:
1035
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
712
1036
  if key in self._adata.obs.columns or key == "all":
713
1037
  # standardize obs columns
714
- super().standardize(key)
1038
+ self._obs_df_curator.standardize(key)
715
1039
  # in addition to the obs columns, standardize the var.index
716
1040
  if key == "var_index" or key == "all":
717
1041
  syn_mapper = standardize_categories(
718
1042
  self._adata.var.index,
719
1043
  field=self.var_index,
720
- using_key=self._using_key,
721
1044
  source=self._sources.get("var_index"),
722
- **self._kwargs,
1045
+ organism=self._organism,
723
1046
  )
724
1047
  if "var_index" in self._non_validated: # type: ignore
725
1048
  self._adata.var.index = self._replace_synonyms(
726
1049
  "var_index", syn_mapper, self._adata.var.index
727
1050
  )
728
1051
 
729
- def save_artifact(
730
- self,
731
- description: str | None = None,
732
- key: str | None = None,
733
- revises: Artifact | None = None,
734
- run: Run | None = None,
735
- ) -> Artifact:
736
- """Save the validated ``AnnData`` and metadata.
737
-
738
- Args:
739
- description: A description of the ``AnnData`` object.
740
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`.
741
- Artifacts with the same key form a revision family.
742
- revises: Previous version of the artifact. Triggers a revision.
743
- run: The run that creates the artifact.
744
-
745
- Returns:
746
- A saved artifact record.
747
- """
748
- from lamindb.core._settings import settings
749
-
750
- if not self._validated:
751
- self.validate()
752
- if not self._validated:
753
- raise ValidationError("Dataset does not validate. Please curate.")
754
- verbosity = settings.verbosity
755
- try:
756
- settings.verbosity = "warning"
757
- self._artifact = save_artifact(
758
- self._data,
759
- adata=self._adata,
760
- description=description,
761
- columns_field=self.var_index,
762
- fields=self.categoricals,
763
- key=key,
764
- revises=revises,
765
- run=run,
766
- **self._kwargs,
767
- )
768
- finally:
769
- settings.verbosity = verbosity
770
- return self._artifact
771
-
772
1052
 
773
- class MuDataCurator:
1053
+ class MuDataCatManager(CatManager):
774
1054
  """Curation flow for a ``MuData`` object.
775
1055
 
776
- See also :class:`~lamindb.Curator`.
777
-
778
- Note that if genes or other measurements are removed from the MuData object,
779
- the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
780
-
781
1056
  Args:
782
1057
  mdata: The MuData object to curate.
783
1058
  var_index: The registry field for mapping the ``.var`` index for each modality.
784
1059
  For example:
785
- ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
1060
+ ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": CellMarker.name}``
786
1061
  categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
787
1062
  Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
788
- using_key: A reference LaminDB instance.
789
1063
  verbosity: The verbosity level.
790
1064
  organism: The organism name.
791
1065
  sources: A dictionary mapping ``.obs.columns`` to Source records.
@@ -799,11 +1073,11 @@ class MuDataCurator:
799
1073
  ... mdata,
800
1074
  ... var_index={
801
1075
  ... "rna": bt.Gene.ensembl_gene_id,
802
- ... "adt": ln.CellMarker.name
1076
+ ... "adt": CellMarker.name
803
1077
  ... },
804
1078
  ... categoricals={
805
1079
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
806
- ... "donor_id": ln.ULabel.name
1080
+ ... "donor_id": ULabel.name
807
1081
  ... },
808
1082
  ... organism="human",
809
1083
  ... )
@@ -811,52 +1085,47 @@ class MuDataCurator:
811
1085
 
812
1086
  def __init__(
813
1087
  self,
814
- mdata: MuData,
1088
+ mdata: MuData | Artifact,
815
1089
  var_index: dict[str, FieldAttr],
816
1090
  categoricals: dict[str, FieldAttr] | None = None,
817
- using_key: str | None = None,
818
1091
  verbosity: str = "hint",
819
1092
  organism: str | None = None,
820
1093
  sources: dict[str, Record] | None = None,
821
1094
  exclude: dict | None = None, # {modality: {field: [values]}}
822
1095
  ) -> None:
823
- if sources is None:
824
- sources = {}
825
- self._sources = sources
826
- if exclude is None:
827
- exclude = {}
828
- self._exclude = exclude
829
- self._mdata = mdata
830
- self._kwargs = {"organism": organism} if organism else {}
1096
+ super().__init__(
1097
+ dataset=mdata,
1098
+ categoricals={},
1099
+ sources=sources,
1100
+ organism=organism,
1101
+ exclude=exclude,
1102
+ )
1103
+ self._columns_field = var_index # this is for consistency with BaseCatManager
831
1104
  self._var_fields = var_index
832
1105
  self._verify_modality(self._var_fields.keys())
833
1106
  self._obs_fields = self._parse_categoricals(categoricals)
834
1107
  self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
835
- self._using_key = using_key
836
1108
  self._verbosity = verbosity
837
1109
  self._obs_df_curator = None
838
1110
  if "obs" in self._modalities:
839
- self._obs_df_curator = DataFrameCurator(
840
- df=mdata.obs,
1111
+ self._obs_df_curator = DataFrameCatManager(
1112
+ df=self._dataset.obs,
841
1113
  columns=Feature.name,
842
1114
  categoricals=self._obs_fields.get("obs", {}),
843
- using_key=using_key,
844
1115
  verbosity=verbosity,
845
1116
  sources=self._sources.get("obs"),
846
1117
  exclude=self._exclude.get("obs"),
847
- check_valid_keys=False,
848
- **self._kwargs,
1118
+ organism=organism,
849
1119
  )
850
1120
  self._mod_adata_curators = {
851
- modality: AnnDataCurator(
852
- data=mdata[modality],
1121
+ modality: AnnDataCatManager(
1122
+ data=self._dataset[modality],
853
1123
  var_index=var_index.get(modality),
854
1124
  categoricals=self._obs_fields.get(modality),
855
- using_key=using_key,
856
1125
  verbosity=verbosity,
857
1126
  sources=self._sources.get(modality),
858
1127
  exclude=self._exclude.get(modality),
859
- **self._kwargs,
1128
+ organism=organism,
860
1129
  )
861
1130
  for modality in self._modalities
862
1131
  if modality != "obs"
@@ -874,7 +1143,7 @@ class MuDataCurator:
874
1143
  return self._obs_fields
875
1144
 
876
1145
  @property
877
- def non_validated(self) -> dict[str, dict[str, list[str]]]:
1146
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
878
1147
  """Return the non-validated features and labels."""
879
1148
  if self._non_validated is None:
880
1149
  raise ValidationError("Please run validate() first!")
@@ -883,15 +1152,15 @@ class MuDataCurator:
883
1152
  def _verify_modality(self, modalities: Iterable[str]):
884
1153
  """Verify the modality exists."""
885
1154
  for modality in modalities:
886
- if modality not in self._mdata.mod.keys():
1155
+ if modality not in self._dataset.mod.keys():
887
1156
  raise ValidationError(f"modality '{modality}' does not exist!")
888
1157
 
889
1158
  def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
890
1159
  """Parse the categorical fields."""
891
- prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
1160
+ prefixes = {f"{k}:" for k in self._dataset.mod.keys()}
892
1161
  obs_fields: dict[str, dict[str, FieldAttr]] = {}
893
1162
  for k, v in categoricals.items():
894
- if k not in self._mdata.obs.columns:
1163
+ if k not in self._dataset.obs.columns:
895
1164
  raise ValidationError(f"column '{k}' does not exist in mdata.obs!")
896
1165
  if any(k.startswith(prefix) for prefix in prefixes):
897
1166
  modality, col = k.split(":")[0], k.split(":")[1]
@@ -904,14 +1173,11 @@ class MuDataCurator:
904
1173
  obs_fields["obs"][k] = v
905
1174
  return obs_fields
906
1175
 
907
- def lookup(
908
- self, using_key: str | None = None, public: bool = False
909
- ) -> CurateLookup:
1176
+ def lookup(self, public: bool = False) -> CurateLookup:
910
1177
  """Lookup categories.
911
1178
 
912
1179
  Args:
913
- using_key: The instance where the lookup is performed.
914
- if "public", the lookup is performed on the public reference.
1180
+ public: Perform lookup on public source ontologies.
915
1181
  """
916
1182
  obs_fields = {}
917
1183
  for mod, fields in self._obs_fields.items():
@@ -925,27 +1191,19 @@ class MuDataCurator:
925
1191
  slots={
926
1192
  **{f"{k}_var_index": v for k, v in self._var_fields.items()},
927
1193
  },
928
- using_key=using_key or self._using_key,
929
1194
  public=public,
930
1195
  )
931
1196
 
1197
+ @deprecated(new_name="is run by default")
932
1198
  def add_new_from_columns(
933
1199
  self,
934
1200
  modality: str,
935
1201
  column_names: list[str] | None = None,
936
- organism: str | None = None,
937
1202
  **kwargs,
938
1203
  ):
939
- """Update columns records."""
940
- warnings.warn(
941
- "`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
942
- DeprecationWarning,
943
- stacklevel=2,
944
- )
1204
+ pass
945
1205
 
946
- def add_new_from_var_index(
947
- self, modality: str, organism: str | None = None, **kwargs
948
- ):
1206
+ def add_new_from_var_index(self, modality: str, **kwargs):
949
1207
  """Update variable records.
950
1208
 
951
1209
  Args:
@@ -953,25 +1211,19 @@ class MuDataCurator:
953
1211
  organism: The organism name.
954
1212
  **kwargs: Additional keyword arguments to pass to create new records.
955
1213
  """
956
- self._kwargs.update({"organism": organism} if organism else {})
957
- self._mod_adata_curators[modality].add_new_from_var_index(
958
- **self._kwargs, **kwargs
959
- )
1214
+ self._mod_adata_curators[modality].add_new_from_var_index(**kwargs)
960
1215
 
961
1216
  def _update_registry_all(self):
962
1217
  """Update all registries."""
963
1218
  if self._obs_df_curator is not None:
964
- self._obs_df_curator._update_registry_all(
965
- validated_only=True, **self._kwargs
966
- )
1219
+ self._obs_df_curator._update_registry_all(validated_only=True)
967
1220
  for _, adata_curator in self._mod_adata_curators.items():
968
- adata_curator._update_registry_all(validated_only=True, **self._kwargs)
1221
+ adata_curator._obs_df_curator._update_registry_all(validated_only=True)
969
1222
 
970
1223
  def add_new_from(
971
1224
  self,
972
1225
  key: str,
973
1226
  modality: str | None = None,
974
- organism: str | None = None,
975
1227
  **kwargs,
976
1228
  ):
977
1229
  """Add validated & new categories.
@@ -984,24 +1236,17 @@ class MuDataCurator:
984
1236
  """
985
1237
  if len(kwargs) > 0 and key == "all":
986
1238
  raise ValueError("Cannot pass additional arguments to 'all' key!")
987
- self._kwargs.update({"organism": organism} if organism else {})
988
1239
  modality = modality or "obs"
989
1240
  if modality in self._mod_adata_curators:
990
1241
  adata_curator = self._mod_adata_curators[modality]
991
- adata_curator.add_new_from(key=key, **self._kwargs, **kwargs)
1242
+ adata_curator.add_new_from(key=key, **kwargs)
992
1243
  if modality == "obs":
993
- self._obs_df_curator.add_new_from(key=key, **self._kwargs, **kwargs)
1244
+ self._obs_df_curator.add_new_from(key=key, **kwargs)
994
1245
 
995
- def validate(self, organism: str | None = None) -> bool:
1246
+ def validate(self) -> bool:
996
1247
  """Validate categories."""
997
1248
  from lamindb.core._settings import settings
998
1249
 
999
- self._kwargs.update({"organism": organism} if organism else {})
1000
- if self._using_key is not None and self._using_key != "default":
1001
- logger.important(
1002
- f"validating using registries of instance {colors.italic(self._using_key)}"
1003
- )
1004
-
1005
1250
  # add all validated records to the current instance
1006
1251
  verbosity = settings.verbosity
1007
1252
  try:
@@ -1015,20 +1260,20 @@ class MuDataCurator:
1015
1260
  obs_validated = True
1016
1261
  if "obs" in self._modalities:
1017
1262
  logger.info('validating categoricals in "obs"...')
1018
- obs_validated &= self._obs_df_curator.validate(**self._kwargs)
1263
+ obs_validated &= self._obs_df_curator.validate()
1019
1264
  self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore
1020
1265
  logger.print("")
1021
1266
 
1022
1267
  mods_validated = True
1023
1268
  for modality, adata_curator in self._mod_adata_curators.items():
1024
1269
  logger.info(f'validating categoricals in modality "{modality}"...')
1025
- mods_validated &= adata_curator.validate(**self._kwargs)
1270
+ mods_validated &= adata_curator.validate()
1026
1271
  if len(adata_curator.non_validated) > 0:
1027
1272
  self._non_validated[modality] = adata_curator.non_validated # type: ignore
1028
1273
  logger.print("")
1029
1274
 
1030
- self._validated = obs_validated & mods_validated
1031
- return self._validated
1275
+ self._is_validated = obs_validated & mods_validated
1276
+ return self._is_validated
1032
1277
 
1033
1278
  def standardize(self, key: str, modality: str | None = None):
1034
1279
  """Replace synonyms with standardized values.
@@ -1039,6 +1284,8 @@ class MuDataCurator:
1039
1284
 
1040
1285
  Inplace modification of the dataset.
1041
1286
  """
1287
+ if self._artifact is not None:
1288
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
1042
1289
  modality = modality or "obs"
1043
1290
  if modality in self._mod_adata_curators:
1044
1291
  adata_curator = self._mod_adata_curators[modality]
@@ -1046,47 +1293,6 @@ class MuDataCurator:
1046
1293
  if modality == "obs":
1047
1294
  self._obs_df_curator.standardize(key=key)
1048
1295
 
1049
- def save_artifact(
1050
- self,
1051
- description: str | None = None,
1052
- key: str | None = None,
1053
- revises: Artifact | None = None,
1054
- run: Run | None = None,
1055
- ) -> Artifact:
1056
- """Save the validated ``MuData`` and metadata.
1057
-
1058
- Args:
1059
- description: A description of the ``MuData`` object.
1060
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
1061
- revises: Previous version of the artifact. Triggers a revision.
1062
- run: The run that creates the artifact.
1063
-
1064
- Returns:
1065
- A saved artifact record.
1066
- """
1067
- from lamindb.core._settings import settings
1068
-
1069
- if not self._validated:
1070
- self.validate()
1071
- if not self._validated:
1072
- raise ValidationError("Dataset does not validate. Please curate.")
1073
- verbosity = settings.verbosity
1074
- try:
1075
- settings.verbosity = "warning"
1076
- self._artifact = save_artifact(
1077
- self._mdata,
1078
- description=description,
1079
- columns_field=self.var_index,
1080
- fields=self.categoricals,
1081
- key=key,
1082
- revises=revises,
1083
- run=run,
1084
- **self._kwargs,
1085
- )
1086
- finally:
1087
- settings.verbosity = verbosity
1088
- return self._artifact
1089
-
1090
1296
 
1091
1297
  def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1092
1298
  if (n := len(nonval_keys)) > 0:
@@ -1097,10 +1303,8 @@ def _maybe_curation_keys_not_present(nonval_keys: list[str], name: str):
1097
1303
  )
1098
1304
 
1099
1305
 
1100
- class SOMACurator(BaseCurator):
1101
- """Curation flow for ``tiledbsoma``.
1102
-
1103
- See also :class:`~lamindb.Curator`.
1306
+ class TiledbsomaCatManager(CatManager):
1307
+ """Curation flow for `tiledbsoma.Experiment`.
1104
1308
 
1105
1309
  Args:
1106
1310
  experiment_uri: A local or cloud path to a `tiledbsoma.Experiment`.
@@ -1123,7 +1327,7 @@ class SOMACurator(BaseCurator):
1123
1327
  ... var_index={"RNA": ("var_id", bt.Gene.symbol)},
1124
1328
  ... categoricals={
1125
1329
  ... "cell_type_ontology_id": bt.CellType.ontology_id,
1126
- ... "donor_id": ln.ULabel.name
1330
+ ... "donor_id": ULabel.name
1127
1331
  ... },
1128
1332
  ... organism="human",
1129
1333
  ... )
@@ -1138,23 +1342,21 @@ class SOMACurator(BaseCurator):
1138
1342
  organism: str | None = None,
1139
1343
  sources: dict[str, Record] | None = None,
1140
1344
  exclude: dict[str, str | list[str]] | None = None,
1141
- using_key: str | None = None,
1142
1345
  ):
1143
1346
  self._obs_fields = categoricals or {}
1144
1347
  self._var_fields = var_index
1145
1348
  self._columns_field = obs_columns
1146
1349
  if isinstance(experiment_uri, Artifact):
1147
- self._experiment_uri = experiment_uri.path
1350
+ self._dataset = experiment_uri.path
1148
1351
  self._artifact = experiment_uri
1149
1352
  else:
1150
- self._experiment_uri = UPath(experiment_uri)
1353
+ self._dataset = UPath(experiment_uri)
1151
1354
  self._artifact = None
1152
1355
  self._organism = organism
1153
- self._using_key = using_key
1154
1356
  self._sources = sources or {}
1155
1357
  self._exclude = exclude or {}
1156
1358
 
1157
- self._validated: bool | None = False
1359
+ self._is_validated: bool | None = False
1158
1360
  self._non_validated_values: dict[str, list] | None = None
1159
1361
  self._validated_values: dict[str, list] = {}
1160
1362
  # filled by _check_save_keys
@@ -1172,7 +1374,7 @@ class SOMACurator(BaseCurator):
1172
1374
  def _check_save_keys(self):
1173
1375
  from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1174
1376
 
1175
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1377
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1176
1378
  experiment_obs = experiment.obs
1177
1379
  self._n_obs = len(experiment_obs)
1178
1380
  self._obs_pa_schema = experiment_obs.schema
@@ -1228,7 +1430,6 @@ class SOMACurator(BaseCurator):
1228
1430
  values=register_columns,
1229
1431
  field=self._columns_field,
1230
1432
  key="columns",
1231
- using_key=self._using_key,
1232
1433
  validated_only=False,
1233
1434
  organism=organism,
1234
1435
  source=self._sources.get("columns"),
@@ -1244,7 +1445,6 @@ class SOMACurator(BaseCurator):
1244
1445
  values=additional_columns,
1245
1446
  field=self._columns_field,
1246
1447
  key="columns",
1247
- using_key=self._using_key,
1248
1448
  validated_only=True,
1249
1449
  organism=organism,
1250
1450
  source=self._sources.get("columns"),
@@ -1257,7 +1457,7 @@ class SOMACurator(BaseCurator):
1257
1457
 
1258
1458
  validated = True
1259
1459
  self._non_validated_values = {}
1260
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1460
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1261
1461
  for ms, (key, field) in self._var_fields.items():
1262
1462
  var_ms = experiment.ms[ms].var
1263
1463
  var_ms_key = f"{ms}__{key}"
@@ -1274,7 +1474,6 @@ class SOMACurator(BaseCurator):
1274
1474
  values=var_ms_values,
1275
1475
  field=field,
1276
1476
  key=var_ms_key,
1277
- using_key=self._using_key,
1278
1477
  validated_only=True,
1279
1478
  organism=organism,
1280
1479
  source=self._sources.get(var_ms_key),
@@ -1284,7 +1483,6 @@ class SOMACurator(BaseCurator):
1284
1483
  values=var_ms_values,
1285
1484
  field=field,
1286
1485
  key=var_ms_key,
1287
- using_key=self._using_key,
1288
1486
  organism=organism,
1289
1487
  source=self._sources.get(var_ms_key),
1290
1488
  exclude=self._exclude.get(var_ms_key),
@@ -1310,7 +1508,6 @@ class SOMACurator(BaseCurator):
1310
1508
  values=values,
1311
1509
  field=field,
1312
1510
  key=key,
1313
- using_key=self._using_key,
1314
1511
  validated_only=True,
1315
1512
  organism=organism,
1316
1513
  source=self._sources.get(key),
@@ -1320,7 +1517,6 @@ class SOMACurator(BaseCurator):
1320
1517
  values=values,
1321
1518
  field=field,
1322
1519
  key=key,
1323
- using_key=self._using_key,
1324
1520
  organism=organism,
1325
1521
  source=self._sources.get(key),
1326
1522
  exclude=self._exclude.get(key),
@@ -1330,8 +1526,8 @@ class SOMACurator(BaseCurator):
1330
1526
  self._non_validated_values[key] = non_val
1331
1527
  else:
1332
1528
  self._validated_values[key] = values
1333
- self._validated = validated
1334
- return self._validated
1529
+ self._is_validated = validated
1530
+ return self._is_validated
1335
1531
 
1336
1532
  def _non_validated_values_field(self, key: str) -> tuple[list, FieldAttr]:
1337
1533
  assert self._non_validated_values is not None # noqa: S101
@@ -1346,7 +1542,7 @@ class SOMACurator(BaseCurator):
1346
1542
  values = self._non_validated_values.get(key, [])
1347
1543
  return values, field
1348
1544
 
1349
- def add_new_from(self, key: str) -> None:
1545
+ def add_new_from(self, key: str, **kwargs) -> None:
1350
1546
  """Add validated & new categories.
1351
1547
 
1352
1548
  Args:
@@ -1378,11 +1574,11 @@ class SOMACurator(BaseCurator):
1378
1574
  values=values,
1379
1575
  field=field,
1380
1576
  key=k,
1381
- using_key=self._using_key,
1382
1577
  validated_only=False,
1383
1578
  organism=organism,
1384
1579
  source=self._sources.get(k),
1385
1580
  exclude=self._exclude.get(k),
1581
+ **kwargs,
1386
1582
  )
1387
1583
  # update non-validated values list but keep the key there
1388
1584
  # it will be removed by .validate()
@@ -1405,19 +1601,15 @@ class SOMACurator(BaseCurator):
1405
1601
  """Return the obs fields to validate against."""
1406
1602
  return self._obs_fields
1407
1603
 
1408
- def lookup(
1409
- self, using_key: str | None = None, public: bool = False
1410
- ) -> CurateLookup:
1604
+ def lookup(self, public: bool = False) -> CurateLookup:
1411
1605
  """Lookup categories.
1412
1606
 
1413
1607
  Args:
1414
- using_key: The instance where the lookup is performed.
1415
- if "public", the lookup is performed on the public reference.
1608
+ public: If "public", the lookup is performed on the public reference.
1416
1609
  """
1417
1610
  return CurateLookup(
1418
1611
  categoricals=self._obs_fields,
1419
1612
  slots={"columns": self._columns_field, **self._var_fields_flat},
1420
- using_key=using_key or self._using_key,
1421
1613
  public=public,
1422
1614
  )
1423
1615
 
@@ -1462,7 +1654,6 @@ class SOMACurator(BaseCurator):
1462
1654
  syn_mapper = standardize_categories(
1463
1655
  values=values,
1464
1656
  field=field,
1465
- using_key=self._using_key,
1466
1657
  source=self._sources.get(k),
1467
1658
  organism=organism,
1468
1659
  )
@@ -1471,7 +1662,7 @@ class SOMACurator(BaseCurator):
1471
1662
 
1472
1663
  from lamindb.core.storage._tiledbsoma import _open_tiledbsoma
1473
1664
 
1474
- with _open_tiledbsoma(self._experiment_uri, mode="r") as experiment:
1665
+ with _open_tiledbsoma(self._dataset, mode="r") as experiment:
1475
1666
  value_filter = f"{slot_key} in {list(syn_mapper.keys())}"
1476
1667
  table = slot(experiment).read(value_filter=value_filter).concat()
1477
1668
 
@@ -1484,7 +1675,7 @@ class SOMACurator(BaseCurator):
1484
1675
  lambda val: syn_mapper.get(val, val) # noqa
1485
1676
  )
1486
1677
  # write the mapped values
1487
- with _open_tiledbsoma(self._experiment_uri, mode="w") as experiment:
1678
+ with _open_tiledbsoma(self._dataset, mode="w") as experiment:
1488
1679
  slot(experiment).write(pa.Table.from_pandas(df, schema=table.schema))
1489
1680
  # update non_validated dict
1490
1681
  non_val_k = [
@@ -1502,8 +1693,9 @@ class SOMACurator(BaseCurator):
1502
1693
 
1503
1694
  def save_artifact(
1504
1695
  self,
1505
- description: str | None = None,
1696
+ *,
1506
1697
  key: str | None = None,
1698
+ description: str | None = None,
1507
1699
  revises: Artifact | None = None,
1508
1700
  run: Run | None = None,
1509
1701
  ) -> Artifact:
@@ -1512,7 +1704,7 @@ class SOMACurator(BaseCurator):
1512
1704
  Args:
1513
1705
  description: A description of the ``tiledbsoma`` store.
1514
1706
  key: A path-like key to reference artifact in default storage,
1515
- e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a revision family.
1707
+ e.g., `"myfolder/mystore.tiledbsoma"`. Artifacts with the same key form a version family.
1516
1708
  revises: Previous version of the artifact. Triggers a revision.
1517
1709
  run: The run that creates the artifact.
1518
1710
 
@@ -1521,14 +1713,14 @@ class SOMACurator(BaseCurator):
1521
1713
  """
1522
1714
  from lamindb.core._data import add_labels
1523
1715
 
1524
- if not self._validated:
1716
+ if not self._is_validated:
1525
1717
  self.validate()
1526
- if not self._validated:
1718
+ if not self._is_validated:
1527
1719
  raise ValidationError("Dataset does not validate. Please curate.")
1528
1720
 
1529
1721
  if self._artifact is None:
1530
1722
  artifact = Artifact(
1531
- self._experiment_uri,
1723
+ self._dataset,
1532
1724
  description=description,
1533
1725
  key=key,
1534
1726
  revises=revises,
@@ -1540,7 +1732,7 @@ class SOMACurator(BaseCurator):
1540
1732
  else:
1541
1733
  artifact = self._artifact
1542
1734
 
1543
- _schemas_m2m = {}
1735
+ feature_sets = {}
1544
1736
  if len(self._obs_fields) > 0:
1545
1737
  organism = check_registry_organism(
1546
1738
  self._columns_field.field.model, self._organism
@@ -1550,7 +1742,7 @@ class SOMACurator(BaseCurator):
1550
1742
  empty_dict, schema=self._obs_pa_schema
1551
1743
  ).to_pandas()
1552
1744
  # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
1553
- _schemas_m2m["obs"] = Schema.from_df(
1745
+ feature_sets["obs"] = Schema.from_df(
1554
1746
  df=mock_df,
1555
1747
  field=self._columns_field,
1556
1748
  mute=True,
@@ -1561,238 +1753,1370 @@ class SOMACurator(BaseCurator):
1561
1753
  organism = check_registry_organism(
1562
1754
  var_field.field.model, self._organism
1563
1755
  ).get("organism")
1564
- _schemas_m2m[f"{ms}__var"] = Schema.from_values(
1756
+ feature_sets[f"{ms}__var"] = Schema.from_values(
1565
1757
  values=self._validated_values[f"{ms}__{var_key}"],
1566
1758
  field=var_field,
1567
1759
  organism=organism,
1568
1760
  raise_validation_error=False,
1569
1761
  )
1570
- artifact._staged__schemas_m2m = _schemas_m2m
1762
+ artifact._staged_feature_sets = feature_sets
1763
+
1764
+ feature_ref_is_name = _ref_is_name(self._columns_field)
1765
+ features = Feature.lookup().dict()
1766
+ for key, field in self._obs_fields.items():
1767
+ feature = features.get(key)
1768
+ registry = field.field.model
1769
+ organism = check_registry_organism(field.field.model, self._organism).get(
1770
+ "organism"
1771
+ )
1772
+ labels = registry.from_values(
1773
+ values=self._validated_values[key], field=field, organism=organism
1774
+ )
1775
+ if len(labels) == 0:
1776
+ continue
1777
+ if hasattr(registry, "_name_field"):
1778
+ label_ref_is_name = field.field.name == registry._name_field
1779
+ add_labels(
1780
+ artifact,
1781
+ records=labels,
1782
+ feature=feature,
1783
+ feature_ref_is_name=feature_ref_is_name,
1784
+ label_ref_is_name=label_ref_is_name,
1785
+ from_curator=True,
1786
+ )
1787
+
1788
+ return artifact.save()
1789
+
1790
+
1791
+ class SpatialDataCatManager(CatManager):
1792
+ """Curation flow for a ``Spatialdata`` object.
1793
+
1794
+ See also :class:`~lamindb.Curator`.
1795
+
1796
+ Note that if genes or other measurements are removed from the SpatialData object,
1797
+ the object should be recreated.
1798
+
1799
+ In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
1800
+
1801
+ Args:
1802
+ sdata: The SpatialData object to curate.
1803
+ var_index: A dictionary mapping table keys to the ``.var`` indices.
1804
+ categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1805
+
1806
+ organism: The organism name.
1807
+ sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1808
+ exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1809
+ When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1810
+ using the exclude parameter ensures they are not validated.
1811
+ verbosity: The verbosity level of the logger.
1812
+ sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1813
+
1814
+ Examples:
1815
+ >>> import bionty as bt
1816
+ >>> curator = SpatialDataCatManager(
1817
+ ... sdata,
1818
+ ... var_index={
1819
+ ... "table_1": bt.Gene.ensembl_gene_id,
1820
+ ... },
1821
+ ... categoricals={
1822
+ ... "table1":
1823
+ ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ULabel.name},
1824
+ ... "sample":
1825
+ ... {"experimental_factor": bt.ExperimentalFactor.name},
1826
+ ... },
1827
+ ... organism="human",
1828
+ ... )
1829
+ """
1830
+
1831
+ def __init__(
1832
+ self,
1833
+ sdata: Any,
1834
+ var_index: dict[str, FieldAttr],
1835
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1836
+ verbosity: str = "hint",
1837
+ organism: str | None = None,
1838
+ sources: dict[str, dict[str, Record]] | None = None,
1839
+ exclude: dict[str, dict] | None = None,
1840
+ *,
1841
+ sample_metadata_key: str | None = "sample",
1842
+ ) -> None:
1843
+ super().__init__(
1844
+ dataset=sdata,
1845
+ categoricals={},
1846
+ sources=sources,
1847
+ organism=organism,
1848
+ exclude=exclude,
1849
+ )
1850
+ if isinstance(sdata, Artifact):
1851
+ # TODO: load() doesn't yet work
1852
+ self._sdata = sdata.load()
1853
+ else:
1854
+ self._sdata = self._dataset
1855
+ self._sample_metadata_key = sample_metadata_key
1856
+ self._var_fields = var_index
1857
+ self._verify_accessor_exists(self._var_fields.keys())
1858
+ self._categoricals = categoricals
1859
+ self._table_keys = set(self._var_fields.keys()) | set(
1860
+ self._categoricals.keys() - {self._sample_metadata_key}
1861
+ )
1862
+ self._verbosity = verbosity
1863
+ self._sample_df_curator = None
1864
+ if self._sample_metadata_key is not None:
1865
+ self._sample_metadata = self._sdata.get_attrs(
1866
+ key=self._sample_metadata_key, return_as="df", flatten=True
1867
+ )
1868
+ self._is_validated = False
1869
+
1870
+ # Check validity of keys in categoricals
1871
+ nonval_keys = []
1872
+ for accessor, accessor_categoricals in self._categoricals.items():
1873
+ if (
1874
+ accessor == self._sample_metadata_key
1875
+ and self._sample_metadata is not None
1876
+ ):
1877
+ for key in accessor_categoricals.keys():
1878
+ if key not in self._sample_metadata.columns:
1879
+ nonval_keys.append(key)
1880
+ else:
1881
+ for key in accessor_categoricals.keys():
1882
+ if key not in self._sdata[accessor].obs.columns:
1883
+ nonval_keys.append(key)
1884
+
1885
+ _maybe_curation_keys_not_present(nonval_keys, "categoricals")
1886
+
1887
+ # check validity of keys in sources and exclude
1888
+ for name, dct in (("sources", self._sources), ("exclude", self._exclude)):
1889
+ nonval_keys = []
1890
+ for accessor, accessor_sources in dct.items():
1891
+ if (
1892
+ accessor == self._sample_metadata_key
1893
+ and self._sample_metadata is not None
1894
+ ):
1895
+ columns = self._sample_metadata.columns
1896
+ elif accessor != self._sample_metadata_key:
1897
+ columns = self._sdata[accessor].obs.columns
1898
+ else:
1899
+ continue
1900
+ for key in accessor_sources:
1901
+ if key not in columns:
1902
+ nonval_keys.append(key)
1903
+ _maybe_curation_keys_not_present(nonval_keys, name)
1904
+
1905
+ # Set up sample level metadata and table Curator objects
1906
+ if (
1907
+ self._sample_metadata_key is not None
1908
+ and self._sample_metadata_key in self._categoricals
1909
+ ):
1910
+ self._sample_df_curator = DataFrameCatManager(
1911
+ df=self._sample_metadata,
1912
+ columns=Feature.name,
1913
+ categoricals=self._categoricals.get(self._sample_metadata_key, {}),
1914
+ verbosity=verbosity,
1915
+ sources=self._sources.get(self._sample_metadata_key),
1916
+ exclude=self._exclude.get(self._sample_metadata_key),
1917
+ organism=organism,
1918
+ )
1919
+ self._table_adata_curators = {
1920
+ table: AnnDataCatManager(
1921
+ data=self._sdata[table],
1922
+ var_index=var_index.get(table),
1923
+ categoricals=self._categoricals.get(table),
1924
+ verbosity=verbosity,
1925
+ sources=self._sources.get(table),
1926
+ exclude=self._exclude.get(table),
1927
+ organism=organism,
1928
+ )
1929
+ for table in self._table_keys
1930
+ }
1931
+
1932
+ self._non_validated = None
1933
+
1934
+ @property
1935
+ def var_index(self) -> FieldAttr:
1936
+ """Return the registry fields to validate variables indices against."""
1937
+ return self._var_fields
1938
+
1939
+ @property
1940
+ def categoricals(self) -> dict[str, dict[str, FieldAttr]]:
1941
+ """Return the categorical keys and fields to validate against."""
1942
+ return self._categoricals
1943
+
1944
+ @property
1945
+ def non_validated(self) -> dict[str, dict[str, list[str]]]: # type: ignore
1946
+ """Return the non-validated features and labels."""
1947
+ if self._non_validated is None:
1948
+ raise ValidationError("Please run validate() first!")
1949
+ return self._non_validated
1950
+
1951
+ def _verify_accessor_exists(self, accessors: Iterable[str]) -> None:
1952
+ """Verify that the accessors exist (either a valid table or in attrs)."""
1953
+ for acc in accessors:
1954
+ is_present = False
1955
+ try:
1956
+ self._sdata.get_attrs(key=acc)
1957
+ is_present = True
1958
+ except KeyError:
1959
+ if acc in self._sdata.tables.keys():
1960
+ is_present = True
1961
+ if not is_present:
1962
+ raise ValidationError(f"Accessor '{acc}' does not exist!")
1963
+
1964
+ def lookup(self, public: bool = False) -> CurateLookup:
1965
+ """Look up categories.
1966
+
1967
+ Args:
1968
+ public: Whether the lookup is performed on the public reference.
1969
+ """
1970
+ cat_values_dict = list(self.categoricals.values())[0]
1971
+ return CurateLookup(
1972
+ categoricals=cat_values_dict,
1973
+ slots={"accessors": cat_values_dict.keys()},
1974
+ public=public,
1975
+ )
1976
+
1977
+ def _update_registry_all(self) -> None:
1978
+ """Saves labels of all features for sample and table metadata."""
1979
+ if self._sample_df_curator is not None:
1980
+ self._sample_df_curator._update_registry_all(
1981
+ validated_only=True,
1982
+ )
1983
+ for _, adata_curator in self._table_adata_curators.items():
1984
+ adata_curator._obs_df_curator._update_registry_all(
1985
+ validated_only=True,
1986
+ )
1987
+
1988
+ def add_new_from_var_index(self, table: str, **kwargs) -> None:
1989
+ """Save new values from ``.var.index`` of table.
1990
+
1991
+ Args:
1992
+ table: The table key.
1993
+ organism: The organism name.
1994
+ **kwargs: Additional keyword arguments to pass to create new records.
1995
+ """
1996
+ if self._non_validated is None:
1997
+ raise ValidationError("Run .validate() first.")
1998
+ self._table_adata_curators[table].add_new_from_var_index(**kwargs)
1999
+ if table in self.non_validated.keys():
2000
+ if "var_index" in self._non_validated[table]:
2001
+ self._non_validated[table].pop("var_index")
2002
+
2003
+ if len(self.non_validated[table].values()) == 0:
2004
+ self.non_validated.pop(table)
2005
+
2006
+ def add_new_from(
2007
+ self,
2008
+ key: str,
2009
+ accessor: str | None = None,
2010
+ **kwargs,
2011
+ ) -> None:
2012
+ """Save new values of categorical from sample level metadata or table.
2013
+
2014
+ Args:
2015
+ key: The key referencing the slot in the DataFrame.
2016
+ accessor: The accessor key such as 'sample' or 'table x'.
2017
+ organism: The organism name.
2018
+ **kwargs: Additional keyword arguments to pass to create new records.
2019
+ """
2020
+ if self._non_validated is None:
2021
+ raise ValidationError("Run .validate() first.")
2022
+
2023
+ if len(kwargs) > 0 and key == "all":
2024
+ raise ValueError("Cannot pass additional arguments to 'all' key!")
2025
+
2026
+ if accessor not in self.categoricals:
2027
+ raise ValueError(
2028
+ f"Accessor {accessor} is not in 'categoricals'. Include it when creating the SpatialDataCatManager."
2029
+ )
2030
+
2031
+ if accessor in self._table_adata_curators:
2032
+ adata_curator = self._table_adata_curators[accessor]
2033
+ adata_curator.add_new_from(key=key, **kwargs)
2034
+ if accessor == self._sample_metadata_key:
2035
+ self._sample_df_curator.add_new_from(key=key, **kwargs)
2036
+
2037
+ if accessor in self.non_validated.keys():
2038
+ if len(self.non_validated[accessor].values()) == 0:
2039
+ self.non_validated.pop(accessor)
2040
+
2041
+ def standardize(self, key: str, accessor: str | None = None) -> None:
2042
+ """Replace synonyms with canonical values.
2043
+
2044
+ Modifies the dataset inplace.
2045
+
2046
+ Args:
2047
+ key: The key referencing the slot in the table or sample metadata.
2048
+ accessor: The accessor key such as 'sample_key' or 'table_key'.
2049
+ """
2050
+ if len(self.non_validated) == 0:
2051
+ logger.warning("values are already standardized")
2052
+ return
2053
+ if self._artifact is not None:
2054
+ raise RuntimeError("can't mutate the dataset when an artifact is passed!")
2055
+
2056
+ if accessor == self._sample_metadata_key:
2057
+ if key not in self._sample_metadata.columns:
2058
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
2059
+ else:
2060
+ if (
2061
+ key == "var_index" and self._sdata.tables[accessor].var.index is None
2062
+ ) or (
2063
+ key != "var_index"
2064
+ and key not in self._sdata.tables[accessor].obs.columns
2065
+ ):
2066
+ raise ValueError(f"key '{key}' not present in '{accessor}'!")
2067
+
2068
+ if accessor in self._table_adata_curators.keys():
2069
+ adata_curator = self._table_adata_curators[accessor]
2070
+ adata_curator.standardize(key)
2071
+ if accessor == self._sample_metadata_key:
2072
+ self._sample_df_curator.standardize(key)
2073
+
2074
+ if len(self.non_validated[accessor].values()) == 0:
2075
+ self.non_validated.pop(accessor)
2076
+
2077
+ def validate(self) -> bool:
2078
+ """Validate variables and categorical observations.
2079
+
2080
+ This method also registers the validated records in the current instance:
2081
+ - from public sources
2082
+
2083
+ Args:
2084
+ organism: The organism name.
2085
+
2086
+ Returns:
2087
+ Whether the SpatialData object is validated.
2088
+ """
2089
+ from lamindb.core._settings import settings
2090
+
2091
+ # add all validated records to the current instance
2092
+ verbosity = settings.verbosity
2093
+ try:
2094
+ settings.verbosity = "error"
2095
+ self._update_registry_all()
2096
+ finally:
2097
+ settings.verbosity = verbosity
2098
+
2099
+ self._non_validated = {} # type: ignore
2100
+
2101
+ sample_validated = True
2102
+ if self._sample_df_curator:
2103
+ logger.info(f"validating categoricals of '{self._sample_metadata_key}' ...")
2104
+ sample_validated &= self._sample_df_curator.validate()
2105
+ if len(self._sample_df_curator.non_validated) > 0:
2106
+ self._non_validated["sample"] = self._sample_df_curator.non_validated # type: ignore
2107
+ logger.print("")
2108
+
2109
+ mods_validated = True
2110
+ for table, adata_curator in self._table_adata_curators.items():
2111
+ logger.info(f"validating categoricals of table '{table}' ...")
2112
+ mods_validated &= adata_curator.validate()
2113
+ if len(adata_curator.non_validated) > 0:
2114
+ self._non_validated[table] = adata_curator.non_validated # type: ignore
2115
+ logger.print("")
2116
+
2117
+ self._is_validated = sample_validated & mods_validated
2118
+ return self._is_validated
2119
+
2120
+ def save_artifact(
2121
+ self,
2122
+ *,
2123
+ key: str | None = None,
2124
+ description: str | None = None,
2125
+ revises: Artifact | None = None,
2126
+ run: Run | None = None,
2127
+ ) -> Artifact:
2128
+ if not self._is_validated:
2129
+ self.validate()
2130
+ if not self._is_validated:
2131
+ raise ValidationError("Dataset does not validate. Please curate.")
2132
+
2133
+ verbosity = settings.verbosity
2134
+ try:
2135
+ settings.verbosity = "warning"
2136
+
2137
+ if self._artifact is None:
2138
+ # Write the SpatialData object to a random path in tmp directory
2139
+ # The Artifact constructor will move it to the cache
2140
+ write_path = (
2141
+ f"{settings.cache_dir}/{random.randint(10**7, 10**8 - 1)}.zarr"
2142
+ )
2143
+ self._sdata.write(write_path)
2144
+
2145
+ # Create the Artifact and associate Artifact metadata
2146
+ self._artifact = Artifact(
2147
+ write_path,
2148
+ description=description,
2149
+ key=key,
2150
+ revises=revises,
2151
+ run=run,
2152
+ )
2153
+ # According to Tim it is not easy to calculate the number of observations.
2154
+ # We would have to write custom code to iterate over labels (which might not even exist at that point)
2155
+ self._artifact.otype = "spatialdata"
2156
+ self._artifact.save()
2157
+
2158
+ # Link schemas
2159
+ feature_kwargs = check_registry_organism(
2160
+ (list(self._var_fields.values())[0].field.model),
2161
+ self._organism,
2162
+ )
2163
+
2164
+ def _add_set_from_spatialdata(
2165
+ host: Artifact | Collection | Run,
2166
+ var_fields: dict[str, FieldAttr],
2167
+ obs_fields: dict[str, FieldAttr] = None,
2168
+ mute: bool = False,
2169
+ organism: str | Record | None = None,
2170
+ ):
2171
+ """Add Schemas from SpatialData."""
2172
+ if obs_fields is None:
2173
+ obs_fields = {}
2174
+ assert host.otype == "spatialdata" # noqa: S101
2175
+
2176
+ feature_sets = {}
2177
+
2178
+ # sample features
2179
+ sample_features = Feature.from_values(self._sample_metadata.columns) # type: ignore
2180
+ if len(sample_features) > 0:
2181
+ feature_sets[self._sample_metadata_key] = Schema(
2182
+ features=sample_features
2183
+ )
2184
+
2185
+ # table features
2186
+ for table, field in var_fields.items():
2187
+ table_fs = parse_staged_feature_sets_from_anndata(
2188
+ self._sdata[table],
2189
+ var_field=field,
2190
+ obs_field=obs_fields.get(table, Feature.name),
2191
+ mute=mute,
2192
+ organism=organism,
2193
+ )
2194
+ for k, v in table_fs.items():
2195
+ feature_sets[f"['{table}'].{k}"] = v
2196
+
2197
+ def _unify_staged_feature_sets_by_hash(
2198
+ feature_sets: MutableMapping[str, Schema],
2199
+ ):
2200
+ unique_values: dict[str, Any] = {}
2201
+
2202
+ for key, value in feature_sets.items():
2203
+ value_hash = (
2204
+ value.hash
2205
+ ) # Assuming each value has a .hash attribute
2206
+ if value_hash in unique_values:
2207
+ feature_sets[key] = unique_values[value_hash]
2208
+ else:
2209
+ unique_values[value_hash] = value
2210
+
2211
+ return feature_sets
2212
+
2213
+ # link feature sets
2214
+ host._staged_feature_sets = _unify_staged_feature_sets_by_hash(
2215
+ feature_sets
2216
+ )
2217
+ host.save()
2218
+
2219
+ _add_set_from_spatialdata(
2220
+ self._artifact, var_fields=self._var_fields, **feature_kwargs
2221
+ )
2222
+
2223
+ # Link labels
2224
+ def _add_labels_from_spatialdata(
2225
+ data,
2226
+ artifact: Artifact,
2227
+ fields: dict[str, FieldAttr],
2228
+ feature_ref_is_name: bool | None = None,
2229
+ ):
2230
+ """Add Labels from SpatialData."""
2231
+ features = Feature.lookup().dict()
2232
+ for key, field in fields.items():
2233
+ feature = features.get(key)
2234
+ registry = field.field.model
2235
+ filter_kwargs = check_registry_organism(registry, self._organism)
2236
+ filter_kwargs_current = get_current_filter_kwargs(
2237
+ registry, filter_kwargs
2238
+ )
2239
+ df = data if isinstance(data, pd.DataFrame) else data.obs
2240
+ labels = registry.from_values(
2241
+ df[key],
2242
+ field=field,
2243
+ **filter_kwargs_current,
2244
+ )
2245
+ if len(labels) == 0:
2246
+ continue
2247
+
2248
+ label_ref_is_name = None
2249
+ if hasattr(registry, "_name_field"):
2250
+ label_ref_is_name = field.field.name == registry._name_field
2251
+ add_labels(
2252
+ artifact,
2253
+ records=labels,
2254
+ feature=feature,
2255
+ feature_ref_is_name=feature_ref_is_name,
2256
+ label_ref_is_name=label_ref_is_name,
2257
+ from_curator=True,
2258
+ )
2259
+
2260
+ for accessor, accessor_fields in self._categoricals.items():
2261
+ column_field = self._var_fields.get(accessor)
2262
+ if accessor == self._sample_metadata_key:
2263
+ _add_labels_from_spatialdata(
2264
+ self._sample_metadata,
2265
+ self._artifact,
2266
+ accessor_fields,
2267
+ feature_ref_is_name=(
2268
+ None if column_field is None else _ref_is_name(column_field)
2269
+ ),
2270
+ )
2271
+ else:
2272
+ _add_labels_from_spatialdata(
2273
+ self._sdata.tables[accessor],
2274
+ self._artifact,
2275
+ accessor_fields,
2276
+ feature_ref_is_name=(
2277
+ None if column_field is None else _ref_is_name(column_field)
2278
+ ),
2279
+ )
2280
+
2281
+ finally:
2282
+ settings.verbosity = verbosity
2283
+
2284
+ slug = ln_setup.settings.instance.slug
2285
+ if ln_setup.settings.instance.is_remote: # pragma: no cover
2286
+ logger.important(
2287
+ f"go to https://lamin.ai/{slug}/artifact/{self._artifact.uid}"
2288
+ )
2289
+
2290
+ return self._artifact
2291
+
2292
+
2293
+ def _restrict_obs_fields(
2294
+ obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
2295
+ ) -> dict[str, str]:
2296
+ """Restrict the obs fields to name return only available obs fields.
2297
+
2298
+ To simplify the curation, we only validate against either name or ontology_id.
2299
+ If both are available, we validate against ontology_id.
2300
+ If none are available, we validate against name.
2301
+ """
2302
+ obs_fields_unique = {k: v for k, v in obs_fields.items() if k in obs.columns}
2303
+ for name, field in obs_fields.items():
2304
+ if name.endswith("_ontology_term_id"):
2305
+ continue
2306
+ # if both the ontology id and the name are present, only validate on the ontology_id
2307
+ if name in obs.columns and f"{name}_ontology_term_id" in obs.columns:
2308
+ obs_fields_unique.pop(name)
2309
+ # if the neither name nor ontology id are present, validate on the name
2310
+ # this will raise error downstream, we just use name to be more readable
2311
+ if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2312
+ obs_fields_unique[name] = field
2313
+
2314
+ # Only retain obs_fields_unique that have keys in adata.obs.columns
2315
+ available_obs_fields = {
2316
+ k: v for k, v in obs_fields_unique.items() if k in obs.columns
2317
+ }
2318
+
2319
+ return available_obs_fields
2320
+
2321
+
2322
+ def _add_defaults_to_obs(
2323
+ obs: pd.DataFrame,
2324
+ defaults: dict[str, str],
2325
+ ) -> None:
2326
+ """Add default columns and values to obs DataFrame."""
2327
+ added_defaults: dict = {}
2328
+ for name, default in defaults.items():
2329
+ if name not in obs.columns and f"{name}_ontology_term_id" not in obs.columns:
2330
+ obs[name] = default
2331
+ added_defaults[name] = default
2332
+ logger.important(
2333
+ f"added default value '{default}' to the adata.obs['{name}']"
2334
+ )
2335
+
2336
+
2337
+ class CellxGeneAnnDataCatManager(AnnDataCatManager):
2338
+ """Annotation flow of AnnData based on CELLxGENE schema."""
2339
+
2340
+ _controls_were_created: bool | None = None
2341
+
2342
+ def __init__(
2343
+ self,
2344
+ adata: ad.AnnData | UPathStr,
2345
+ categoricals: dict[str, FieldAttr] | None = None,
2346
+ organism: Literal["human", "mouse"] = "human",
2347
+ *,
2348
+ defaults: dict[str, str] = None,
2349
+ extra_sources: dict[str, Record] = None,
2350
+ schema_version: Literal["4.0.0", "5.0.0", "5.1.0"] = "5.1.0",
2351
+ verbosity: str = "hint",
2352
+ ) -> None:
2353
+ """CELLxGENE schema curator.
2354
+
2355
+ Args:
2356
+ adata: Path to or AnnData object to curate against the CELLxGENE schema.
2357
+ categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
2358
+ The CELLxGENE Curator maps against the required CELLxGENE fields by default.
2359
+ organism: The organism name. CELLxGENE restricts it to 'human' and 'mouse'.
2360
+ defaults: Default values that are set if columns or column values are missing.
2361
+ extra_sources: A dictionary mapping ``.obs.columns`` to Source records.
2362
+ These extra sources are joined with the CELLxGENE fixed sources.
2363
+ Use this parameter when subclassing.
2364
+ exclude: A dictionary mapping column names to values to exclude.
2365
+ schema_version: The CELLxGENE schema version to curate against.
2366
+ verbosity: The verbosity level.
2367
+
2368
+ """
2369
+ import bionty as bt
2370
+
2371
+ CellxGeneAnnDataCatManager._init_categoricals_additional_values()
2372
+
2373
+ var_index: FieldAttr = bt.Gene.ensembl_gene_id
2374
+
2375
+ if categoricals is None:
2376
+ categoricals = CellxGeneAnnDataCatManager._get_categoricals()
2377
+
2378
+ self.organism = organism
2379
+
2380
+ VALID_SCHEMA_VERSIONS = {"4.0.0", "5.0.0", "5.1.0"}
2381
+ if schema_version not in VALID_SCHEMA_VERSIONS:
2382
+ valid_versions = ", ".join(sorted(VALID_SCHEMA_VERSIONS))
2383
+ raise ValueError(
2384
+ f"Invalid schema_version: {schema_version}. "
2385
+ f"Valid versions are: {valid_versions}"
2386
+ )
2387
+ self.schema_version = schema_version
2388
+ self.schema_reference = f"https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/{schema_version}/schema.md"
2389
+ with resources.path(
2390
+ "lamindb.curators._cellxgene_schemas", "schema_versions.yml"
2391
+ ) as schema_versions_path:
2392
+ self._pinned_ontologies = _read_schema_versions(schema_versions_path)[
2393
+ self.schema_version
2394
+ ]
2395
+
2396
+ # Fetch AnnData obs to be able to set defaults and get sources
2397
+ if isinstance(adata, ad.AnnData):
2398
+ self._adata_obs = adata.obs
2399
+ else:
2400
+ self._adata_obs = backed_access(upath.create_path(adata)).obs # type: ignore
2401
+
2402
+ # Add defaults first to ensure that we fetch valid sources
2403
+ if defaults:
2404
+ _add_defaults_to_obs(self._adata_obs, defaults)
2405
+
2406
+ self.sources = self._create_sources(self._adata_obs)
2407
+ self.sources = {
2408
+ entity: source
2409
+ for entity, source in self.sources.items()
2410
+ if source is not None
2411
+ }
2412
+
2413
+ # These sources are not a part of the cellxgene schema but rather passed through.
2414
+ # This is useful when other Curators extend the CELLxGENE curator
2415
+ if extra_sources:
2416
+ self.sources = self.sources | extra_sources
2417
+
2418
+ # Exclude default values from validation because they are not available in the pinned sources
2419
+ exclude_keys = {
2420
+ entity: default
2421
+ for entity, default in CellxGeneAnnDataCatManager._get_categoricals_defaults().items()
2422
+ if entity in self._adata_obs.columns # type: ignore
2423
+ }
2424
+
2425
+ super().__init__(
2426
+ data=adata,
2427
+ var_index=var_index,
2428
+ categoricals=_restrict_obs_fields(self._adata_obs, categoricals),
2429
+ verbosity=verbosity,
2430
+ organism=organism,
2431
+ sources=self.sources,
2432
+ exclude=exclude_keys,
2433
+ )
2434
+
2435
+ @classmethod
2436
+ def _init_categoricals_additional_values(cls) -> None:
2437
+ import bionty as bt
2438
+
2439
+ import lamindb as ln
2440
+
2441
+ # Note: if you add another control below, be mindful to change the if condition that
2442
+ # triggers whether creating these records is re-considered
2443
+ if cls._controls_were_created is None:
2444
+ cls._controls_were_created = (
2445
+ ln.ULabel.filter(name="SuspensionType", is_type=True).one_or_none()
2446
+ is not None
2447
+ )
2448
+ if not cls._controls_were_created:
2449
+ logger.important("Creating control labels in the CellxGene schema.")
2450
+ bt.CellType(
2451
+ ontology_id="unknown",
2452
+ name="unknown",
2453
+ description="From CellxGene schema.",
2454
+ ).save()
2455
+ pato = bt.Source.filter(name="pato", version="2024-03-28").one()
2456
+ normal = bt.Phenotype.from_source(ontology_id="PATO:0000461", source=pato)
2457
+ bt.Disease(
2458
+ uid=normal.uid,
2459
+ name=normal.name,
2460
+ ontology_id=normal.ontology_id,
2461
+ description=normal.description,
2462
+ source=normal.source,
2463
+ ).save()
2464
+ bt.Ethnicity(
2465
+ ontology_id="na", name="na", description="From CellxGene schema."
2466
+ ).save()
2467
+ bt.Ethnicity(
2468
+ ontology_id="unknown",
2469
+ name="unknown",
2470
+ description="From CellxGene schema.",
2471
+ ).save()
2472
+ bt.DevelopmentalStage(
2473
+ ontology_id="unknown",
2474
+ name="unknown",
2475
+ description="From CellxGene schema.",
2476
+ ).save()
2477
+ bt.Phenotype(
2478
+ ontology_id="unknown",
2479
+ name="unknown",
2480
+ description="From CellxGene schema.",
2481
+ ).save()
2482
+
2483
+ tissue_type = ln.ULabel(
2484
+ name="TissueType",
2485
+ is_type=True,
2486
+ description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
2487
+ ).save()
2488
+ ln.ULabel(
2489
+ name="tissue", type=tissue_type, description="From CellxGene schema."
2490
+ ).save()
2491
+ ln.ULabel(
2492
+ name="organoid", type=tissue_type, description="From CellxGene schema."
2493
+ ).save()
2494
+ ln.ULabel(
2495
+ name="cell culture",
2496
+ type=tissue_type,
2497
+ description="From CellxGene schema.",
2498
+ ).save()
2499
+
2500
+ suspension_type = ln.ULabel(
2501
+ name="SuspensionType",
2502
+ is_type=True,
2503
+ description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
2504
+ ).save()
2505
+ ln.ULabel(
2506
+ name="cell", type=suspension_type, description="From CellxGene schema."
2507
+ ).save()
2508
+ ln.ULabel(
2509
+ name="nucleus",
2510
+ type=suspension_type,
2511
+ description="From CellxGene schema.",
2512
+ ).save()
2513
+ ln.ULabel(name="na", type=suspension_type).save()
2514
+
2515
+ @classmethod
2516
+ def _get_categoricals(cls) -> dict[str, FieldAttr]:
2517
+ import bionty as bt
2518
+
2519
+ return {
2520
+ "assay": bt.ExperimentalFactor.name,
2521
+ "assay_ontology_term_id": bt.ExperimentalFactor.ontology_id,
2522
+ "cell_type": bt.CellType.name,
2523
+ "cell_type_ontology_term_id": bt.CellType.ontology_id,
2524
+ "development_stage": bt.DevelopmentalStage.name,
2525
+ "development_stage_ontology_term_id": bt.DevelopmentalStage.ontology_id,
2526
+ "disease": bt.Disease.name,
2527
+ "disease_ontology_term_id": bt.Disease.ontology_id,
2528
+ # "donor_id": "str", via pandera
2529
+ "self_reported_ethnicity": bt.Ethnicity.name,
2530
+ "self_reported_ethnicity_ontology_term_id": bt.Ethnicity.ontology_id,
2531
+ "sex": bt.Phenotype.name,
2532
+ "sex_ontology_term_id": bt.Phenotype.ontology_id,
2533
+ "suspension_type": ULabel.name,
2534
+ "tissue": bt.Tissue.name,
2535
+ "tissue_ontology_term_id": bt.Tissue.ontology_id,
2536
+ "tissue_type": ULabel.name,
2537
+ "organism": bt.Organism.name,
2538
+ "organism_ontology_term_id": bt.Organism.ontology_id,
2539
+ }
2540
+
2541
+ @classmethod
2542
+ def _get_categoricals_defaults(cls) -> dict[str, str]:
2543
+ return {
2544
+ "cell_type": "unknown",
2545
+ "development_stage": "unknown",
2546
+ "disease": "normal",
2547
+ "donor_id": "unknown",
2548
+ "self_reported_ethnicity": "unknown",
2549
+ "sex": "unknown",
2550
+ "suspension_type": "cell",
2551
+ "tissue_type": "tissue",
2552
+ }
2553
+
2554
+ @property
2555
+ def pinned_ontologies(self) -> pd.DataFrame:
2556
+ return self._pinned_ontologies
2557
+
2558
+ @property
2559
+ def adata(self) -> AnnData:
2560
+ return self._adata
2561
+
2562
+ def _create_sources(self, obs: pd.DataFrame) -> dict[str, Record]:
2563
+ """Creates a sources dictionary that can be passed to AnnDataCatManager."""
2564
+ import bionty as bt
2565
+
2566
+ # fmt: off
2567
+ def _fetch_bionty_source(
2568
+ entity: str, organism: str, source: str
2569
+ ) -> bt.Source | None:
2570
+ """Fetch the Bionty source of the pinned ontology.
2571
+
2572
+ Returns None if the source does not exist.
2573
+ """
2574
+ version = self._pinned_ontologies.loc[(self._pinned_ontologies.index == entity) &
2575
+ (self._pinned_ontologies["organism"] == organism) &
2576
+ (self._pinned_ontologies["source"] == source), "version"].iloc[0]
2577
+ return bt.Source.filter(organism=organism, entity=f"bionty.{entity}", version=version).first()
2578
+
2579
+ entity_mapping = {
2580
+ "var_index": ("Gene", self.organism, "ensembl"),
2581
+ "cell_type": ("CellType", "all", "cl"),
2582
+ "assay": ("ExperimentalFactor", "all", "efo"),
2583
+ "self_reported_ethnicity": ("Ethnicity", self.organism, "hancestro"),
2584
+ "development_stage": ("DevelopmentalStage", self.organism, "hsapdv" if self.organism == "human" else "mmusdv"),
2585
+ "disease": ("Disease", "all", "mondo"),
2586
+ # "organism": ("Organism", "vertebrates", "ensembl"),
2587
+ "sex": ("Phenotype", "all", "pato"),
2588
+ "tissue": ("Tissue", "all", "uberon"),
2589
+ }
2590
+ # fmt: on
2591
+
2592
+ # Retain var_index and one of 'entity'/'entity_ontology_term_id' that is present in obs
2593
+ entity_to_sources = {
2594
+ entity: _fetch_bionty_source(*params)
2595
+ for entity, params in entity_mapping.items()
2596
+ if entity in obs.columns
2597
+ or (f"{entity}_ontology_term_id" in obs.columns and entity != "var_index")
2598
+ or entity == "var_index"
2599
+ }
2600
+
2601
+ return entity_to_sources
2602
+
2603
+ def _convert_name_to_ontology_id(self, values: pd.Series, field: FieldAttr):
2604
+ """Converts a column that stores a name into a column that stores the ontology id.
2605
+
2606
+ cellxgene expects the obs columns to be {entity}_ontology_id columns and disallows {entity} columns.
2607
+ """
2608
+ field_name = field.field.name
2609
+ assert field_name == "name" # noqa: S101
2610
+ cols = ["name", "ontology_id"]
2611
+ registry = field.field.model
2612
+
2613
+ if hasattr(registry, "ontology_id"):
2614
+ validated_records = registry.filter(**{f"{field_name}__in": values})
2615
+ mapper = (
2616
+ pd.DataFrame(validated_records.values_list(*cols))
2617
+ .set_index(0)
2618
+ .to_dict()[1]
2619
+ )
2620
+ return values.map(mapper)
2621
+
2622
+ def validate(self) -> bool: # type: ignore
2623
+ """Validates the AnnData object against most cellxgene requirements."""
2624
+ # Verify that all required obs columns are present
2625
+ missing_obs_fields = [
2626
+ name
2627
+ for name in CellxGeneAnnDataCatManager._get_categoricals_defaults().keys()
2628
+ if name not in self._adata.obs.columns
2629
+ and f"{name}_ontology_term_id" not in self._adata.obs.columns
2630
+ ]
2631
+ if len(missing_obs_fields) > 0:
2632
+ missing_obs_fields_str = ", ".join(list(missing_obs_fields))
2633
+ logger.error(f"missing required obs columns {missing_obs_fields_str}")
2634
+ logger.info(
2635
+ "consider initializing a Curate object like 'Curate(adata, defaults=cxg.CellxGeneAnnDataCatManager._get_categoricals_defaults())'"
2636
+ "to automatically add these columns with default values."
2637
+ )
2638
+ return False
2639
+
2640
+ # Verify that no cellxgene reserved names are present
2641
+ reserved_names = {
2642
+ "ethnicity",
2643
+ "ethnicity_ontology_term_id",
2644
+ "X_normalization",
2645
+ "default_field",
2646
+ "layer_descriptions",
2647
+ "tags",
2648
+ "versions",
2649
+ "contributors",
2650
+ "preprint_doi",
2651
+ "project_description",
2652
+ "project_links",
2653
+ "project_name",
2654
+ "publication_doi",
2655
+ }
2656
+ matched_columns = [
2657
+ column for column in self._adata.obs.columns if column in reserved_names
2658
+ ]
2659
+ if len(matched_columns) > 0:
2660
+ raise ValueError(
2661
+ f"AnnData object must not contain obs columns {matched_columns} which are"
2662
+ " reserved from previous schema versions."
2663
+ )
1571
2664
 
1572
- feature_ref_is_name = _ref_is_name(self._columns_field)
1573
- features = Feature.lookup().dict()
1574
- for key, field in self._obs_fields.items():
1575
- feature = features.get(key)
1576
- registry = field.field.model
1577
- organism = check_registry_organism(field.field.model, self._organism).get(
1578
- "organism"
2665
+ return super().validate()
2666
+
2667
+ def to_cellxgene_anndata(
2668
+ self, is_primary_data: bool, title: str | None = None
2669
+ ) -> ad.AnnData:
2670
+ """Converts the AnnData object to the cellxgene-schema input format.
2671
+
2672
+ cellxgene expects the obs fields to be {entity}_ontology_id fields and has many further requirements which are
2673
+ documented here: https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema.
2674
+ This function checks for most but not all requirements of the CELLxGENE schema.
2675
+ If you want to ensure that it fully adheres to the CELLxGENE schema, run `cellxgene-schema` on the AnnData object.
2676
+
2677
+ Args:
2678
+ is_primary_data: Whether the measured data is primary data or not.
2679
+ title: Title of the AnnData object. Commonly the name of the publication.
2680
+
2681
+ Returns:
2682
+ An AnnData object which adheres to the cellxgene-schema.
2683
+ """
2684
+ # Create a copy since we modify the AnnData object extensively
2685
+ adata_cxg = self._adata.copy()
2686
+
2687
+ # cellxgene requires an embedding
2688
+ embedding_pattern = r"^[a-zA-Z][a-zA-Z0-9_.-]*$"
2689
+ exclude_key = "spatial"
2690
+ matching_keys = [
2691
+ key
2692
+ for key in adata_cxg.obsm.keys()
2693
+ if re.match(embedding_pattern, key) and key != exclude_key
2694
+ ]
2695
+ if len(matching_keys) == 0:
2696
+ raise ValueError(
2697
+ "Unable to find an embedding key. Please calculate an embedding."
1579
2698
  )
1580
- labels = registry.from_values(
1581
- values=self._validated_values[key], field=field, organism=organism
2699
+
2700
+ # convert name column to ontology_term_id column
2701
+ for column in adata_cxg.obs.columns:
2702
+ if column in self.categoricals and not column.endswith("_ontology_term_id"):
2703
+ mapped_column = self._convert_name_to_ontology_id(
2704
+ adata_cxg.obs[column], field=self.categoricals.get(column)
2705
+ )
2706
+ if mapped_column is not None:
2707
+ adata_cxg.obs[f"{column}_ontology_term_id"] = mapped_column
2708
+
2709
+ # drop the name columns for ontologies. cellxgene does not allow them.
2710
+ drop_columns = [
2711
+ i
2712
+ for i in adata_cxg.obs.columns
2713
+ if f"{i}_ontology_term_id" in adata_cxg.obs.columns
2714
+ ]
2715
+ adata_cxg.obs.drop(columns=drop_columns, inplace=True)
2716
+
2717
+ # Add cellxgene metadata to AnnData object
2718
+ if "is_primary_data" not in adata_cxg.obs.columns:
2719
+ adata_cxg.obs["is_primary_data"] = is_primary_data
2720
+ if "feature_is_filtered" not in adata_cxg.var.columns:
2721
+ logger.warn(
2722
+ "column 'feature_is_filtered' not present in var. Setting to default"
2723
+ " value of False."
1582
2724
  )
1583
- if len(labels) == 0:
1584
- continue
1585
- if hasattr(registry, "_name_field"):
1586
- label_ref_is_name = field.field.name == registry._name_field
1587
- add_labels(
1588
- artifact,
1589
- records=labels,
1590
- feature=feature,
1591
- feature_ref_is_name=feature_ref_is_name,
1592
- label_ref_is_name=label_ref_is_name,
1593
- from_curator=True,
2725
+ adata_cxg.var["feature_is_filtered"] = False
2726
+ if title is None:
2727
+ raise ValueError("please pass a title!")
2728
+ else:
2729
+ adata_cxg.uns["title"] = title
2730
+ adata_cxg.uns["cxg_lamin_schema_reference"] = self.schema_reference
2731
+ adata_cxg.uns["cxg_lamin_schema_version"] = self.schema_version
2732
+
2733
+ return adata_cxg
2734
+
2735
+
2736
+ class ValueUnit:
2737
+ """Base class for handling value-unit combinations."""
2738
+
2739
+ @staticmethod
2740
+ def parse_value_unit(value: str, is_dose: bool = True) -> tuple[str, str] | None:
2741
+ """Parse a string containing a value and unit into a tuple."""
2742
+ if not isinstance(value, str) or not value.strip():
2743
+ return None
2744
+
2745
+ value = str(value).strip()
2746
+ match = re.match(r"^(\d*\.?\d{0,1})\s*([a-zA-ZμµΜ]+)$", value)
2747
+
2748
+ if not match:
2749
+ raise ValueError(
2750
+ f"Invalid format: {value}. Expected format: number with max 1 decimal place + unit"
2751
+ )
2752
+
2753
+ number, unit = match.groups()
2754
+ formatted_number = f"{float(number):.1f}"
2755
+
2756
+ if is_dose:
2757
+ standardized_unit = DoseHandler.standardize_unit(unit)
2758
+ if not DoseHandler.validate_unit(standardized_unit):
2759
+ raise ValueError(
2760
+ f"Invalid dose unit: {unit}. Must be convertible to one of: nM, μM, mM, M"
2761
+ )
2762
+ else:
2763
+ standardized_unit = TimeHandler.standardize_unit(unit)
2764
+ if not TimeHandler.validate_unit(standardized_unit):
2765
+ raise ValueError(
2766
+ f"Invalid time unit: {unit}. Must be convertible to one of: h, m, s, d, y"
1594
2767
  )
1595
2768
 
1596
- return artifact.save()
2769
+ return formatted_number, standardized_unit
1597
2770
 
1598
2771
 
1599
- class Curator(BaseCurator):
1600
- """Dataset curator.
2772
+ class DoseHandler:
2773
+ """Handler for dose-related operations."""
1601
2774
 
1602
- A `Curator` object makes it easy to save validated & annotated artifacts.
2775
+ VALID_UNITS = {"nM", "μM", "µM", "mM", "M"}
2776
+ UNIT_MAP = {
2777
+ "nm": "nM",
2778
+ "NM": "nM",
2779
+ "um": "μM",
2780
+ "UM": "μM",
2781
+ "μm": "μM",
2782
+ "μM": "μM",
2783
+ "µm": "μM",
2784
+ "µM": "μM",
2785
+ "mm": "mM",
2786
+ "MM": "mM",
2787
+ "m": "M",
2788
+ "M": "M",
2789
+ }
1603
2790
 
1604
- Example:
2791
+ @classmethod
2792
+ def validate_unit(cls, unit: str) -> bool:
2793
+ """Validate if the dose unit is acceptable."""
2794
+ return unit in cls.VALID_UNITS
1605
2795
 
1606
- >>> curator = ln.Curator.from_df(
1607
- >>> df,
1608
- >>> # define validation criteria as mappings
1609
- >>> columns=ln.Feature.name, # map column names
1610
- >>> categoricals={"perturbation": ln.ULabel.name}, # map categories
1611
- >>> )
1612
- >>> curator.validate() # validate the data in df
1613
- >>> artifact = curator.save_artifact(description="my RNA-seq")
1614
- >>> artifact.describe() # see annotations
2796
+ @classmethod
2797
+ def standardize_unit(cls, unit: str) -> str:
2798
+ """Standardize dose unit to standard formats."""
2799
+ return cls.UNIT_MAP.get(unit, unit)
2800
+
2801
+ @classmethod
2802
+ def validate_values(cls, values: pd.Series) -> list:
2803
+ """Validate pert_dose values with strict case checking."""
2804
+ errors = []
1615
2805
 
1616
- `curator.validate()` maps values within `df` according to the mapping criteria and logs validated & problematic values.
2806
+ for idx, value in values.items():
2807
+ if pd.isna(value):
2808
+ continue
1617
2809
 
1618
- If you find non-validated values, you have several options:
2810
+ if isinstance(value, (int, float)):
2811
+ errors.append(
2812
+ f"Row {idx} - Missing unit for dose: {value}. Must include a unit (nM, μM, mM, M)"
2813
+ )
2814
+ continue
2815
+
2816
+ try:
2817
+ ValueUnit.parse_value_unit(value, is_dose=True)
2818
+ except ValueError as e:
2819
+ errors.append(f"Row {idx} - {str(e)}")
2820
+
2821
+ return errors
1619
2822
 
1620
- - new values found in the data can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`
1621
- - non-validated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and addressed manually
1622
- """
2823
+
2824
+ class TimeHandler:
2825
+ """Handler for time-related operations."""
2826
+
2827
+ VALID_UNITS = {"h", "m", "s", "d", "y"}
1623
2828
 
1624
2829
  @classmethod
1625
- @doc_args(DataFrameCurator.__doc__)
1626
- def from_df(
1627
- cls,
1628
- df: pd.DataFrame,
1629
- categoricals: dict[str, FieldAttr] | None = None,
1630
- columns: FieldAttr = Feature.name,
1631
- using_key: str | None = None,
1632
- verbosity: str = "hint",
1633
- organism: str | None = None,
1634
- ) -> DataFrameCurator:
1635
- """{}""" # noqa: D415
1636
- return DataFrameCurator(
1637
- df=df,
1638
- categoricals=categoricals,
1639
- columns=columns,
1640
- using_key=using_key,
1641
- verbosity=verbosity,
1642
- organism=organism,
1643
- )
2830
+ def validate_unit(cls, unit: str) -> bool:
2831
+ """Validate if the time unit is acceptable."""
2832
+ return unit == unit.lower() and unit in cls.VALID_UNITS
1644
2833
 
1645
2834
  @classmethod
1646
- @doc_args(AnnDataCurator.__doc__)
1647
- def from_anndata(
1648
- cls,
1649
- data: ad.AnnData | UPathStr,
1650
- var_index: FieldAttr,
1651
- categoricals: dict[str, FieldAttr] | None = None,
1652
- obs_columns: FieldAttr = Feature.name,
1653
- using_key: str | None = None,
1654
- verbosity: str = "hint",
1655
- organism: str | None = None,
1656
- sources: dict[str, Record] | None = None,
1657
- ) -> AnnDataCurator:
1658
- """{}""" # noqa: D415
1659
- return AnnDataCurator(
1660
- data=data,
1661
- var_index=var_index,
1662
- categoricals=categoricals,
1663
- obs_columns=obs_columns,
1664
- using_key=using_key,
1665
- verbosity=verbosity,
1666
- organism=organism,
1667
- sources=sources,
1668
- )
2835
+ def standardize_unit(cls, unit: str) -> str:
2836
+ """Standardize time unit to standard formats."""
2837
+ if unit.startswith("hr"):
2838
+ return "h"
2839
+ elif unit.startswith("min"):
2840
+ return "m"
2841
+ elif unit.startswith("sec"):
2842
+ return "s"
2843
+ return unit[0].lower()
1669
2844
 
1670
2845
  @classmethod
1671
- @doc_args(MuDataCurator.__doc__)
1672
- def from_mudata(
1673
- cls,
1674
- mdata: MuData,
1675
- var_index: dict[str, dict[str, FieldAttr]],
1676
- categoricals: dict[str, FieldAttr] | None = None,
1677
- using_key: str | None = None,
2846
+ def validate_values(cls, values: pd.Series) -> list:
2847
+ """Validate pert_time values."""
2848
+ errors = []
2849
+
2850
+ for idx, value in values.items():
2851
+ if pd.isna(value):
2852
+ continue
2853
+
2854
+ if isinstance(value, (int, float)):
2855
+ errors.append(
2856
+ f"Row {idx} - Missing unit for time: {value}. Must include a unit (h, m, s, d, y)"
2857
+ )
2858
+ continue
2859
+
2860
+ try:
2861
+ ValueUnit.parse_value_unit(value, is_dose=False)
2862
+ except ValueError as e:
2863
+ errors.append(f"Row {idx} - {str(e)}")
2864
+
2865
+ return errors
2866
+
2867
+
2868
+ class PertAnnDataCatManager(CellxGeneAnnDataCatManager):
2869
+ """Curator flow for Perturbation data."""
2870
+
2871
+ PERT_COLUMNS = {"compound", "genetic", "biologic", "physical"}
2872
+
2873
+ def __init__(
2874
+ self,
2875
+ adata: ad.AnnData,
2876
+ organism: Literal["human", "mouse"] = "human",
2877
+ pert_dose: bool = True,
2878
+ pert_time: bool = True,
2879
+ *,
1678
2880
  verbosity: str = "hint",
1679
- organism: str | None = None,
1680
- ) -> MuDataCurator:
1681
- """{}""" # noqa: D415
1682
- return MuDataCurator(
1683
- mdata=mdata,
1684
- var_index=var_index,
1685
- categoricals=categoricals,
1686
- using_key=using_key,
2881
+ cxg_schema_version: Literal["5.0.0", "5.1.0"] = "5.1.0",
2882
+ ):
2883
+ """Initialize the curator with configuration and validation settings."""
2884
+ import bionty as bt
2885
+
2886
+ self._pert_time = pert_time
2887
+ self._pert_dose = pert_dose
2888
+
2889
+ self._validate_initial_data(adata)
2890
+ self._setup_configuration(adata)
2891
+
2892
+ self._setup_sources(adata)
2893
+ self._setup_compound_source()
2894
+
2895
+ super().__init__(
2896
+ adata=adata,
2897
+ categoricals=self.PT_CATEGORICALS,
2898
+ defaults=self.PT_DEFAULT_VALUES,
1687
2899
  verbosity=verbosity,
1688
2900
  organism=organism,
2901
+ extra_sources=self.PT_SOURCES,
2902
+ schema_version=cxg_schema_version,
1689
2903
  )
1690
2904
 
1691
- @classmethod
1692
- @doc_args(SOMACurator.__doc__)
1693
- def from_tiledbsoma(
1694
- cls,
1695
- experiment_uri: UPathStr,
1696
- var_index: dict[str, tuple[str, FieldAttr]],
1697
- categoricals: dict[str, FieldAttr] | None = None,
1698
- obs_columns: FieldAttr = Feature.name,
1699
- using_key: str | None = None,
1700
- organism: str | None = None,
1701
- sources: dict[str, Record] | None = None,
1702
- exclude: dict[str, str | list[str]] | None = None,
1703
- ) -> SOMACurator:
1704
- """{}""" # noqa: D415
1705
- return SOMACurator(
1706
- experiment_uri=experiment_uri,
1707
- var_index=var_index,
1708
- categoricals=categoricals,
1709
- obs_columns=obs_columns,
1710
- using_key=using_key,
1711
- organism=organism,
1712
- sources=sources,
1713
- exclude=exclude,
2905
+ def _setup_configuration(self, adata: ad.AnnData):
2906
+ """Set up default configuration values."""
2907
+ import bionty as bt
2908
+ import wetlab as wl
2909
+
2910
+ self.PT_DEFAULT_VALUES = (
2911
+ CellxGeneAnnDataCatManager._get_categoricals_defaults()
2912
+ | {
2913
+ "cell_line": "unknown",
2914
+ "pert_target": "unknown",
2915
+ }
1714
2916
  )
1715
2917
 
1716
- @classmethod
1717
- def from_spatialdata(
1718
- cls,
1719
- sdata,
1720
- var_index: dict[str, FieldAttr],
1721
- categoricals: dict[str, dict[str, FieldAttr]] | None = None,
1722
- using_key: str | None = None,
1723
- organism: str | None = None,
1724
- sources: dict[str, dict[str, Record]] | None = None,
1725
- exclude: dict[str, dict] | None = None,
1726
- verbosity: str = "hint",
1727
- *,
1728
- sample_metadata_key: str = "sample",
1729
- ):
1730
- """Curation flow for a ``Spatialdata`` object.
2918
+ self.PT_CATEGORICALS = CellxGeneAnnDataCatManager._get_categoricals() | {
2919
+ k: v
2920
+ for k, v in {
2921
+ "cell_line": bt.CellLine.name,
2922
+ "pert_target": wl.PerturbationTarget.name,
2923
+ "pert_genetic": wl.GeneticPerturbation.name,
2924
+ "pert_compound": wl.Compound.name,
2925
+ "pert_biologic": wl.Biologic.name,
2926
+ "pert_physical": wl.EnvironmentalPerturbation.name,
2927
+ }.items()
2928
+ if k in adata.obs.columns
2929
+ }
2930
+ # if "donor_id" in self.PT_CATEGORICALS:
2931
+ # self.PT_CATEGORICALS["donor_id"] = Donor.name
2932
+
2933
+ def _setup_sources(self, adata: ad.AnnData):
2934
+ """Set up data sources."""
2935
+ self.PT_SOURCES = {}
2936
+ # if "cell_line" in adata.obs.columns:
2937
+ # self.PT_SOURCES["cell_line"] = (
2938
+ # bt.Source.filter(name="depmap").first()
2939
+ # )
2940
+ if "pert_compound" in adata.obs.columns:
2941
+ import bionty as bt
2942
+
2943
+ self.PT_SOURCES["pert_compound"] = bt.Source.filter(
2944
+ entity="wetlab.Compound", name="chebi"
2945
+ ).first()
2946
+
2947
+ def _validate_initial_data(self, adata: ad.AnnData):
2948
+ """Validate the initial data structure."""
2949
+ self._validate_required_columns(adata)
2950
+ self._validate_perturbation_types(adata)
2951
+
2952
+ def _validate_required_columns(self, adata: ad.AnnData):
2953
+ """Validate required columns are present."""
2954
+ if "pert_target" not in adata.obs.columns:
2955
+ if (
2956
+ "pert_name" not in adata.obs.columns
2957
+ or "pert_type" not in adata.obs.columns
2958
+ ):
2959
+ raise ValidationError(
2960
+ "either 'pert_target' or both 'pert_name' and 'pert_type' must be present"
2961
+ )
2962
+ else:
2963
+ if "pert_name" not in adata.obs.columns:
2964
+ logger.warning(
2965
+ "no 'pert' column found in adata.obs, will only curate 'pert_target'"
2966
+ )
2967
+ elif "pert_type" not in adata.obs.columns:
2968
+ raise ValidationError("both 'pert' and 'pert_type' must be present")
2969
+
2970
+ def _validate_perturbation_types(self, adata: ad.AnnData):
2971
+ """Validate perturbation types."""
2972
+ if "pert_type" in adata.obs.columns:
2973
+ data_pert_types = set(adata.obs["pert_type"].unique())
2974
+ invalid_pert_types = data_pert_types - self.PERT_COLUMNS
2975
+ if invalid_pert_types:
2976
+ raise ValidationError(
2977
+ f"invalid pert_type found: {invalid_pert_types}!\n"
2978
+ f" → allowed values: {self.PERT_COLUMNS}"
2979
+ )
2980
+ self._process_perturbation_types(adata, data_pert_types)
2981
+
2982
+ def _process_perturbation_types(self, adata: ad.AnnData, pert_types: set):
2983
+ """Process and map perturbation types."""
2984
+ for pert_type in pert_types:
2985
+ col_name = "pert_" + pert_type
2986
+ adata.obs[col_name] = adata.obs["pert_name"].where(
2987
+ adata.obs["pert_type"] == pert_type, None
2988
+ )
2989
+ if adata.obs[col_name].dtype.name == "category":
2990
+ adata.obs[col_name].cat.remove_unused_categories()
2991
+ logger.important(f"mapped 'pert_name' to '{col_name}'")
1731
2992
 
1732
- See also :class:`~lamindb.Curator`.
2993
+ def _setup_compound_source(self):
2994
+ """Set up the compound source with muted logging."""
2995
+ import bionty as bt
2996
+ import wetlab as wl
2997
+
2998
+ with logger.mute():
2999
+ chebi_source = bt.Source.filter(
3000
+ entity="wetlab.Compound", name="chebi"
3001
+ ).first()
3002
+ if not chebi_source:
3003
+ wl.Compound.add_source(
3004
+ bt.Source.filter(entity="Drug", name="chebi").first()
3005
+ )
1733
3006
 
1734
- Note that if genes or other measurements are removed from the SpatialData object,
1735
- the object should be recreated.
3007
+ def validate(self) -> bool: # type: ignore
3008
+ """Validate the AnnData object."""
3009
+ validated = super().validate()
3010
+
3011
+ if self._pert_dose:
3012
+ validated &= self._validate_dose_column()
3013
+ if self._pert_time:
3014
+ validated &= self._validate_time_column()
3015
+
3016
+ self._is_validated = validated
3017
+
3018
+ # sort columns
3019
+ first_columns = [
3020
+ "pert_target",
3021
+ "pert_genetic",
3022
+ "pert_compound",
3023
+ "pert_biologic",
3024
+ "pert_physical",
3025
+ "pert_dose",
3026
+ "pert_time",
3027
+ "organism",
3028
+ "cell_line",
3029
+ "cell_type",
3030
+ "disease",
3031
+ "tissue_type",
3032
+ "tissue",
3033
+ "assay",
3034
+ "suspension_type",
3035
+ "donor_id",
3036
+ "sex",
3037
+ "self_reported_ethnicity",
3038
+ "development_stage",
3039
+ "pert_name",
3040
+ "pert_type",
3041
+ ]
3042
+ sorted_columns = [
3043
+ col for col in first_columns if col in self._adata.obs.columns
3044
+ ] + [col for col in self._adata.obs.columns if col not in first_columns]
3045
+ # must assign to self._df to ensure .standardize works correctly
3046
+ self._obs_df = self._adata.obs[sorted_columns]
3047
+ self._adata.obs = self._obs_df
3048
+ return validated
3049
+
3050
+ def standardize(self, key: str) -> pd.DataFrame:
3051
+ """Standardize the AnnData object."""
3052
+ super().standardize(key)
3053
+ self._adata.obs = self._obs_df
3054
+
3055
+ def _validate_dose_column(self) -> bool:
3056
+ """Validate the dose column."""
3057
+ if not Feature.filter(name="pert_dose").exists():
3058
+ Feature(name="pert_dose", dtype="str").save() # type: ignore
3059
+
3060
+ dose_errors = DoseHandler.validate_values(self._adata.obs["pert_dose"])
3061
+ if dose_errors:
3062
+ self._log_validation_errors("pert_dose", dose_errors)
3063
+ return False
3064
+ return True
3065
+
3066
+ def _validate_time_column(self) -> bool:
3067
+ """Validate the time column."""
3068
+ if not Feature.filter(name="pert_time").exists():
3069
+ Feature(name="pert_time", dtype="str").save() # type: ignore
3070
+
3071
+ time_errors = TimeHandler.validate_values(self._adata.obs["pert_time"])
3072
+ if time_errors:
3073
+ self._log_validation_errors("pert_time", time_errors)
3074
+ return False
3075
+ return True
3076
+
3077
+ def _log_validation_errors(self, column: str, errors: list):
3078
+ """Log validation errors with formatting."""
3079
+ errors_print = "\n ".join(errors)
3080
+ logger.warning(
3081
+ f"invalid {column} values found!\n {errors_print}\n"
3082
+ f" → run {colors.cyan('standardize_dose_time()')}"
3083
+ )
1736
3084
 
1737
- In the following docstring, an accessor refers to either a ``.table`` key or the ``sample_metadata_key``.
3085
+ def standardize_dose_time(self) -> pd.DataFrame:
3086
+ """Standardize dose and time values."""
3087
+ standardized_df = self._adata.obs.copy()
1738
3088
 
1739
- Args:
1740
- sdata: The SpatialData object to curate.
1741
- var_index: A dictionary mapping table keys to the ``.var`` indices.
1742
- categoricals: A nested dictionary mapping an accessor to dictionaries that map columns to a registry field.
1743
- using_key: A reference LaminDB instance.
1744
- organism: The organism name.
1745
- sources: A dictionary mapping an accessor to dictionaries that map columns to Source records.
1746
- exclude: A dictionary mapping an accessor to dictionaries of column names to values to exclude from validation.
1747
- When specific :class:`~bionty.Source` instances are pinned and may lack default values (e.g., "unknown" or "na"),
1748
- using the exclude parameter ensures they are not validated.
1749
- verbosity: The verbosity level of the logger.
1750
- sample_metadata_key: The key in ``.attrs`` that stores the sample level metadata.
1751
-
1752
- Examples:
1753
- >>> import lamindb as ln
1754
- >>> import bionty as bt
1755
- >>> curator = ln.Curator.from_spatialdata(
1756
- ... sdata,
1757
- ... var_index={
1758
- ... "table_1": bt.Gene.ensembl_gene_id,
1759
- ... },
1760
- ... categoricals={
1761
- ... "table1":
1762
- ... {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name},
1763
- ... "sample":
1764
- ... {"experimental_factor": bt.ExperimentalFactor.name},
1765
- ... },
1766
- ... organism="human",
1767
- ... )
1768
- """
1769
- try:
1770
- import spatialdata
1771
- except ImportError as e:
1772
- raise ImportError(
1773
- "Please install spatialdata: pip install spatialdata"
1774
- ) from e
3089
+ if "pert_dose" in self._adata.obs.columns:
3090
+ standardized_df = self._standardize_column(
3091
+ standardized_df, "pert_dose", is_dose=True
3092
+ )
1775
3093
 
1776
- from ._spatial import SpatialDataCurator
3094
+ if "pert_time" in self._adata.obs.columns:
3095
+ standardized_df = self._standardize_column(
3096
+ standardized_df, "pert_time", is_dose=False
3097
+ )
1777
3098
 
1778
- return SpatialDataCurator(
1779
- sdata=sdata,
1780
- var_index=var_index,
1781
- categoricals=categoricals,
1782
- using_key=using_key,
1783
- verbosity=verbosity,
1784
- organism=organism,
1785
- sources=sources,
1786
- exclude=exclude,
1787
- sample_metadata_key=sample_metadata_key,
1788
- )
3099
+ self._adata.obs = standardized_df
3100
+ return standardized_df
3101
+
3102
+ def _standardize_column(
3103
+ self, df: pd.DataFrame, column: str, is_dose: bool
3104
+ ) -> pd.DataFrame:
3105
+ """Standardize values in a specific column."""
3106
+ for idx, value in self._adata.obs[column].items():
3107
+ if pd.isna(value) or (
3108
+ isinstance(value, str) and (not value.strip() or value.lower() == "nan")
3109
+ ):
3110
+ df.at[idx, column] = None
3111
+ continue
1789
3112
 
3113
+ try:
3114
+ num, unit = ValueUnit.parse_value_unit(value, is_dose=is_dose)
3115
+ df.at[idx, column] = f"{num}{unit}"
3116
+ except ValueError:
3117
+ continue
1790
3118
 
1791
- def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
1792
- """Get a registry instance using a specific instance."""
1793
- if using_key is not None and using_key != "default":
1794
- return registry.using(using_key)
1795
- return registry
3119
+ return df
1796
3120
 
1797
3121
 
1798
3122
  def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
@@ -1859,10 +3183,7 @@ def check_registry_organism(registry: Record, organism: str | None = None) -> di
1859
3183
  import bionty as bt
1860
3184
 
1861
3185
  if organism is None and bt.settings.organism is None:
1862
- raise ValidationError(
1863
- f"{registry.__name__} registry requires an organism!\n"
1864
- " → please pass an organism name via organism="
1865
- )
3186
+ return {}
1866
3187
  return {"organism": organism or bt.settings.organism.name}
1867
3188
  return {}
1868
3189
 
@@ -1871,11 +3192,11 @@ def validate_categories(
1871
3192
  values: Iterable[str],
1872
3193
  field: FieldAttr,
1873
3194
  key: str,
1874
- using_key: str | None = None,
1875
3195
  organism: str | None = None,
1876
3196
  source: Record | None = None,
1877
3197
  exclude: str | list | None = None,
1878
3198
  hint_print: str | None = None,
3199
+ curator: CatManager | None = None,
1879
3200
  ) -> tuple[bool, list]:
1880
3201
  """Validate ontology terms in a pandas series using LaminDB registries.
1881
3202
 
@@ -1883,7 +3204,6 @@ def validate_categories(
1883
3204
  values: The values to validate.
1884
3205
  field: The field attribute.
1885
3206
  key: The key referencing the slot in the DataFrame.
1886
- using_key: A reference LaminDB instance.
1887
3207
  organism: The organism name.
1888
3208
  source: The source record.
1889
3209
  exclude: Exclude specific values from validation.
@@ -1918,22 +3238,8 @@ def validate_categories(
1918
3238
  non_validated = inspect_result.non_validated
1919
3239
  syn_mapper = inspect_result.synonyms_mapper
1920
3240
 
1921
- # inspect the non-validated values from the using_key instance
1922
- values_validated = []
1923
- if using_key is not None and using_key != "default" and non_validated:
1924
- registry_using = get_registry_instance(registry, using_key)
1925
- inspect_result = inspect_instance(
1926
- values=non_validated,
1927
- field=field,
1928
- registry=registry_using,
1929
- exclude=exclude,
1930
- **kwargs,
1931
- )
1932
- non_validated = inspect_result.non_validated
1933
- values_validated += inspect_result.validated
1934
- syn_mapper.update(inspect_result.synonyms_mapper)
1935
-
1936
3241
  # inspect the non-validated values from public (bionty only)
3242
+ values_validated = []
1937
3243
  if hasattr(registry, "public"):
1938
3244
  verbosity = settings.verbosity
1939
3245
  try:
@@ -1969,12 +3275,16 @@ def validate_categories(
1969
3275
  warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
1970
3276
  if n_non_validated > len(syn_mapper):
1971
3277
  if syn_mapper:
1972
- warning_message += " for remaining terms:\n"
3278
+ warning_message += "\n for remaining terms:\n"
1973
3279
  warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
1974
3280
 
1975
3281
  if logger.indent == "":
1976
3282
  _log_mapping_info()
1977
3283
  logger.warning(warning_message)
3284
+ if curator is not None:
3285
+ curator._validate_category_error_messages = strip_ansi_codes(
3286
+ warning_message
3287
+ )
1978
3288
  logger.indent = ""
1979
3289
  return False, non_validated
1980
3290
 
@@ -1982,7 +3292,6 @@ def validate_categories(
1982
3292
  def standardize_categories(
1983
3293
  values: Iterable[str],
1984
3294
  field: FieldAttr,
1985
- using_key: str | None = None,
1986
3295
  organism: str | None = None,
1987
3296
  source: Record | None = None,
1988
3297
  ) -> dict:
@@ -1999,30 +3308,15 @@ def standardize_categories(
1999
3308
  mute=True,
2000
3309
  return_mapper=True,
2001
3310
  )
2002
-
2003
- if len(values) > len(syn_mapper): # type: ignore
2004
- # standardize values using the using_key instance
2005
- if using_key is not None and using_key != "default":
2006
- registry_using = get_registry_instance(registry, using_key)
2007
- syn_mapper.update(
2008
- registry_using.standardize(
2009
- [v for v in values if v not in syn_mapper],
2010
- field=field.field.name,
2011
- organism=organism,
2012
- source=source,
2013
- mute=True,
2014
- return_mapper=True,
2015
- )
2016
- )
2017
3311
  return syn_mapper
2018
3312
 
2019
3313
 
2020
3314
  def validate_categories_in_df(
2021
3315
  df: pd.DataFrame,
2022
3316
  fields: dict[str, FieldAttr],
2023
- using_key: str | None = None,
2024
3317
  sources: dict[str, Record] = None,
2025
3318
  exclude: dict | None = None,
3319
+ curator: CatManager | None = None,
2026
3320
  **kwargs,
2027
3321
  ) -> tuple[bool, dict]:
2028
3322
  """Validate categories in DataFrame columns using LaminDB registries."""
@@ -2038,9 +3332,9 @@ def validate_categories_in_df(
2038
3332
  df[key],
2039
3333
  field=field,
2040
3334
  key=key,
2041
- using_key=using_key,
2042
3335
  source=sources.get(key),
2043
3336
  exclude=exclude.get(key) if exclude else None,
3337
+ curator=curator,
2044
3338
  **kwargs,
2045
3339
  )
2046
3340
  validated &= is_val
@@ -2055,80 +3349,72 @@ def save_artifact(
2055
3349
  columns_field: FieldAttr | dict[str, FieldAttr],
2056
3350
  description: str | None = None,
2057
3351
  organism: str | None = None,
2058
- adata: ad.AnnData | None = None,
2059
3352
  key: str | None = None,
3353
+ artifact: Artifact | None = None,
2060
3354
  revises: Artifact | None = None,
2061
3355
  run: Run | None = None,
3356
+ schema: Schema | None = None,
2062
3357
  ) -> Artifact:
2063
3358
  """Save all metadata with an Artifact.
2064
3359
 
2065
3360
  Args:
2066
- data: The DataFrame or AnnData object to save.
3361
+ data: The DataFrame/AnnData/MuData object to save.
2067
3362
  fields: A dictionary mapping obs_column to registry_field.
2068
3363
  columns_field: The registry field to validate variables index against.
2069
3364
  description: A description of the artifact.
2070
3365
  organism: The organism name.
2071
- adata: The AnnData object to save and get n_observations, must be provided if data is a path.
2072
3366
  type: The artifact type.
2073
- key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
3367
+ key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
3368
+ artifact: A already registered artifact. Passing this will not save a new artifact from data.
2074
3369
  revises: Previous version of the artifact. Triggers a revision.
2075
3370
  run: The run that creates the artifact.
2076
3371
 
2077
3372
  Returns:
2078
3373
  The saved Artifact.
2079
3374
  """
2080
- from .._artifact import data_is_anndata
3375
+ from .._artifact import data_is_anndata, data_is_mudata
2081
3376
  from ..core._data import add_labels
2082
3377
 
2083
- artifact = None
2084
- if data_is_anndata(data):
2085
- assert adata is not None # noqa: S101
2086
- artifact = Artifact.from_anndata(
2087
- data, description=description, key=key, revises=revises, run=run
2088
- )
2089
- artifact.n_observations = adata.shape[0]
2090
- data = adata
2091
-
2092
- elif isinstance(data, pd.DataFrame):
2093
- artifact = Artifact.from_df(
2094
- data, description=description, key=key, revises=revises, run=run
2095
- )
2096
- else:
2097
- try:
2098
- from mudata import MuData
2099
-
2100
- if isinstance(data, MuData):
2101
- artifact = Artifact.from_mudata(
2102
- data,
2103
- description=description,
2104
- key=key,
2105
- revises=revises,
2106
- run=run,
2107
- )
2108
- artifact.n_observations = data.n_obs
2109
- except ImportError:
2110
- pass
2111
3378
  if artifact is None:
2112
- raise ValueError("data must be a DataFrame, AnnData or MuData object.")
3379
+ if data_is_anndata(data):
3380
+ artifact = Artifact.from_anndata(
3381
+ data, description=description, key=key, revises=revises, run=run
3382
+ )
3383
+ elif isinstance(data, pd.DataFrame):
3384
+ artifact = Artifact.from_df(
3385
+ data, description=description, key=key, revises=revises, run=run
3386
+ )
3387
+ elif data_is_mudata(data):
3388
+ artifact = Artifact.from_mudata(
3389
+ data,
3390
+ description=description,
3391
+ key=key,
3392
+ revises=revises,
3393
+ run=run,
3394
+ )
3395
+ artifact.schema = schema
2113
3396
  artifact.save()
2114
3397
 
2115
- feature_kwargs = check_registry_organism(
2116
- (
2117
- list(columns_field.values())[0].field.model
2118
- if isinstance(columns_field, dict)
2119
- else columns_field.field.model
2120
- ),
2121
- organism,
2122
- )
3398
+ if organism is not None:
3399
+ feature_kwargs = check_registry_organism(
3400
+ (
3401
+ list(columns_field.values())[0].field.model
3402
+ if isinstance(columns_field, dict)
3403
+ else columns_field.field.model
3404
+ ),
3405
+ organism,
3406
+ )
3407
+ else:
3408
+ feature_kwargs = {}
2123
3409
 
2124
3410
  if artifact.otype == "DataFrame":
2125
- artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
3411
+ artifact.features._add_set_from_df(field=columns_field, **feature_kwargs) # type: ignore
2126
3412
  elif artifact.otype == "AnnData":
2127
- artifact.features._add_set_from_anndata(
3413
+ artifact.features._add_set_from_anndata( # type: ignore
2128
3414
  var_field=columns_field, **feature_kwargs
2129
3415
  )
2130
3416
  elif artifact.otype == "MuData":
2131
- artifact.features._add_set_from_mudata(
3417
+ artifact.features._add_set_from_mudata( # type: ignore
2132
3418
  var_fields=columns_field, **feature_kwargs
2133
3419
  )
2134
3420
  else:
@@ -2148,7 +3434,7 @@ def save_artifact(
2148
3434
  filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
2149
3435
  df = data if isinstance(data, pd.DataFrame) else data.obs
2150
3436
  # multi-value columns are separated by "|"
2151
- if df[key].str.contains("|").any():
3437
+ if not df[key].isna().all() and df[key].str.contains("|").any():
2152
3438
  values = df[key].str.split("|").explode().unique()
2153
3439
  else:
2154
3440
  values = df[key].unique()
@@ -2202,7 +3488,7 @@ def save_artifact(
2202
3488
  )
2203
3489
 
2204
3490
  slug = ln_setup.settings.instance.slug
2205
- if ln_setup.settings.instance.is_remote: # pragma: no cover
3491
+ if ln_setup.settings.instance.is_remote: # pragma: no cover
2206
3492
  logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
2207
3493
  return artifact
2208
3494
 
@@ -2224,7 +3510,6 @@ def update_registry(
2224
3510
  values: list[str],
2225
3511
  field: FieldAttr,
2226
3512
  key: str,
2227
- using_key: str | None = None,
2228
3513
  validated_only: bool = True,
2229
3514
  df: pd.DataFrame | None = None,
2230
3515
  organism: str | None = None,
@@ -2233,13 +3518,12 @@ def update_registry(
2233
3518
  exclude: str | list | None = None,
2234
3519
  **kwargs,
2235
3520
  ) -> None:
2236
- """Save features or labels records in the default instance from the using_key instance.
3521
+ """Save features or labels records in the default instance.
2237
3522
 
2238
3523
  Args:
2239
3524
  values: A list of values to be saved as labels.
2240
3525
  field: The FieldAttr object representing the field for which labels are being saved.
2241
3526
  key: The name of the feature to save.
2242
- using_key: The name of the instance from which to transfer labels (if applicable).
2243
3527
  validated_only: If True, only save validated labels.
2244
3528
  df: A DataFrame to save labels from.
2245
3529
  organism: The organism name.
@@ -2290,22 +3574,10 @@ def update_registry(
2290
3574
  i for i in values if i not in existing_and_public_labels
2291
3575
  ]
2292
3576
 
2293
- # inspect and save validated records the using_key instance
2294
- (
2295
- labels_saved[f"from {using_key}"],
2296
- non_validated_labels,
2297
- ) = update_registry_from_using_instance(
2298
- non_validated_labels,
2299
- field=field,
2300
- using_key=using_key,
2301
- exclude=exclude,
2302
- **filter_kwargs,
2303
- )
2304
-
2305
3577
  # save non-validated/new records
2306
3578
  labels_saved["new"] = non_validated_labels
2307
3579
  if not validated_only:
2308
- non_validated_records = []
3580
+ non_validated_records: RecordList[Any] = [] # type: ignore
2309
3581
  if df is not None and registry == Feature:
2310
3582
  nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
2311
3583
  non_validated_records = Feature.from_df(df.loc[:, nonval_columns])
@@ -2379,48 +3651,6 @@ def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
2379
3651
  is_feature.children.add(*all_records)
2380
3652
 
2381
3653
 
2382
- def update_registry_from_using_instance(
2383
- values: list[str],
2384
- field: FieldAttr,
2385
- using_key: str | None = None,
2386
- exclude: str | list | None = None,
2387
- **kwargs,
2388
- ) -> tuple[list[str], list[str]]:
2389
- """Save features or labels records from the using_key instance.
2390
-
2391
- Args:
2392
- values: A list of values to be saved as labels.
2393
- field: The FieldAttr object representing the field for which labels are being saved.
2394
- using_key: The name of the instance from which to transfer labels (if applicable).
2395
- kwargs: Additional keyword arguments to pass to the registry model.
2396
-
2397
- Returns:
2398
- A tuple containing the list of saved labels and the list of non-saved labels.
2399
- """
2400
- labels_saved = []
2401
- not_saved = values
2402
-
2403
- if using_key is not None and using_key != "default":
2404
- registry_using = get_registry_instance(field.field.model, using_key)
2405
-
2406
- inspect_result_using = inspect_instance(
2407
- values=values,
2408
- field=field,
2409
- registry=registry_using,
2410
- exclude=exclude,
2411
- **kwargs,
2412
- )
2413
- labels_using = registry_using.filter(
2414
- **{f"{field.field.name}__in": inspect_result_using.validated}
2415
- ).all()
2416
- for label_using in labels_using:
2417
- label_using.save()
2418
- labels_saved.append(getattr(label_using, field.field.name))
2419
- not_saved = inspect_result_using.non_validated
2420
-
2421
- return labels_saved, not_saved
2422
-
2423
-
2424
3654
  def _save_organism(name: str):
2425
3655
  """Save an organism record."""
2426
3656
  import bionty as bt
@@ -2445,4 +3675,121 @@ def _ref_is_name(field: FieldAttr) -> bool | None:
2445
3675
  return field.field.name == name_field
2446
3676
 
2447
3677
 
2448
- Curate = Curator # backward compat
3678
+ # backward compat constructors ------------------
3679
+
3680
+
3681
+ @classmethod # type: ignore
3682
+ def from_df(
3683
+ cls,
3684
+ df: pd.DataFrame,
3685
+ categoricals: dict[str, FieldAttr] | None = None,
3686
+ columns: FieldAttr = Feature.name,
3687
+ verbosity: str = "hint",
3688
+ organism: str | None = None,
3689
+ ) -> DataFrameCatManager:
3690
+ return DataFrameCatManager(
3691
+ df=df,
3692
+ categoricals=categoricals,
3693
+ columns=columns,
3694
+ verbosity=verbosity,
3695
+ organism=organism,
3696
+ )
3697
+
3698
+
3699
+ @classmethod # type: ignore
3700
+ def from_anndata(
3701
+ cls,
3702
+ data: ad.AnnData | UPathStr,
3703
+ var_index: FieldAttr,
3704
+ categoricals: dict[str, FieldAttr] | None = None,
3705
+ obs_columns: FieldAttr = Feature.name,
3706
+ verbosity: str = "hint",
3707
+ organism: str | None = None,
3708
+ sources: dict[str, Record] | None = None,
3709
+ ) -> AnnDataCatManager:
3710
+ return AnnDataCatManager(
3711
+ data=data,
3712
+ var_index=var_index,
3713
+ categoricals=categoricals,
3714
+ obs_columns=obs_columns,
3715
+ verbosity=verbosity,
3716
+ organism=organism,
3717
+ sources=sources,
3718
+ )
3719
+
3720
+
3721
+ @classmethod # type: ignore
3722
+ def from_mudata(
3723
+ cls,
3724
+ mdata: MuData,
3725
+ var_index: dict[str, dict[str, FieldAttr]],
3726
+ categoricals: dict[str, FieldAttr] | None = None,
3727
+ verbosity: str = "hint",
3728
+ organism: str | None = None,
3729
+ ) -> MuDataCatManager:
3730
+ return MuDataCatManager(
3731
+ mdata=mdata,
3732
+ var_index=var_index,
3733
+ categoricals=categoricals,
3734
+ verbosity=verbosity,
3735
+ organism=organism,
3736
+ )
3737
+
3738
+
3739
+ @classmethod # type: ignore
3740
+ def from_tiledbsoma(
3741
+ cls,
3742
+ experiment_uri: UPathStr,
3743
+ var_index: dict[str, tuple[str, FieldAttr]],
3744
+ categoricals: dict[str, FieldAttr] | None = None,
3745
+ obs_columns: FieldAttr = Feature.name,
3746
+ organism: str | None = None,
3747
+ sources: dict[str, Record] | None = None,
3748
+ exclude: dict[str, str | list[str]] | None = None,
3749
+ ) -> TiledbsomaCatManager:
3750
+ return TiledbsomaCatManager(
3751
+ experiment_uri=experiment_uri,
3752
+ var_index=var_index,
3753
+ categoricals=categoricals,
3754
+ obs_columns=obs_columns,
3755
+ organism=organism,
3756
+ sources=sources,
3757
+ exclude=exclude,
3758
+ )
3759
+
3760
+
3761
+ @classmethod # type: ignore
3762
+ def from_spatialdata(
3763
+ cls,
3764
+ sdata,
3765
+ var_index: dict[str, FieldAttr],
3766
+ categoricals: dict[str, dict[str, FieldAttr]] | None = None,
3767
+ organism: str | None = None,
3768
+ sources: dict[str, dict[str, Record]] | None = None,
3769
+ exclude: dict[str, dict] | None = None,
3770
+ verbosity: str = "hint",
3771
+ *,
3772
+ sample_metadata_key: str = "sample",
3773
+ ):
3774
+ try:
3775
+ import spatialdata
3776
+ except ImportError as e:
3777
+ raise ImportError("Please install spatialdata: pip install spatialdata") from e
3778
+
3779
+ return SpatialDataCatManager(
3780
+ sdata=sdata,
3781
+ var_index=var_index,
3782
+ categoricals=categoricals,
3783
+ verbosity=verbosity,
3784
+ organism=organism,
3785
+ sources=sources,
3786
+ exclude=exclude,
3787
+ sample_metadata_key=sample_metadata_key,
3788
+ )
3789
+
3790
+
3791
+ CatManager.from_df = from_df # type: ignore
3792
+ CatManager.from_anndata = from_anndata # type: ignore
3793
+ CatManager.from_mudata = from_mudata # type: ignore
3794
+ CatManager.from_spatialdata = from_spatialdata # type: ignore
3795
+ CatManager.from_tiledbsoma = from_tiledbsoma # type: ignore