lamindb 0.76.7__py3-none-any.whl → 0.76.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. lamindb/__init__.py +113 -113
  2. lamindb/_artifact.py +1205 -1178
  3. lamindb/_can_validate.py +579 -579
  4. lamindb/_collection.py +387 -387
  5. lamindb/_curate.py +1601 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +242 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +256 -256
  10. lamindb/_from_values.py +382 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +362 -362
  15. lamindb/_record.py +649 -649
  16. lamindb/_run.py +57 -57
  17. lamindb/_save.py +308 -295
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +127 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +574 -574
  25. lamindb/core/_data.py +438 -438
  26. lamindb/core/_feature_manager.py +867 -867
  27. lamindb/core/_label_manager.py +253 -253
  28. lamindb/core/_mapped_collection.py +597 -597
  29. lamindb/core/_settings.py +187 -187
  30. lamindb/core/_sync_git.py +138 -138
  31. lamindb/core/_track_environment.py +27 -27
  32. lamindb/core/datasets/__init__.py +59 -59
  33. lamindb/core/datasets/_core.py +571 -571
  34. lamindb/core/datasets/_fake.py +36 -36
  35. lamindb/core/exceptions.py +90 -77
  36. lamindb/core/fields.py +12 -12
  37. lamindb/core/loaders.py +164 -164
  38. lamindb/core/schema.py +56 -56
  39. lamindb/core/storage/__init__.py +25 -25
  40. lamindb/core/storage/_anndata_accessor.py +740 -740
  41. lamindb/core/storage/_anndata_sizes.py +41 -41
  42. lamindb/core/storage/_backed_access.py +98 -98
  43. lamindb/core/storage/_tiledbsoma.py +204 -204
  44. lamindb/core/storage/_valid_suffixes.py +21 -21
  45. lamindb/core/storage/_zarr.py +110 -110
  46. lamindb/core/storage/objects.py +62 -62
  47. lamindb/core/storage/paths.py +172 -141
  48. lamindb/core/subsettings/__init__.py +12 -12
  49. lamindb/core/subsettings/_creation_settings.py +38 -38
  50. lamindb/core/subsettings/_transform_settings.py +21 -21
  51. lamindb/core/types.py +19 -19
  52. lamindb/core/versioning.py +158 -158
  53. lamindb/integrations/__init__.py +12 -12
  54. lamindb/integrations/_vitessce.py +107 -107
  55. lamindb/setup/__init__.py +14 -14
  56. lamindb/setup/core/__init__.py +4 -4
  57. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
  58. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/METADATA +3 -3
  59. lamindb-0.76.8.dist-info/RECORD +60 -0
  60. {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
  61. lamindb-0.76.7.dist-info/RECORD +0 -60
lamindb/_curate.py CHANGED
@@ -1,1601 +1,1601 @@
1
- from __future__ import annotations
2
-
3
- import copy
4
- from typing import TYPE_CHECKING, Iterable
5
-
6
- import anndata as ad
7
- import lamindb_setup as ln_setup
8
- import pandas as pd
9
- from lamin_utils import colors, logger
10
- from lamindb_setup.core._docs import doc_args
11
- from lnschema_core import (
12
- Artifact,
13
- Feature,
14
- Record,
15
- Run,
16
- ULabel,
17
- )
18
-
19
- from .core.exceptions import ValidationError
20
-
21
- if TYPE_CHECKING:
22
- from lamindb_setup.core.types import UPathStr
23
- from lnschema_core.types import FieldAttr
24
- from mudata import MuData
25
-
26
-
27
- class CurateLookup:
28
- """Lookup categories from the reference instance."""
29
-
30
- def __init__(
31
- self,
32
- categoricals: dict[str, FieldAttr],
33
- slots: dict[str, FieldAttr] = None,
34
- using_key: str | None = None,
35
- ) -> None:
36
- if slots is None:
37
- slots = {}
38
- self._fields = {**categoricals, **slots}
39
- self._using_key = None if using_key == "default" else using_key
40
- self._using_key_name = self._using_key or ln_setup.settings.instance.slug
41
- debug_message = (
42
- f"Lookup objects from the " f"{colors.italic(self._using_key_name)}"
43
- )
44
- logger.debug(debug_message)
45
-
46
- def __getattr__(self, name):
47
- if name in self._fields:
48
- registry = self._fields[name].field.model
49
- if self._using_key == "public":
50
- return registry.public().lookup()
51
- else:
52
- return get_registry_instance(registry, self._using_key).lookup()
53
- raise AttributeError(
54
- f"'{self.__class__.__name__}' object has no attribute '{name}'"
55
- )
56
-
57
- def __getitem__(self, name):
58
- if name in self._fields:
59
- registry = self._fields[name].field.model
60
- if self._using_key == "public":
61
- return registry.public().lookup()
62
- else:
63
- return get_registry_instance(registry, self._using_key).lookup()
64
- raise AttributeError(
65
- f"'{self.__class__.__name__}' object has no attribute '{name}'"
66
- )
67
-
68
- def __repr__(self) -> str:
69
- if len(self._fields) > 0:
70
- getattr_keys = "\n ".join(
71
- [f".{key}" for key in self._fields if key.isidentifier()]
72
- )
73
- getitem_keys = "\n ".join(
74
- [str([key]) for key in self._fields if not key.isidentifier()]
75
- )
76
- return (
77
- f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
78
- f"{colors.green(getattr_keys)}\n "
79
- f"{colors.green(getitem_keys)}\n\n"
80
- "Example:\n → categories = validator.lookup().cell_type\n"
81
- " → categories.alveolar_type_1_fibroblast_cell"
82
- )
83
- else: # pragma: no cover
84
- return colors.warning("No fields are found!")
85
-
86
-
87
- class BaseCurator:
88
- """Curate a dataset."""
89
-
90
- def validate(self) -> bool:
91
- """Validate dataset.
92
-
93
- Returns:
94
- Boolean indicating whether the dataset is validated.
95
- """
96
- pass
97
-
98
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
99
- """Save the dataset as artifact.
100
-
101
- Args:
102
- description: Description of the DataFrame object.
103
- **kwargs: Object level metadata.
104
-
105
- Returns:
106
- A saved artifact record.
107
- """
108
- pass
109
-
110
-
111
- class DataFrameCurator(BaseCurator):
112
- """Curation flow for a DataFrame object.
113
-
114
- See also :class:`~lamindb.Curator`.
115
-
116
- Args:
117
- df: The DataFrame object to curate.
118
- columns: The field attribute for the feature column.
119
- categoricals: A dictionary mapping column names to registry_field.
120
- using_key: The reference instance containing registries to validate against.
121
- verbosity: The verbosity level.
122
- organism: The organism name.
123
- sources: A dictionary mapping column names to Source records.
124
- exclude: A dictionary mapping column names to values to exclude.
125
-
126
- Examples:
127
- >>> import bionty as bt
128
- >>> curate = ln.Curator.from_df(
129
- ... df,
130
- ... categoricals={
131
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
132
- ... "donor_id": ln.ULabel.name
133
- ... }
134
- ... )
135
- """
136
-
137
- def __init__(
138
- self,
139
- df: pd.DataFrame,
140
- columns: FieldAttr = Feature.name,
141
- categoricals: dict[str, FieldAttr] | None = None,
142
- using_key: str | None = None,
143
- verbosity: str = "hint",
144
- organism: str | None = None,
145
- sources: dict[str, Record] | None = None,
146
- exclude: dict | None = None,
147
- check_valid_keys: bool = True,
148
- ) -> None:
149
- from lamindb.core._settings import settings
150
-
151
- self._df = df
152
- self._fields = categoricals or {}
153
- self._columns_field = columns
154
- self._using_key = using_key
155
- settings.verbosity = verbosity
156
- self._artifact = None
157
- self._collection = None
158
- self._validated = False
159
- self._kwargs = {"organism": organism} if organism else {}
160
- if sources is None:
161
- sources = {}
162
- self._sources = sources
163
- if exclude is None:
164
- exclude = {}
165
- self._exclude = exclude
166
- self._non_validated = None
167
- if check_valid_keys:
168
- self._check_valid_keys()
169
- self._save_columns()
170
-
171
- @property
172
- def non_validated(self) -> list:
173
- """Return the non-validated features and labels."""
174
- if self._non_validated is None:
175
- raise ValueError("Please run validate() first!")
176
- return self._non_validated
177
-
178
- @property
179
- def fields(self) -> dict:
180
- """Return the columns fields to validate against."""
181
- return self._fields
182
-
183
- def lookup(self, using_key: str | None = None) -> CurateLookup:
184
- """Lookup categories.
185
-
186
- Args:
187
- using_key: The instance where the lookup is performed.
188
- if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
189
- if "public", the lookup is performed on the public reference.
190
- """
191
- return CurateLookup(
192
- categoricals=self._fields,
193
- slots={"columns": self._columns_field},
194
- using_key=using_key or self._using_key,
195
- )
196
-
197
- def _check_valid_keys(self, extra: set = None) -> None:
198
- if extra is None:
199
- extra = set()
200
- for name, d in {
201
- "categoricals": self._fields,
202
- "sources": self._sources,
203
- "exclude": self._exclude,
204
- }.items():
205
- if not isinstance(d, dict):
206
- raise TypeError(f"{name} must be a dictionary!")
207
- valid_keys = set(self._df.columns) | {"columns"} | extra
208
- nonval_keys = [key for key in d.keys() if key not in valid_keys]
209
- if len(nonval_keys) > 0:
210
- raise ValueError(
211
- f"the following keys passed to {name} are not allowed: {nonval_keys}"
212
- )
213
-
214
- def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
215
- """Save column name records."""
216
- # Always save features specified as the fields keys
217
- update_registry(
218
- values=list(self.fields.keys()),
219
- field=self._columns_field,
220
- key="columns",
221
- save_function="add_new_from_columns",
222
- using_key=self._using_key,
223
- validated_only=False,
224
- source=self._sources.get("columns"),
225
- exclude=self._exclude.get("columns"),
226
- **kwargs,
227
- )
228
-
229
- # Save the rest of the columns based on validated_only
230
- additional_columns = set(self._df.columns) - set(self.fields.keys())
231
- if additional_columns:
232
- update_registry(
233
- values=list(additional_columns),
234
- field=self._columns_field,
235
- key="columns",
236
- save_function="add_new_from_columns",
237
- using_key=self._using_key,
238
- validated_only=validated_only,
239
- df=self._df, # Get the Feature type from df
240
- source=self._sources.get("columns"),
241
- exclude=self._exclude.get("columns"),
242
- warning=False, # Do not warn about missing columns, just an info message
243
- **kwargs,
244
- )
245
-
246
- def add_validated_from(self, key: str, organism: str | None = None):
247
- """Add validated categories.
248
-
249
- Args:
250
- key: The key referencing the slot in the DataFrame.
251
- organism: The organism name.
252
- """
253
- self._kwargs.update({"organism": organism} if organism else {})
254
- self._update_registry(key, validated_only=True, **self._kwargs)
255
-
256
- def add_new_from(self, key: str, organism: str | None = None, **kwargs):
257
- """Add validated & new categories.
258
-
259
- Args:
260
- key: The key referencing the slot in the DataFrame from which to draw terms.
261
- organism: The organism name.
262
- **kwargs: Additional keyword arguments to pass to the registry model.
263
- """
264
- if len(kwargs) > 0 and key == "all":
265
- raise ValueError("Cannot pass additional arguments to 'all' key!")
266
- self._kwargs.update({"organism": organism} if organism else {})
267
- self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
268
-
269
- def add_new_from_columns(self, organism: str | None = None, **kwargs):
270
- """Add validated & new column names to its registry.
271
-
272
- Args:
273
- organism: The organism name.
274
- **kwargs: Additional keyword arguments to pass to the registry model.
275
- """
276
- self._kwargs.update({"organism": organism} if organism else {})
277
- self._save_columns(validated_only=False, **self._kwargs, **kwargs)
278
-
279
- def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
280
- if categorical == "all":
281
- self._update_registry_all(validated_only=validated_only, **kwargs)
282
- elif categorical == "columns":
283
- self._save_columns(validated_only=validated_only, **kwargs)
284
- else:
285
- if categorical not in self.fields:
286
- raise ValueError(f"Feature {categorical} is not part of the fields!")
287
- update_registry(
288
- values=self._df[categorical].unique().tolist(),
289
- field=self.fields[categorical],
290
- key=categorical,
291
- using_key=self._using_key,
292
- validated_only=validated_only,
293
- source=self._sources.get(categorical),
294
- exclude=self._exclude.get(categorical),
295
- **kwargs,
296
- )
297
-
298
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
299
- """Save labels for all features."""
300
- for name in self.fields.keys():
301
- logger.info(f"saving labels for '{name}'")
302
- self._update_registry(name, validated_only=validated_only, **kwargs)
303
-
304
- def validate(self, organism: str | None = None) -> bool:
305
- """Validate variables and categorical observations.
306
-
307
- Args:
308
- organism: The organism name.
309
-
310
- Returns:
311
- Whether the DataFrame is validated.
312
- """
313
- self._kwargs.update({"organism": organism} if organism else {})
314
- self._validated, self._non_validated = validate_categories_in_df( # type: ignore
315
- self._df,
316
- fields=self.fields,
317
- using_key=self._using_key,
318
- sources=self._sources,
319
- exclude=self._exclude,
320
- **self._kwargs,
321
- )
322
- return self._validated
323
-
324
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
325
- """Save the validated DataFrame and metadata.
326
-
327
- Args:
328
- description: Description of the DataFrame object.
329
- **kwargs: Object level metadata.
330
-
331
- Returns:
332
- A saved artifact record.
333
- """
334
- from lamindb.core._settings import settings
335
-
336
- if not self._validated:
337
- self.validate()
338
- if not self._validated:
339
- raise ValidationError("Dataset does not validate. Please curate.")
340
-
341
- # Make sure all labels are saved in the current instance
342
- verbosity = settings.verbosity
343
- try:
344
- settings.verbosity = "warning"
345
- # save all validated records to the current instance
346
- self.add_validated_from("all")
347
-
348
- self._artifact = save_artifact(
349
- self._df,
350
- description=description,
351
- fields=self.fields,
352
- columns_field=self._columns_field,
353
- **kwargs,
354
- **self._kwargs,
355
- )
356
- finally:
357
- settings.verbosity = verbosity
358
-
359
- return self._artifact
360
-
361
- def clean_up_failed_runs(self):
362
- """Clean up previous failed runs that don't save any outputs."""
363
- from lamindb.core._context import context
364
-
365
- if context.run is not None:
366
- Run.filter(transform=context.run.transform, output_artifacts=None).exclude(
367
- uid=context.run.uid
368
- ).delete()
369
-
370
-
371
- class AnnDataCurator(DataFrameCurator):
372
- """Curation flow for ``AnnData``.
373
-
374
- See also :class:`~lamindb.Curator`.
375
-
376
- Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.
377
-
378
- See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.
379
-
380
- Args:
381
- data: The AnnData object or an AnnData-like path.
382
- var_index: The registry field for mapping the ``.var`` index.
383
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
384
- using_key: A reference LaminDB instance.
385
- verbosity: The verbosity level.
386
- organism: The organism name.
387
- sources: A dictionary mapping ``.obs.columns`` to Source records.
388
- exclude: A dictionary mapping column names to values to exclude.
389
-
390
- Examples:
391
- >>> import bionty as bt
392
- >>> curate = ln.Curator.from_anndata(
393
- ... adata,
394
- ... var_index=bt.Gene.ensembl_gene_id,
395
- ... categoricals={
396
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
397
- ... "donor_id": ln.ULabel.name
398
- ... },
399
- ... organism="human",
400
- ... )
401
- """
402
-
403
- def __init__(
404
- self,
405
- data: ad.AnnData | UPathStr,
406
- var_index: FieldAttr,
407
- categoricals: dict[str, FieldAttr] | None = None,
408
- obs_columns: FieldAttr = Feature.name,
409
- using_key: str = "default",
410
- verbosity: str = "hint",
411
- organism: str | None = None,
412
- sources: dict[str, Record] | None = None,
413
- exclude: dict | None = None,
414
- ) -> None:
415
- from lamindb_setup.core import upath
416
-
417
- from ._artifact import data_is_anndata
418
-
419
- if sources is None:
420
- sources = {}
421
- if not data_is_anndata(data):
422
- raise ValueError(
423
- "data has to be an AnnData object or a path to AnnData-like"
424
- )
425
- if isinstance(data, ad.AnnData):
426
- self._adata = data
427
- else: # pragma: no cover
428
- from lamindb.core.storage._backed_access import backed_access
429
-
430
- self._adata = backed_access(upath.create_path(data))
431
-
432
- self._data = data
433
- self._var_field = var_index
434
- super().__init__(
435
- df=self._adata.obs,
436
- categoricals=categoricals,
437
- columns=obs_columns,
438
- using_key=using_key,
439
- verbosity=verbosity,
440
- organism=organism,
441
- sources=sources,
442
- exclude=exclude,
443
- check_valid_keys=False,
444
- )
445
- self._obs_fields = categoricals or {}
446
- self._check_valid_keys(extra={"var_index"})
447
-
448
- @property
449
- def var_index(self) -> FieldAttr:
450
- """Return the registry field to validate variables index against."""
451
- return self._var_field
452
-
453
- @property
454
- def categoricals(self) -> dict:
455
- """Return the obs fields to validate against."""
456
- return self._obs_fields
457
-
458
- def lookup(self, using_key: str | None = None) -> CurateLookup:
459
- """Lookup categories.
460
-
461
- Args:
462
- using_key: The instance where the lookup is performed.
463
- if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
464
- if "public", the lookup is performed on the public reference.
465
- """
466
- return CurateLookup(
467
- categoricals=self._obs_fields,
468
- slots={"columns": self._columns_field, "var_index": self._var_field},
469
- using_key=using_key or self._using_key,
470
- )
471
-
472
- def _save_from_var_index(
473
- self, validated_only: bool = True, organism: str | None = None
474
- ):
475
- """Save variable records."""
476
- update_registry(
477
- values=list(self._adata.var.index),
478
- field=self.var_index,
479
- key="var_index",
480
- save_function="add_new_from_var_index",
481
- using_key=self._using_key,
482
- validated_only=validated_only,
483
- organism=organism,
484
- source=self._sources.get("var_index"),
485
- exclude=self._exclude.get("var_index"),
486
- )
487
-
488
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
489
- """Save labels for all features."""
490
- for name in self.fields.keys():
491
- logger.info(f"saving labels for '{name}'")
492
- if name == "var_index":
493
- self._save_from_var_index(validated_only=validated_only, **kwargs)
494
- else:
495
- self._update_registry(name, validated_only=validated_only, **kwargs)
496
-
497
- def add_new_from_var_index(self, organism: str | None = None, **kwargs):
498
- """Update variable records.
499
-
500
- Args:
501
- organism: The organism name.
502
- **kwargs: Additional keyword arguments to pass to the registry model.
503
- """
504
- self._kwargs.update({"organism": organism} if organism else {})
505
- self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
506
-
507
- def add_validated_from_var_index(self, organism: str | None = None):
508
- """Add validated variable records.
509
-
510
- Args:
511
- organism: The organism name.
512
- """
513
- self._kwargs.update({"organism": organism} if organism else {})
514
- self._save_from_var_index(validated_only=True, **self._kwargs)
515
-
516
- def validate(self, organism: str | None = None) -> bool:
517
- """Validate categories.
518
-
519
- Args:
520
- organism: The organism name.
521
-
522
- Returns:
523
- Whether the AnnData object is validated.
524
- """
525
- self._kwargs.update({"organism": organism} if organism else {})
526
- if self._using_key is not None and self._using_key != "default":
527
- logger.important(
528
- f"validating metadata using registries of instance {colors.italic(self._using_key)}"
529
- )
530
-
531
- validated_var, non_validated_var = validate_categories(
532
- self._adata.var.index,
533
- field=self._var_field,
534
- key="var_index",
535
- using_key=self._using_key,
536
- source=self._sources.get("var_index"),
537
- validated_hint_print=".add_validated_from_var_index()",
538
- exclude=self._exclude.get("var_index"),
539
- **self._kwargs, # type: ignore
540
- )
541
- validated_obs, non_validated_obs = validate_categories_in_df(
542
- self._adata.obs,
543
- fields=self.categoricals,
544
- using_key=self._using_key,
545
- sources=self._sources,
546
- exclude=self._exclude,
547
- **self._kwargs,
548
- )
549
- self._non_validated = non_validated_obs # type: ignore
550
- if len(non_validated_var) > 0:
551
- self._non_validated["var_index"] = non_validated_var # type: ignore
552
- self._validated = validated_var and validated_obs
553
- return self._validated
554
-
555
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
556
- """Save the validated ``AnnData`` and metadata.
557
-
558
- Args:
559
- description: Description of the ``AnnData`` object.
560
- **kwargs: Object level metadata.
561
-
562
- Returns:
563
- A saved artifact record.
564
- """
565
- if not self._validated:
566
- self.validate()
567
- if not self._validated:
568
- raise ValidationError("Dataset does not validate. Please curate.")
569
-
570
- self._artifact = save_artifact(
571
- self._data,
572
- adata=self._adata,
573
- description=description,
574
- columns_field=self.var_index,
575
- fields=self.categoricals,
576
- **self._kwargs,
577
- **kwargs,
578
- )
579
- return self._artifact
580
-
581
-
582
- class MuDataCurator:
583
- """Curation flow for a ``MuData`` object.
584
-
585
- See also :class:`~lamindb.Curator`.
586
-
587
- Note that if genes or other measurements are removed from the MuData object,
588
- the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
589
-
590
- Args:
591
- mdata: The MuData object to curate.
592
- var_index: The registry field for mapping the ``.var`` index for each modality.
593
- For example:
594
- ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
595
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
596
- Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
597
- using_key: A reference LaminDB instance.
598
- verbosity: The verbosity level.
599
- organism: The organism name.
600
- sources: A dictionary mapping ``.obs.columns`` to Source records.
601
- exclude: A dictionary mapping column names to values to exclude.
602
-
603
- Examples:
604
- >>> import bionty as bt
605
- >>> curate = ln.Curator.from_mudata(
606
- ... mdata,
607
- ... var_index={
608
- ... "rna": bt.Gene.ensembl_gene_id,
609
- ... "adt": ln.CellMarker.name
610
- ... },
611
- ... categoricals={
612
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
613
- ... "donor_id": ln.ULabel.name
614
- ... },
615
- ... organism="human",
616
- ... )
617
- """
618
-
619
- def __init__(
620
- self,
621
- mdata: MuData,
622
- var_index: dict[str, dict[str, FieldAttr]],
623
- categoricals: dict[str, FieldAttr] | None = None,
624
- using_key: str = "default",
625
- verbosity: str = "hint",
626
- organism: str | None = None,
627
- sources: dict[str, Record] | None = None,
628
- exclude: dict | None = None,
629
- ) -> None:
630
- if sources is None:
631
- sources = {}
632
- self._sources = sources
633
- if exclude is None:
634
- exclude = {}
635
- self._exclude = exclude
636
- self._mdata = mdata
637
- self._kwargs = {"organism": organism} if organism else {}
638
- self._var_fields = var_index
639
- self._verify_modality(self._var_fields.keys())
640
- self._obs_fields = self._parse_categoricals(categoricals)
641
- self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
642
- self._using_key = using_key
643
- self._verbosity = verbosity
644
- self._df_annotators = {
645
- modality: DataFrameCurator(
646
- df=mdata[modality].obs if modality != "obs" else mdata.obs,
647
- categoricals=self._obs_fields.get(modality, {}),
648
- using_key=using_key,
649
- verbosity=verbosity,
650
- sources=self._sources.get(modality),
651
- exclude=self._exclude.get(modality),
652
- check_valid_keys=False,
653
- **self._kwargs,
654
- )
655
- for modality in self._modalities
656
- }
657
- for modality in self._var_fields.keys():
658
- self._save_from_var_index_modality(
659
- modality=modality, validated_only=True, **self._kwargs
660
- )
661
-
662
- @property
663
- def var_index(self) -> FieldAttr:
664
- """Return the registry field to validate variables index against."""
665
- return self._var_fields
666
-
667
- @property
668
- def categoricals(self) -> dict:
669
- """Return the obs fields to validate against."""
670
- return self._obs_fields
671
-
672
- def _verify_modality(self, modalities: Iterable[str]):
673
- """Verify the modality exists."""
674
- for modality in modalities:
675
- if modality not in self._mdata.mod.keys():
676
- raise ValueError(f"modality '{modality}' does not exist!")
677
-
678
- def _save_from_var_index_modality(
679
- self, modality: str, validated_only: bool = True, **kwargs
680
- ):
681
- """Save variable records."""
682
- update_registry(
683
- values=list(self._mdata[modality].var.index),
684
- field=self._var_fields[modality],
685
- key="var_index",
686
- save_function="add_new_from_var_index",
687
- using_key=self._using_key,
688
- validated_only=validated_only,
689
- dtype="number",
690
- source=self._sources.get(modality, {}).get("var_index"),
691
- exclude=self._exclude.get(modality, {}).get("var_index"),
692
- **kwargs,
693
- )
694
-
695
- def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
696
- """Parse the categorical fields."""
697
- prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
698
- obs_fields: dict[str, dict[str, FieldAttr]] = {}
699
- for k, v in categoricals.items():
700
- if k not in self._mdata.obs.columns:
701
- raise ValueError(f"column '{k}' does not exist in mdata.obs!")
702
- if any(k.startswith(prefix) for prefix in prefixes):
703
- modality, col = k.split(":")[0], k.split(":")[1]
704
- if modality not in obs_fields.keys():
705
- obs_fields[modality] = {}
706
- obs_fields[modality][col] = v
707
- else:
708
- if "obs" not in obs_fields.keys():
709
- obs_fields["obs"] = {}
710
- obs_fields["obs"][k] = v
711
- return obs_fields
712
-
713
- def lookup(self, using_key: str | None = None) -> CurateLookup:
714
- """Lookup categories.
715
-
716
- Args:
717
- using_key: The instance where the lookup is performed.
718
- if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
719
- if "public", the lookup is performed on the public reference.
720
- """
721
- return CurateLookup(
722
- categoricals=self._obs_fields,
723
- slots={
724
- **self._obs_fields,
725
- **{f"{k}_var_index": v for k, v in self._var_fields.items()},
726
- },
727
- using_key=using_key or self._using_key,
728
- )
729
-
730
- def add_new_from_columns(
731
- self,
732
- modality: str,
733
- column_names: list[str] | None = None,
734
- organism: str | None = None,
735
- **kwargs,
736
- ):
737
- """Update columns records.
738
-
739
- Args:
740
- modality: The modality name.
741
- column_names: The column names to save.
742
- organism: The organism name.
743
- **kwargs: Additional keyword arguments to pass to the registry model.
744
- """
745
- self._kwargs.update({"organism": organism} if organism else {})
746
- values = column_names or self._mdata[modality].obs.columns
747
- update_registry(
748
- values=list(values),
749
- field=Feature.name,
750
- key=f"{modality} obs columns",
751
- using_key=self._using_key,
752
- validated_only=False,
753
- df=self._mdata[modality].obs,
754
- source=self._sources.get(modality, {}).get("columns"),
755
- exclude=self._exclude.get(modality, {}).get("columns"),
756
- **self._kwargs, # type: ignore
757
- **kwargs,
758
- )
759
-
760
- def add_new_from_var_index(
761
- self, modality: str, organism: str | None = None, **kwargs
762
- ):
763
- """Update variable records.
764
-
765
- Args:
766
- modality: The modality name.
767
- organism: The organism name.
768
- **kwargs: Additional keyword arguments to pass to the registry model.
769
- """
770
- self._kwargs.update({"organism": organism} if organism else {})
771
- self._save_from_var_index_modality(
772
- modality=modality, validated_only=False, **self._kwargs, **kwargs
773
- )
774
-
775
- def add_validated_from_var_index(self, modality: str, organism: str | None = None):
776
- """Add validated variable records.
777
-
778
- Args:
779
- modality: The modality name.
780
- organism: The organism name.
781
- """
782
- self._kwargs.update({"organism": organism} if organism else {})
783
- self._save_from_var_index_modality(
784
- modality=modality, validated_only=True, **self._kwargs
785
- )
786
-
787
- def add_validated_from(
788
- self, key: str, modality: str | None = None, organism: str | None = None
789
- ):
790
- """Add validated categories.
791
-
792
- Args:
793
- key: The key referencing the slot in the DataFrame.
794
- modality: The modality name.
795
- organism: The organism name.
796
- """
797
- self._kwargs.update({"organism": organism} if organism else {})
798
- modality = modality or "obs"
799
- if modality in self._df_annotators:
800
- df_annotator = self._df_annotators[modality]
801
- df_annotator.add_validated_from(key=key, **self._kwargs)
802
-
803
- def add_new_from(
804
- self,
805
- key: str,
806
- modality: str | None = None,
807
- organism: str | None = None,
808
- **kwargs,
809
- ):
810
- """Add validated & new categories.
811
-
812
- Args:
813
- key: The key referencing the slot in the DataFrame.
814
- modality: The modality name.
815
- organism: The organism name.
816
- **kwargs: Additional keyword arguments to pass to the registry model.
817
- """
818
- if len(kwargs) > 0 and key == "all":
819
- raise ValueError("Cannot pass additional arguments to 'all' key!")
820
- self._kwargs.update({"organism": organism} if organism else {})
821
- modality = modality or "obs"
822
- if modality in self._df_annotators:
823
- df_annotator = self._df_annotators[modality]
824
- df_annotator.add_new_from(key=key, **self._kwargs, **kwargs)
825
-
826
def validate(self, organism: str | None = None) -> bool:
    """Validate the var index and obs categories of every modality.

    Args:
        organism: The organism name; when truthy it is stored in the
            curator's keyword arguments before validating.

    Returns:
        ``True`` only if all var indices and all obs fields validate.
    """
    if organism:
        self._kwargs["organism"] = organism
    if self._using_key not in (None, "default"):
        logger.important(
            f"validating metadata using registries of instance {colors.italic(self._using_key)}"
        )

    # Pass 1: the var index of each modality.
    var_ok = True
    var_failures = {}
    for modality, var_field in self._var_fields.items():
        sources = self._sources.get(modality, {})
        excluded = self._exclude.get(modality, {})
        ok, failed = validate_categories(
            self._mdata[modality].var.index,
            field=var_field,
            key=f"{modality}_var_index",
            using_key=self._using_key,
            source=sources.get("var_index"),
            exclude=excluded.get("var_index"),
            **self._kwargs,  # type: ignore
        )
        var_ok &= ok
        if len(failed) > 0:
            var_failures[modality] = failed

    # Pass 2: the obs columns; the "obs" key refers to the global obs table.
    obs_ok = True
    obs_failures = {}
    for modality, fields in self._obs_fields.items():
        obs = self._mdata.obs if modality == "obs" else self._mdata[modality].obs
        ok, failed = validate_categories_in_df(
            obs,
            fields=fields,
            using_key=self._using_key,
            sources=self._sources.get(modality),
            exclude=self._exclude.get(modality),
            **self._kwargs,
        )
        obs_ok &= ok
        obs_failures[modality] = failed
        if modality in var_failures:
            failed["var_index"] = var_failures[modality]
        # NOTE(review): this keeps only the most recent modality with
        # failures; earlier modalities are overwritten — confirm intended.
        if len(failed) > 0:
            self._non_validated = failed

    self._validated = var_ok and obs_ok
    return self._validated
874
-
875
def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
    """Save the validated ``MuData`` and its metadata as an artifact.

    Args:
        description: Description of the ``MuData`` object.
        **kwargs: Object level metadata.

    Returns:
        A saved artifact record.

    Raises:
        ValidationError: If the data has not been validated yet.
    """
    if not self._validated:
        raise ValidationError("Please run `validate()` first!")

    # Delegates to the module-level ``save_artifact`` helper.
    artifact = save_artifact(
        self._mdata,
        description=description,
        columns_field=self.var_index,
        fields=self.categoricals,
        **self._kwargs,
        **kwargs,
    )
    self._artifact = artifact
    return artifact
897
-
898
-
899
class Curator(BaseCurator):
    """Dataset curator.

    Data curation entails accurately labeling datasets with standardized metadata
    to facilitate data integration, interpretation and analysis.

    The curation flow has several steps:

    1. Instantiate `Curator` from one of the following dataset objects:

    - :meth:`~lamindb.Curator.from_df`
    - :meth:`~lamindb.Curator.from_anndata`
    - :meth:`~lamindb.Curator.from_mudata`

    During object creation, any passed categoricals found in the object will be saved.

    2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:

    - Values that can be successfully validated and already exist in the registry.
    - Values which are new and not yet validated or potentially problematic values.

    3. Determine how to handle validated and non-validated values:

    - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
    - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
    - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
    """

    # Each factory's docstring is a ``{}`` template that ``doc_args`` is given
    # the matching curator class docstring for; the templates stay verbatim.

    @classmethod
    @doc_args(DataFrameCurator.__doc__)
    def from_df(
        cls,
        df: pd.DataFrame,
        categoricals: dict[str, FieldAttr] | None = None,
        columns: FieldAttr = Feature.name,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
    ) -> DataFrameCurator:
        """{}"""  # noqa: D415
        options = {
            "categoricals": categoricals,
            "columns": columns,
            "using_key": using_key,
            "verbosity": verbosity,
            "organism": organism,
        }
        return DataFrameCurator(df=df, **options)

    @classmethod
    @doc_args(AnnDataCurator.__doc__)
    def from_anndata(
        cls,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
    ) -> AnnDataCurator:
        """{}"""  # noqa: D415
        options = {
            "var_index": var_index,
            "categoricals": categoricals,
            "obs_columns": obs_columns,
            "using_key": using_key,
            "verbosity": verbosity,
            "organism": organism,
            "sources": sources,
        }
        return AnnDataCurator(data=data, **options)

    @classmethod
    @doc_args(MuDataCurator.__doc__)
    def from_mudata(
        cls,
        mdata: MuData,
        var_index: dict[str, dict[str, FieldAttr]],
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
    ) -> MuDataCurator:
        """{}"""  # noqa: D415
        options = {
            "var_index": var_index,
            "categoricals": categoricals,
            "using_key": using_key,
            "verbosity": verbosity,
            "organism": organism,
        }
        return MuDataCurator(mdata=mdata, **options)
993
-
994
-
995
def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
    """Return ``registry``, bound to ``using_key`` when that names another instance.

    Args:
        registry: The registry to query.
        using_key: Instance key; ``None`` and ``"default"`` leave the
            registry untouched.

    Returns:
        ``registry.using(using_key)`` for a non-default key, else ``registry``.
    """
    if using_key is None or using_key == "default":
        return registry
    return registry.using(using_key)
1000
-
1001
-
1002
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
    """Make sure the source and organism are saved in the same database as the registry.

    Args:
        registry: The registry whose database decides where records must live.
        kwargs: Filter keyword arguments; only ``"organism"`` and ``"source"``
            are inspected. The dict itself is not modified.

    Returns:
        A copy of ``kwargs``. If the registry queries the default database,
        any ``organism``/``source`` record coming from a non-default database
        is replaced by a shallow copy that has been saved.
    """
    from lamindb.core._settings import settings

    db = registry.filter().db
    registry_in_default = db is None or db == "default"
    filter_kwargs = kwargs.copy()

    # Read the verbosity before entering ``try`` so that ``finally`` always
    # has a value to restore and cannot mask an earlier error.
    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"
        # organism first, then source, as two independent checks
        for name in ("organism", "source"):
            record = kwargs.get(name)
            if (
                isinstance(record, Record)
                and record._state.db != "default"
                and registry_in_default
            ):
                record_default = copy.copy(record)
                # save the record in the default database
                record_default.save()
                filter_kwargs[name] = record_default
    finally:
        settings.verbosity = verbosity
    return filter_kwargs
1028
-
1029
-
1030
def standardize_and_inspect(
    values: Iterable[str],
    field: FieldAttr,
    registry: type[Record],
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
):
    """Standardize and inspect values using a registry.

    Args:
        values: The values to inspect.
        field: The field attribute to inspect against.
        registry: The registry used for ``inspect`` and ``standardize``.
        standardize: Whether to standardize before inspecting; only applied
            if the registry has both ``standardize`` and ``synonyms``.
        exclude: Values that are inspected separately, without ``kwargs``.
        **kwargs: Passed on to ``standardize`` and the main ``inspect`` call.

    Returns:
        The inspect result of the main call, with validated excluded values
        appended to its validated list and removed from its non-validated list.
    """
    candidates = list(values)
    pre_validated = []
    if exclude is not None:
        if isinstance(exclude, str):
            exclude = [exclude]
        present = [item for item in exclude if item in candidates]
        if present:
            # exclude values are validated without source and organism
            excluded_result = registry.inspect(present, field=field, mute=True)
            # validated exclude values are taken out of the main inspection
            candidates = [
                item for item in candidates if item not in excluded_result.validated
            ]
            pre_validated = excluded_result.validated

    if (
        standardize
        and hasattr(registry, "standardize")
        # https://github.com/laminlabs/lamindb/issues/1685
        and hasattr(registry, "synonyms")
    ):
        candidates = registry.standardize(
            candidates, field=field, mute=True, **kwargs
        )

    result = registry.inspect(candidates, field=field, mute=True, **kwargs)
    result._validated += pre_validated
    result._non_validated = [
        item for item in result.non_validated if item not in pre_validated
    ]
    return result
1069
-
1070
-
1071
def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
    """Return the organism filter a registry needs, if any.

    Args:
        registry: The registry to check for an ``organism_id`` attribute.
        organism: The organism name; falls back to ``bt.settings.organism``.

    Returns:
        ``{"organism": <name>}`` for registries with ``organism_id``,
        otherwise an empty dict.

    Raises:
        ValueError: If an organism is required but neither passed nor set
            in the bionty settings.
    """
    if not hasattr(registry, "organism_id"):
        return {}

    import bionty as bt

    if organism is None and bt.settings.organism is None:
        raise ValueError(
            f"{registry.__name__} registry requires an organism!\n"
            " → please pass an organism name via organism="
        )
    return {"organism": organism or bt.settings.organism.name}
1083
-
1084
-
1085
def validate_categories(
    values: Iterable[str],
    field: FieldAttr,
    key: str,
    using_key: str | None = None,
    organism: str | None = None,
    source: Record | None = None,
    exclude: str | list | None = None,
    standardize: bool = True,
    validated_hint_print: str | None = None,
) -> tuple[bool, list]:
    """Validate ontology terms in a pandas series using LaminDB registries.

    Values are checked in up to three places, each step only looking at what
    the previous one left unvalidated: the current instance, the ``using_key``
    instance (if it is not the default), and, for registries that have a
    ``public`` attribute, the records returned by ``registry.from_values``.

    Args:
        values: The values to validate.
        field: The field attribute.
        key: The key referencing the slot in the DataFrame.
        using_key: A reference LaminDB instance.
        organism: The organism name.
        source: The source record.
        exclude: Exclude specific values.
        standardize: Standardize the values.
        validated_hint_print: The hint to print for validated values.

    Returns:
        ``(True, [])`` if every value validates in the current instance;
        ``(False, [])`` if the remaining values were only found in the using
        instance or via ``from_values`` and still need to be saved;
        ``(False, non_validated)`` if some values were found nowhere.
    """
    from lamindb._from_values import _print_values
    from lamindb.core._settings import settings

    model_field = f"{field.field.model.__name__}.{field.field.name}"

    def _log_mapping_info():
        # Log the header without indent, then indent the messages that follow.
        logger.indent = ""
        logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}")
        logger.indent = " "

    registry = field.field.model

    # ``kwargs`` is used for the using instance; ``kwargs_current`` is the
    # variant adjusted for the current instance.
    kwargs = check_registry_organism(registry, organism)
    kwargs.update({"source": source} if source else {})
    kwargs_current = get_current_filter_kwargs(registry, kwargs)

    # inspect the default instance
    inspect_result = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry,
        standardize=standardize,
        exclude=exclude,
        **kwargs_current,
    )
    non_validated = inspect_result.non_validated

    # inspect the using instance
    values_validated = []
    if using_key is not None and using_key != "default" and non_validated:
        registry_using = get_registry_instance(registry, using_key)
        inspect_result = standardize_and_inspect(
            values=non_validated,
            field=field,
            registry=registry_using,
            standardize=standardize,
            exclude=exclude,
            **kwargs,
        )
        non_validated = inspect_result.non_validated
        values_validated += inspect_result.validated

    # inspect from public (bionty only)
    if hasattr(registry, "public"):
        verbosity = settings.verbosity
        try:
            # silence logging while the records are looked up
            settings.verbosity = "error"
            public_records = registry.from_values(
                non_validated,
                field=field,
                **kwargs_current,
            )
            values_validated += [getattr(r, field.field.name) for r in public_records]
        finally:
            settings.verbosity = verbosity

    validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
    n_validated = len(values_validated)
    if n_validated > 0:
        _log_mapping_info()
        logger.warning(
            f"found {colors.yellow(n_validated)} validated terms: "
            f"{colors.yellow(values_validated)}\n → save terms via "
            f"{colors.yellow(validated_hint_print)}"
        )

    non_validated_hint_print = f".add_new_from('{key}')"
    # drop the values that were validated by the using instance or from_values
    non_validated = [i for i in non_validated if i not in values_validated]
    n_non_validated = len(non_validated)
    if n_non_validated == 0:
        if n_validated == 0:
            logger.indent = ""
            logger.success(f"{key} is validated against {colors.italic(model_field)}")
            return True, []
        else:
            # validated values still need to be saved to the current instance
            return False, []
    else:
        are = "are" if n_non_validated > 1 else "is"
        print_values = _print_values(non_validated)
        warning_message = (
            f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
            f"{colors.red(print_values)}\n → fix typos, remove non-existent values, or save terms via "
            f"{colors.red(non_validated_hint_print)}"
        )
        # an empty indent means the mapping header has not been logged yet
        if logger.indent == "":
            _log_mapping_info()
        logger.warning(warning_message)
        logger.indent = ""
        return False, non_validated
1199
-
1200
-
1201
def validate_categories_in_df(
    df: pd.DataFrame,
    fields: dict[str, FieldAttr],
    using_key: str | None = None,
    sources: dict[str, Record] = None,
    exclude: dict | None = None,
    **kwargs,
) -> tuple[bool, dict]:
    """Validate categories in DataFrame columns using LaminDB registries.

    Args:
        df: The DataFrame holding the columns to validate.
        fields: Mapping of column name to registry field.
        using_key: A reference LaminDB instance.
        sources: Mapping of column name to source record.
        exclude: Mapping of column name to values to exclude.
        **kwargs: Passed on to ``validate_categories`` for every column.

    Returns:
        A tuple of the overall validation flag and a dict mapping each column
        with failures to its non-validated values.
    """
    if not fields:
        return True, {}

    sources = {} if sources is None else sources
    all_valid = True
    failures = {}
    for column, registry_field in fields.items():
        column_exclude = exclude.get(column) if exclude else None
        ok, failed = validate_categories(
            df[column],
            field=registry_field,
            key=column,
            using_key=using_key,
            source=sources.get(column),
            exclude=column_exclude,
            **kwargs,
        )
        all_valid &= ok
        if len(failed) > 0:
            failures[column] = failed
    return all_valid, failures
1231
-
1232
-
1233
def save_artifact(
    data: pd.DataFrame | ad.AnnData | MuData,
    fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
    columns_field: FieldAttr | dict[str, FieldAttr],
    description: str | None = None,
    organism: str | None = None,
    adata: ad.AnnData | None = None,
    **kwargs,
) -> Artifact:
    """Save all metadata with an Artifact.

    Creates and saves the artifact, then links its feature sets and the
    labels found in the categorical columns.

    Args:
        data: The DataFrame or AnnData object to save.
        description: A description of the artifact.
        fields: A dictionary mapping obs_column to registry_field.
            For MuData, a dictionary of such dictionaries keyed by modality.
        columns_field: The registry field to validate variables index against.
        organism: The organism name.
        adata: The AnnData object to save, must be provided if data is a path.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        The saved Artifact.

    Raises:
        ValueError: If ``data`` is not a DataFrame, AnnData or MuData object.
        NotImplementedError: If the artifact accessor is not one of
            ``"DataFrame"``, ``"AnnData"`` or ``"MuData"``.
    """
    from ._artifact import data_is_anndata

    artifact = None
    if data_is_anndata(data):
        # NOTE(review): this assert is stripped under ``python -O``.
        assert adata is not None  # noqa: S101
        artifact = Artifact.from_anndata(data, description=description, **kwargs)
        artifact.n_observations = adata.shape[0]
        # from here on, work with the in-memory object
        data = adata

    elif isinstance(data, pd.DataFrame):
        artifact = Artifact.from_df(data, description=description, **kwargs)
    else:
        try:
            from mudata import MuData

            if isinstance(data, MuData):
                artifact = Artifact.from_mudata(data, description=description, **kwargs)
                artifact.n_observations = data.n_obs
        except ImportError:
            # without mudata installed, ``artifact`` stays None and the
            # ValueError below is raised
            pass
    if artifact is None:
        raise ValueError("data must be a DataFrame, AnnData or MuData object.")
    artifact.save()

    # for a dict of fields, the first one decides whether an organism is needed
    feature_kwargs = check_registry_organism(
        (
            list(columns_field.values())[0].field.model
            if isinstance(columns_field, dict)
            else columns_field.field.model
        ),
        organism,
    )

    if artifact._accessor == "DataFrame":
        artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
    elif artifact._accessor == "AnnData":
        artifact.features._add_set_from_anndata(
            var_field=columns_field, **feature_kwargs
        )
    elif artifact._accessor == "MuData":
        artifact.features._add_set_from_mudata(
            var_fields=columns_field, **feature_kwargs
        )
    else:
        raise NotImplementedError

    def _add_labels(data, artifact: Artifact, fields: dict[str, FieldAttr]):
        # Link the labels of every categorical column to the artifact.
        features = Feature.lookup().dict()
        for key, field in fields.items():
            feature = features.get(key)
            registry = field.field.model
            filter_kwargs = check_registry_organism(registry, organism)
            filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
            # non-DataFrame objects keep their categorical columns in ``.obs``
            df = data if isinstance(data, pd.DataFrame) else data.obs
            labels = registry.from_values(
                df[key],
                field=field,
                **filter_kwargs_current,
            )
            artifact.labels.add(labels, feature)

    if artifact._accessor == "MuData":
        # ``fields`` is nested per modality; "obs" refers to the global obs
        for modality, modality_fields in fields.items():
            if modality == "obs":
                _add_labels(data, artifact, modality_fields)
            else:
                _add_labels(data[modality], artifact, modality_fields)
    else:
        _add_labels(data, artifact, fields)

    slug = ln_setup.settings.instance.slug
    if ln_setup.settings.instance.is_remote:  # pragma: no cover
        logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
    return artifact
1330
-
1331
-
1332
def update_registry(
    values: list[str],
    field: FieldAttr,
    key: str,
    save_function: str = "add_new_from",
    using_key: str | None = None,
    validated_only: bool = True,
    df: pd.DataFrame | None = None,
    organism: str | None = None,
    dtype: str | None = None,
    source: Record | None = None,
    standardize: bool = True,
    warning: bool = True,
    exclude: str | list | None = None,
    **kwargs,
) -> None:
    """Save features or labels records in the default instance from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        key: The name of the feature to save.
        save_function: The name of the function to save the labels.
        using_key: The name of the instance from which to transfer labels (if applicable).
        validated_only: If True, only save validated labels.
        df: A DataFrame to save labels from.
        organism: The organism name.
        dtype: The type of the feature.
        source: The source record.
        standardize: Whether to standardize values when inspecting the current instance.
        warning: Whether unsaved values are logged as a warning rather than as info.
        exclude: Values to exclude, passed on to the inspection helpers.
        kwargs: Additional keyword arguments to pass to the registry model to create new records.

    Note:
        If every remaining value already validates in the current instance,
        the function returns the matching records early and skips the
        summary logging, despite the ``None`` return annotation.
    """
    from lamindb._save import save as ln_save
    from lamindb.core._settings import settings

    registry = field.field.model
    filter_kwargs = check_registry_organism(registry, organism)
    filter_kwargs.update({"source": source} if source else {})

    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"

        # save from public
        filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
        existing_and_public_records = (
            registry.from_values(
                list(values),
                field=field,
                **filter_kwargs_current,
            )
            if values
            else []
        )

        labels_saved: dict = {"from public": [], "without reference": []}

        # records that are still in the "adding" state are not yet saved
        public_records = [r for r in existing_and_public_records if r._state.adding]
        # here we check to only save the public records if they are from the specified source
        # we check the uid because r.source and source can be from different instances
        if source:
            public_records = [r for r in public_records if r.source.uid == source.uid]
        ln_save(public_records)
        labels_saved["from public"] = [
            getattr(r, field.field.name) for r in public_records
        ]
        non_public_labels = [i for i in values if i not in labels_saved["from public"]]

        # inspect the default instance
        inspect_result_current = standardize_and_inspect(
            values=non_public_labels,
            field=field,
            registry=registry,
            standardize=standardize,
            exclude=exclude,
            **filter_kwargs_current,
        )
        if not inspect_result_current.non_validated:
            # nothing left to transfer; ``finally`` restores the verbosity
            all_labels = registry.from_values(
                inspect_result_current.validated,
                field=field,
                **filter_kwargs_current,
            )
            return all_labels

        # inspect the using_key instance
        (
            labels_saved[f"from {using_key}"],
            non_validated_labels,
        ) = update_registry_from_using_instance(
            inspect_result_current.non_validated,
            field=field,
            using_key=using_key,
            exclude=exclude,
            **filter_kwargs,
        )

        labels_saved["without reference"] = [
            i
            for i in non_validated_labels
            if i not in labels_saved[f"from {using_key}"]
        ]

        # save non-validated records
        if not validated_only:
            non_validated_records = []
            if df is not None and registry == Feature:
                non_validated_records = Feature.from_df(df)
            else:
                if "organism" in filter_kwargs:
                    # make sure organism record is saved to the current instance;
                    # use the resolved name, since ``organism`` is None when the
                    # name was taken from the bionty settings
                    filter_kwargs["organism"] = _save_organism(
                        name=filter_kwargs["organism"]
                    )
                init_kwargs = {}
                for value in labels_saved["without reference"]:
                    init_kwargs[field.field.name] = value
                    if registry == Feature:
                        init_kwargs["dtype"] = "cat" if dtype is None else dtype
                    non_validated_records.append(
                        registry(
                            **init_kwargs,
                            **{k: v for k, v in filter_kwargs.items() if k != "source"},
                            **{k: v for k, v in kwargs.items() if k != "sources"},
                        )
                    )
            ln_save(non_validated_records)

        # save parent labels for ulabels
        if registry == ULabel and field.field.name == "name":
            save_ulabels_with_parent(values, field=field, key=key)
    finally:
        settings.verbosity = verbosity

    log_saved_labels(
        labels_saved,
        key=key,
        save_function=save_function,
        model_field=f"{registry.__name__}.{field.field.name}",
        validated_only=validated_only,
        warning=warning,
    )
1481
-
1482
-
1483
def log_saved_labels(
    labels_saved: dict,
    key: str,
    save_function: str,
    model_field: str,
    validated_only: bool = True,
    warning: bool = True,
) -> None:
    """Log the saved labels.

    Args:
        labels_saved: Mapping of an origin description to the labels from it.
        key: The name of the feature the labels belong to.
        save_function: The name of the function that saves the labels.
        model_field: The ``Registry.field`` name shown in the messages.
        validated_only: Whether "without reference" labels were left unsaved.
        warning: Log unsaved labels as a warning (else as info).
    """
    from ._from_values import _print_values

    model_field = colors.italic(model_field)
    for origin, labels in labels_saved.items():
        if not labels:
            continue

        unsaved = origin == "without reference" and validated_only
        if not unsaved:
            prefix = "" if origin == "without reference" else f"{colors.green(origin)} "
            # the term "transferred" stresses that this is always in the context of transferring
            # labels from a public ontology or a different instance to the present instance
            plural = "s" if len(labels) > 1 else ""
            logger.success(
                f"added {len(labels)} record{plural} {prefix}with {model_field} for {colors.italic(key)}: {_print_values(labels)}"
            )
            continue

        msg = colors.yellow(
            f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
        )
        if key.isidentifier():
            lookup_print = f"lookup().{key}"
        else:
            lookup_print = f".lookup()['{key}']"
        msg += f"\n → to lookup values, use {lookup_print}"

        hint = f".add_new_from('{key}')"
        save_hint = hint if save_function == "add_new_from" else save_function
        msg += f"\n → to save, run {colors.yellow(save_hint)}"

        emit = logger.warning if warning else logger.info
        emit(msg)
1526
-
1527
-
1528
def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None:
    """Attach the given labels as children of an ``is_<key>`` parent label.

    The parent is looked up by name and created and saved if it is missing.

    Args:
        values: The label values to attach.
        field: The field attribute; its model must be ``ULabel``.
        key: The feature name used to build the parent's name.
    """
    registry = field.field.model
    assert registry == ULabel  # noqa: S101
    children = registry.from_values(list(values), field=field)
    parent_name = f"is_{key}"
    parent = registry.filter(name=parent_name).one_or_none()
    if parent is None:
        parent = registry(name=parent_name)
        parent.save()
    parent.children.add(*children)
1538
-
1539
-
1540
def update_registry_from_using_instance(
    values: list[str],
    field: FieldAttr,
    using_key: str | None = None,
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
) -> tuple[list[str], list[str]]:
    """Save features or labels records from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        using_key: The name of the instance from which to transfer labels (if applicable).
        standardize: Whether to also standardize the values.
        exclude: Values to exclude, passed on to the inspection.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        A tuple containing the list of saved labels and the list of non-saved labels.
        Without a non-default ``using_key`` this is ``([], values)``.
    """
    if using_key is None or using_key == "default":
        return [], values

    field_name = field.field.name
    registry_using = get_registry_instance(field.field.model, using_key)
    inspect_result_using = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry_using,
        standardize=standardize,
        exclude=exclude,
        **kwargs,
    )
    matches = registry_using.filter(
        **{f"{field_name}__in": inspect_result_using.validated}
    ).all()

    saved = []
    for record in matches:
        record.save()
        saved.append(getattr(record, field_name))
    return saved, inspect_result_using.non_validated
1583
-
1584
-
1585
def _save_organism(name: str):  # pragma: no cover
    """Return the organism record called ``name``, saving it from its source if needed.

    Args:
        name: The organism name.

    Returns:
        The existing record, or the one created via ``from_source`` and saved.

    Raises:
        ValueError: If the organism is neither in the registry nor
            available via ``bt.Organism.from_source``.
    """
    import bionty as bt

    existing = bt.Organism.filter(name=name).one_or_none()
    if existing is not None:
        return existing

    organism = bt.Organism.from_source(name=name)
    if organism is None:
        raise ValueError(
            f"Organism '{name}' not found\n"
            f" → please save it: bt.Organism(name='{name}').save()"
        )
    organism.save()
    return organism
1599
-
1600
-
1601
Curate = Curator  # backward compat: alias for the ``Curator`` class
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ from typing import TYPE_CHECKING, Iterable
5
+
6
+ import anndata as ad
7
+ import lamindb_setup as ln_setup
8
+ import pandas as pd
9
+ from lamin_utils import colors, logger
10
+ from lamindb_setup.core._docs import doc_args
11
+ from lnschema_core import (
12
+ Artifact,
13
+ Feature,
14
+ Record,
15
+ Run,
16
+ ULabel,
17
+ )
18
+
19
+ from .core.exceptions import ValidationError
20
+
21
+ if TYPE_CHECKING:
22
+ from lamindb_setup.core.types import UPathStr
23
+ from lnschema_core.types import FieldAttr
24
+ from mudata import MuData
25
+
26
+
27
class CurateLookup:
    """Lookup categories from the reference instance.

    Fields are reachable as attributes (``lookup.cell_type``) or, for keys
    that are not valid identifiers, by subscription (``lookup["cell type"]``).

    Args:
        categoricals: Mapping of key to registry field.
        slots: Additional key to registry field mappings, merged over
            ``categoricals``.
        using_key: The instance to look up from; ``"default"`` is treated
            like ``None`` and ``"public"`` selects the public reference.
    """

    def __init__(
        self,
        categoricals: dict[str, FieldAttr],
        slots: dict[str, FieldAttr] = None,
        using_key: str | None = None,
    ) -> None:
        if slots is None:
            slots = {}
        self._fields = {**categoricals, **slots}
        self._using_key = None if using_key == "default" else using_key
        self._using_key_name = self._using_key or ln_setup.settings.instance.slug
        debug_message = (
            f"Lookup objects from the {colors.italic(self._using_key_name)}"
        )
        logger.debug(debug_message)

    def _lookup(self, name):
        """Return the lookup object for ``name`` or raise ``AttributeError``."""
        # Read ``_fields`` via ``__dict__``: on an instance that never ran
        # ``__init__`` (``__new__``, copy, unpickling), ``self._fields`` would
        # re-enter ``__getattr__`` and recurse until RecursionError.
        fields = self.__dict__.get("_fields", {})
        if name not in fields:
            raise AttributeError(
                f"'{self.__class__.__name__}' object has no attribute '{name}'"
            )
        registry = fields[name].field.model
        if self._using_key == "public":
            return registry.public().lookup()
        return get_registry_instance(registry, self._using_key).lookup()

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails.
        return self._lookup(name)

    def __getitem__(self, name):
        # Unknown keys raise AttributeError (not KeyError), matching the
        # attribute-style access.
        return self._lookup(name)

    def __repr__(self) -> str:
        if len(self._fields) > 0:
            # identifier keys are shown as attributes, all others as subscripts
            getattr_keys = "\n ".join(
                [f".{key}" for key in self._fields if key.isidentifier()]
            )
            getitem_keys = "\n ".join(
                [str([key]) for key in self._fields if not key.isidentifier()]
            )
            return (
                f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
                f"{colors.green(getattr_keys)}\n "
                f"{colors.green(getitem_keys)}\n\n"
                "Example:\n → categories = validator.lookup().cell_type\n"
                " → categories.alveolar_type_1_fibroblast_cell"
            )
        else:  # pragma: no cover
            return colors.warning("No fields are found!")
85
+
86
+
87
class BaseCurator:
    """Curate a dataset.

    Base class whose methods are placeholders that return ``None``.
    """

    def validate(self) -> bool:
        """Validate dataset.

        Returns:
            Boolean indicating whether the dataset is validated.
        """
        return None

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the dataset as artifact.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.
        """
        return None
109
+
110
+
111
class DataFrameCurator(BaseCurator):
    """Curation flow for a DataFrame object.

    See also :class:`~lamindb.Curator`.

    Args:
        df: The DataFrame object to curate.
        columns: The field attribute for the feature column.
        categoricals: A dictionary mapping column names to registry_field.
        using_key: The reference instance containing registries to validate against.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping column names to Source records.
        exclude: A dictionary mapping column names to values to exclude.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_df(
        ...     df,
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     }
        ... )
    """

    def __init__(
        self,
        df: pd.DataFrame,
        columns: FieldAttr = Feature.name,
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
        check_valid_keys: bool = True,
    ) -> None:
        from lamindb.core._settings import settings

        self._df = df
        self._fields = categoricals or {}
        self._columns_field = columns
        self._using_key = using_key
        # NOTE: this assigns to the shared `settings` object, so the verbosity
        # change is not limited to this curator instance.
        settings.verbosity = verbosity
        self._artifact = None
        self._collection = None
        self._validated = False
        # only forward an organism if one was actually given
        self._kwargs = {"organism": organism} if organism else {}
        self._sources = {} if sources is None else sources
        self._exclude = {} if exclude is None else exclude
        # stays None until validate() has been run
        self._non_validated = None
        # subclasses pass check_valid_keys=False and run the check themselves
        # with additional allowed keys
        if check_valid_keys:
            self._check_valid_keys()
        self._save_columns()

    @property
    def non_validated(self) -> dict:
        """Return the non-validated features and labels.

        Raises:
            ValueError: If ``validate()`` has not been run yet.
        """
        if self._non_validated is None:
            raise ValueError("Please run validate() first!")
        return self._non_validated

    @property
    def fields(self) -> dict:
        """Return the columns fields to validate against."""
        return self._fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        return CurateLookup(
            categoricals=self._fields,
            slots={"columns": self._columns_field},
            using_key=using_key or self._using_key,
        )

    def _check_valid_keys(self, extra: set | None = None) -> None:
        """Check that categoricals, sources and exclude only use allowed keys.

        Allowed keys are the DataFrame's column names, the literal
        ``"columns"`` and anything in ``extra``.

        Args:
            extra: Additional keys to accept.

        Raises:
            TypeError: If one of the three arguments is not a dictionary.
            ValueError: If a dictionary contains a key that is not allowed.
        """
        # the allowed keys do not depend on the dictionary being checked
        valid_keys = set(self._df.columns) | {"columns"} | (extra or set())
        for name, d in {
            "categoricals": self._fields,
            "sources": self._sources,
            "exclude": self._exclude,
        }.items():
            if not isinstance(d, dict):
                raise TypeError(f"{name} must be a dictionary!")
            nonval_keys = [key for key in d if key not in valid_keys]
            if nonval_keys:
                raise ValueError(
                    f"the following keys passed to {name} are not allowed: {nonval_keys}"
                )

    def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
        """Save column name records.

        Args:
            validated_only: Applied to the columns that are not keys of the
                categoricals; the categorical columns are always saved with
                ``validated_only=False``.
            **kwargs: Forwarded to ``update_registry``.
        """
        # Always save features specified as the fields keys
        update_registry(
            values=list(self.fields.keys()),
            field=self._columns_field,
            key="columns",
            save_function="add_new_from_columns",
            using_key=self._using_key,
            validated_only=False,
            source=self._sources.get("columns"),
            exclude=self._exclude.get("columns"),
            **kwargs,
        )

        # Save the rest of the columns based on validated_only
        additional_columns = set(self._df.columns) - set(self.fields.keys())
        if additional_columns:
            update_registry(
                values=list(additional_columns),
                field=self._columns_field,
                key="columns",
                save_function="add_new_from_columns",
                using_key=self._using_key,
                validated_only=validated_only,
                df=self._df,  # Get the Feature type from df
                source=self._sources.get("columns"),
                exclude=self._exclude.get("columns"),
                warning=False,  # Do not warn about missing columns, just an info message
                **kwargs,
            )

    def add_validated_from(self, key: str, organism: str | None = None):
        """Add validated categories.

        Args:
            key: The key referencing the slot in the DataFrame.
            organism: The organism name.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._update_registry(key, validated_only=True, **self._kwargs)

    def add_new_from(self, key: str, organism: str | None = None, **kwargs):
        """Add validated & new categories.

        Args:
            key: The key referencing the slot in the DataFrame from which to draw terms.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.

        Raises:
            ValueError: If extra keyword arguments are combined with ``key="all"``.
        """
        if kwargs and key == "all":
            raise ValueError("Cannot pass additional arguments to 'all' key!")
        if organism:
            self._kwargs["organism"] = organism
        self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)

    def add_new_from_columns(self, organism: str | None = None, **kwargs):
        """Add validated & new column names to its registry.

        Args:
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._save_columns(validated_only=False, **self._kwargs, **kwargs)

    def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
        """Dispatch a registry update for one key.

        ``"all"`` updates every categorical, ``"columns"`` saves the column
        names, any other value must be a key of the categoricals.

        Raises:
            ValueError: If ``categorical`` is not one of the categoricals.
        """
        if categorical == "all":
            self._update_registry_all(validated_only=validated_only, **kwargs)
        elif categorical == "columns":
            self._save_columns(validated_only=validated_only, **kwargs)
        else:
            if categorical not in self.fields:
                raise ValueError(f"Feature {categorical} is not part of the fields!")
            update_registry(
                values=self._df[categorical].unique().tolist(),
                field=self.fields[categorical],
                key=categorical,
                using_key=self._using_key,
                validated_only=validated_only,
                source=self._sources.get(categorical),
                exclude=self._exclude.get(categorical),
                **kwargs,
            )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Save labels for all features."""
        for name in self.fields.keys():
            logger.info(f"saving labels for '{name}'")
            self._update_registry(name, validated_only=validated_only, **kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate variables and categorical observations.

        Args:
            organism: The organism name.

        Returns:
            Whether the DataFrame is validated.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._validated, self._non_validated = validate_categories_in_df(  # type: ignore
            self._df,
            fields=self.fields,
            using_key=self._using_key,
            sources=self._sources,
            exclude=self._exclude,
            **self._kwargs,
        )
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated DataFrame and metadata.

        Runs ``validate()`` first if the data has not been validated yet.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If the dataset does not validate.
        """
        from lamindb.core._settings import settings

        if not self._validated:
            self.validate()
            if not self._validated:
                raise ValidationError("Dataset does not validate. Please curate.")

        # Make sure all labels are saved in the current instance
        verbosity = settings.verbosity
        try:
            # lower the verbosity while saving; restored in the finally clause
            settings.verbosity = "warning"
            # save all validated records to the current instance
            self.add_validated_from("all")

            self._artifact = save_artifact(
                self._df,
                description=description,
                fields=self.fields,
                columns_field=self._columns_field,
                **kwargs,
                **self._kwargs,
            )
        finally:
            settings.verbosity = verbosity

        return self._artifact

    def clean_up_failed_runs(self):
        """Clean up previous failed runs that don't save any outputs."""
        from lamindb.core._context import context

        if context.run is not None:
            # delete runs of the same transform without output artifacts,
            # keeping the currently tracked run
            Run.filter(transform=context.run.transform, output_artifacts=None).exclude(
                uid=context.run.uid
            ).delete()
+
370
+
371
class AnnDataCurator(DataFrameCurator):
    """Curation flow for ``AnnData``.

    See also :class:`~lamindb.Curator`.

    Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.

    See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.

    Args:
        data: The AnnData object or an AnnData-like path.
        var_index: The registry field for mapping the ``.var`` index.
        categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
        using_key: A reference LaminDB instance.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping ``.obs.columns`` to Source records.
        exclude: A dictionary mapping column names to values to exclude.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_anndata(
        ...     adata,
        ...     var_index=bt.Gene.ensembl_gene_id,
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     },
        ...     organism="human",
        ... )
    """

    def __init__(
        self,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> None:
        from lamindb_setup.core import upath

        from ._artifact import data_is_anndata

        if sources is None:
            sources = {}
        if not data_is_anndata(data):
            raise ValueError(
                "data has to be an AnnData object or a path to AnnData-like"
            )
        if isinstance(data, ad.AnnData):
            self._adata = data
        else:  # pragma: no cover
            from lamindb.core.storage._backed_access import backed_access

            # a path was passed: open it through backed access
            self._adata = backed_access(upath.create_path(data))

        # keep the original argument (object or path); it is what gets saved
        self._data = data
        self._var_field = var_index
        super().__init__(
            df=self._adata.obs,
            categoricals=categoricals,
            columns=obs_columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
            # the key check runs below, with "var_index" as an extra allowed key
            check_valid_keys=False,
        )
        self._obs_fields = categoricals or {}
        self._check_valid_keys(extra={"var_index"})

    @property
    def var_index(self) -> FieldAttr:
        """Return the registry field to validate variables index against."""
        return self._var_field

    @property
    def categoricals(self) -> dict:
        """Return the obs fields to validate against."""
        return self._obs_fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        return CurateLookup(
            categoricals=self._obs_fields,
            slots={"columns": self._columns_field, "var_index": self._var_field},
            using_key=using_key or self._using_key,
        )

    def _save_from_var_index(
        self, validated_only: bool = True, organism: str | None = None, **kwargs
    ):
        """Save variable records.

        Args:
            validated_only: Whether to save only validated values.
            organism: The organism name.
            **kwargs: Additional keyword arguments forwarded to
                ``update_registry``. Accepting them here is what makes the
                ``**kwargs`` of :meth:`add_new_from_var_index` usable.
        """
        update_registry(
            values=list(self._adata.var.index),
            field=self.var_index,
            key="var_index",
            save_function="add_new_from_var_index",
            using_key=self._using_key,
            validated_only=validated_only,
            organism=organism,
            source=self._sources.get("var_index"),
            exclude=self._exclude.get("var_index"),
            **kwargs,
        )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Save labels for all features."""
        for name in self.fields.keys():
            logger.info(f"saving labels for '{name}'")
            if name == "var_index":
                self._save_from_var_index(validated_only=validated_only, **kwargs)
            else:
                self._update_registry(name, validated_only=validated_only, **kwargs)

    def add_new_from_var_index(self, organism: str | None = None, **kwargs):
        """Update variable records.

        Args:
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)

    def add_validated_from_var_index(self, organism: str | None = None):
        """Add validated variable records.

        Args:
            organism: The organism name.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._save_from_var_index(validated_only=True, **self._kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate categories.

        Validates the ``.var`` index and the categorical ``.obs`` columns.

        Args:
            organism: The organism name.

        Returns:
            Whether the AnnData object is validated.
        """
        if organism:
            self._kwargs["organism"] = organism
        if self._using_key is not None and self._using_key != "default":
            logger.important(
                f"validating metadata using registries of instance {colors.italic(self._using_key)}"
            )

        validated_var, non_validated_var = validate_categories(
            self._adata.var.index,
            field=self._var_field,
            key="var_index",
            using_key=self._using_key,
            source=self._sources.get("var_index"),
            validated_hint_print=".add_validated_from_var_index()",
            exclude=self._exclude.get("var_index"),
            **self._kwargs,  # type: ignore
        )
        validated_obs, non_validated_obs = validate_categories_in_df(
            self._adata.obs,
            fields=self.categoricals,
            using_key=self._using_key,
            sources=self._sources,
            exclude=self._exclude,
            **self._kwargs,
        )
        self._non_validated = non_validated_obs  # type: ignore
        # non-validated variables are reported under the "var_index" key
        if len(non_validated_var) > 0:
            self._non_validated["var_index"] = non_validated_var  # type: ignore
        self._validated = validated_var and validated_obs
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated ``AnnData`` and metadata.

        Runs ``validate()`` first if the data has not been validated yet.

        Args:
            description: Description of the ``AnnData`` object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If the dataset does not validate.
        """
        if not self._validated:
            self.validate()
            if not self._validated:
                raise ValidationError("Dataset does not validate. Please curate.")

        self._artifact = save_artifact(
            self._data,
            adata=self._adata,
            description=description,
            columns_field=self.var_index,
            fields=self.categoricals,
            **self._kwargs,
            **kwargs,
        )
        return self._artifact
+
581
+
582
class MuDataCurator:
    """Curation flow for a ``MuData`` object.

    See also :class:`~lamindb.Curator`.

    Note that if genes or other measurements are removed from the MuData object,
    the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.

    Args:
        mdata: The MuData object to curate.
        var_index: The registry field for mapping the ``.var`` index for each modality.
            For example:
            ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
        categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
            Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
        using_key: A reference LaminDB instance.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping ``.obs.columns`` to Source records.
        exclude: A dictionary mapping column names to values to exclude.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_mudata(
        ...     mdata,
        ...     var_index={
        ...         "rna": bt.Gene.ensembl_gene_id,
        ...         "adt": ln.CellMarker.name
        ...     },
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     },
        ...     organism="human",
        ... )
    """

    def __init__(
        self,
        mdata: MuData,
        var_index: dict[str, FieldAttr],
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> None:
        # Initialise the state read by save_artifact()/validate() so that
        # calling save_artifact() before validate() raises the intended
        # ValidationError instead of an AttributeError.
        self._validated = False
        self._non_validated = None
        self._artifact = None
        self._sources = {} if sources is None else sources
        self._exclude = {} if exclude is None else exclude
        self._mdata = mdata
        # only forward an organism if one was actually given
        self._kwargs = {"organism": organism} if organism else {}
        self._var_fields = var_index
        self._verify_modality(self._var_fields.keys())
        self._obs_fields = self._parse_categoricals(categoricals)
        # may contain "obs", which stands for the top-level mdata.obs
        self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
        self._using_key = using_key
        self._verbosity = verbosity
        self._df_annotators = {
            modality: DataFrameCurator(
                df=mdata[modality].obs if modality != "obs" else mdata.obs,
                categoricals=self._obs_fields.get(modality, {}),
                using_key=using_key,
                verbosity=verbosity,
                sources=self._sources.get(modality),
                exclude=self._exclude.get(modality),
                check_valid_keys=False,
                **self._kwargs,
            )
            for modality in self._modalities
        }
        for modality in self._var_fields.keys():
            self._save_from_var_index_modality(
                modality=modality, validated_only=True, **self._kwargs
            )

    @property
    def var_index(self) -> dict[str, FieldAttr]:
        """Return the registry fields, per modality, to validate variables index against."""
        return self._var_fields

    @property
    def categoricals(self) -> dict:
        """Return the obs fields to validate against."""
        return self._obs_fields

    def _verify_modality(self, modalities: Iterable[str]):
        """Verify the modality exists.

        Raises:
            ValueError: If a modality is not a key of ``mdata.mod``.
        """
        for modality in modalities:
            if modality not in self._mdata.mod.keys():
                raise ValueError(f"modality '{modality}' does not exist!")

    def _save_from_var_index_modality(
        self, modality: str, validated_only: bool = True, **kwargs
    ):
        """Save variable records of one modality.

        Args:
            modality: The modality name.
            validated_only: Whether to save only validated values.
            **kwargs: Forwarded to ``update_registry``.
        """
        update_registry(
            values=list(self._mdata[modality].var.index),
            field=self._var_fields[modality],
            key="var_index",
            save_function="add_new_from_var_index",
            using_key=self._using_key,
            validated_only=validated_only,
            dtype="number",
            source=self._sources.get(modality, {}).get("var_index"),
            exclude=self._exclude.get(modality, {}).get("var_index"),
            **kwargs,
        )

    def _parse_categoricals(self, categoricals: dict[str, FieldAttr] | None) -> dict:
        """Parse the categorical fields.

        Keys of the form ``"<modality>:<column>"`` are grouped under their
        modality; all other keys are grouped under ``"obs"``.

        Args:
            categoricals: Mapping of ``mdata.obs`` column names to registry
                fields, or ``None`` for no categoricals.

        Returns:
            A dictionary mapping a modality name (or ``"obs"``) to a dictionary
            of column name to registry field.

        Raises:
            ValueError: If a key is not a column of ``mdata.obs``.
        """
        obs_fields: dict[str, dict[str, FieldAttr]] = {}
        # `categoricals` defaults to None in the constructor
        if not categoricals:
            return obs_fields
        prefixes = tuple(f"{k}:" for k in self._mdata.mod.keys())
        for k, v in categoricals.items():
            if k not in self._mdata.obs.columns:
                raise ValueError(f"column '{k}' does not exist in mdata.obs!")
            if k.startswith(prefixes):
                # split once only, so a column name that itself contains a
                # colon is not truncated
                modality, col = k.split(":", 1)
            else:
                modality, col = "obs", k
            obs_fields.setdefault(modality, {})[col] = v
        return obs_fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        return CurateLookup(
            categoricals=self._obs_fields,
            slots={
                **self._obs_fields,
                **{f"{k}_var_index": v for k, v in self._var_fields.items()},
            },
            using_key=using_key or self._using_key,
        )

    def add_new_from_columns(
        self,
        modality: str,
        column_names: list[str] | None = None,
        organism: str | None = None,
        **kwargs,
    ):
        """Update columns records.

        Args:
            modality: The modality name.
            column_names: The column names to save.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if organism:
            self._kwargs["organism"] = organism
        # default to all obs columns of the modality
        values = column_names or self._mdata[modality].obs.columns
        update_registry(
            values=list(values),
            field=Feature.name,
            key=f"{modality} obs columns",
            using_key=self._using_key,
            validated_only=False,
            df=self._mdata[modality].obs,
            source=self._sources.get(modality, {}).get("columns"),
            exclude=self._exclude.get(modality, {}).get("columns"),
            **self._kwargs,  # type: ignore
            **kwargs,
        )

    def add_new_from_var_index(
        self, modality: str, organism: str | None = None, **kwargs
    ):
        """Update variable records.

        Args:
            modality: The modality name.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._save_from_var_index_modality(
            modality=modality, validated_only=False, **self._kwargs, **kwargs
        )

    def add_validated_from_var_index(self, modality: str, organism: str | None = None):
        """Add validated variable records.

        Args:
            modality: The modality name.
            organism: The organism name.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._save_from_var_index_modality(
            modality=modality, validated_only=True, **self._kwargs
        )

    def add_validated_from(
        self, key: str, modality: str | None = None, organism: str | None = None
    ):
        """Add validated categories.

        Does nothing if there is no curator for the given modality.

        Args:
            key: The key referencing the slot in the DataFrame.
            modality: The modality name; ``None`` refers to the top-level ``obs``.
            organism: The organism name.
        """
        if organism:
            self._kwargs["organism"] = organism
        modality = modality or "obs"
        if modality in self._df_annotators:
            df_annotator = self._df_annotators[modality]
            df_annotator.add_validated_from(key=key, **self._kwargs)

    def add_new_from(
        self,
        key: str,
        modality: str | None = None,
        organism: str | None = None,
        **kwargs,
    ):
        """Add validated & new categories.

        Does nothing if there is no curator for the given modality.

        Args:
            key: The key referencing the slot in the DataFrame.
            modality: The modality name; ``None`` refers to the top-level ``obs``.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.

        Raises:
            ValueError: If extra keyword arguments are combined with ``key="all"``.
        """
        if kwargs and key == "all":
            raise ValueError("Cannot pass additional arguments to 'all' key!")
        if organism:
            self._kwargs["organism"] = organism
        modality = modality or "obs"
        if modality in self._df_annotators:
            df_annotator = self._df_annotators[modality]
            df_annotator.add_new_from(key=key, **self._kwargs, **kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate categories.

        Validates the ``.var`` index of every modality in ``var_index`` and the
        categorical ``.obs`` columns of every modality in ``categoricals``.

        Args:
            organism: The organism name.

        Returns:
            Whether the MuData object is validated.
        """
        if organism:
            self._kwargs["organism"] = organism
        if self._using_key is not None and self._using_key != "default":
            logger.important(
                f"validating metadata using registries of instance {colors.italic(self._using_key)}"
            )
        validated_var = True
        non_validated_var_modality = {}
        for modality, var_field in self._var_fields.items():
            is_validated_var, non_validated_var = validate_categories(
                self._mdata[modality].var.index,
                field=var_field,
                key=f"{modality}_var_index",
                using_key=self._using_key,
                source=self._sources.get(modality, {}).get("var_index"),
                exclude=self._exclude.get(modality, {}).get("var_index"),
                **self._kwargs,  # type: ignore
            )
            validated_var &= is_validated_var
            if len(non_validated_var) > 0:
                non_validated_var_modality[modality] = non_validated_var

        validated_obs = True
        non_validated_obs_modality = {}
        for modality, fields in self._obs_fields.items():
            if modality == "obs":
                obs = self._mdata.obs
            else:
                obs = self._mdata[modality].obs
            is_validated_obs, non_validated_obs = validate_categories_in_df(
                obs,
                fields=fields,
                using_key=self._using_key,
                sources=self._sources.get(modality),
                exclude=self._exclude.get(modality),
                **self._kwargs,
            )
            validated_obs &= is_validated_obs
            non_validated_obs_modality[modality] = non_validated_obs
            if modality in non_validated_var_modality:
                non_validated_obs_modality[modality]["var_index"] = (
                    non_validated_var_modality[modality]
                )
            # NOTE(review): this keeps only the result of the last modality
            # with non-validated values, and var-only modalities are never
            # recorded here -- confirm whether that is intended.
            if len(non_validated_obs_modality[modality]) > 0:
                self._non_validated = non_validated_obs_modality[modality]
        self._validated = validated_var and validated_obs
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated ``MuData`` and metadata.

        Args:
            description: Description of the ``MuData`` object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If ``validate()`` has not succeeded yet.
        """
        if not self._validated:
            raise ValidationError("Please run `validate()` first!")

        self._artifact = save_artifact(
            self._mdata,
            description=description,
            columns_field=self.var_index,
            fields=self.categoricals,
            **self._kwargs,
            **kwargs,
        )
        return self._artifact
+
898
+
899
class Curator(BaseCurator):
    """Dataset curator.

    Data curation entails accurately labeling datasets with standardized metadata
    to facilitate data integration, interpretation and analysis.

    The curation flow has several steps:

    1. Instantiate `Curator` from one of the following dataset objects:

    - :meth:`~lamindb.Curator.from_df`
    - :meth:`~lamindb.Curator.from_anndata`
    - :meth:`~lamindb.Curator.from_mudata`

    During object creation, any passed categoricals found in the object will be saved.

    2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:

    - Values that can be successfully validated and already exist in the registry.
    - Values which are new and not yet validated or potentially problematic values.

    3. Determine how to handle validated and non-validated values:

    - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
    - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
    - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
    """

    # The factory docstrings are injected from the curator classes via
    # `doc_args`. Those docstrings document `sources` and `exclude`, so the
    # factories accept and forward them (as trailing, optional parameters).

    @classmethod
    @doc_args(DataFrameCurator.__doc__)
    def from_df(
        cls,
        df: pd.DataFrame,
        categoricals: dict[str, FieldAttr] | None = None,
        columns: FieldAttr = Feature.name,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> DataFrameCurator:
        """{}"""  # noqa: D415
        return DataFrameCurator(
            df=df,
            categoricals=categoricals,
            columns=columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
        )

    @classmethod
    @doc_args(AnnDataCurator.__doc__)
    def from_anndata(
        cls,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> AnnDataCurator:
        """{}"""  # noqa: D415
        return AnnDataCurator(
            data=data,
            var_index=var_index,
            categoricals=categoricals,
            obs_columns=obs_columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
        )

    @classmethod
    @doc_args(MuDataCurator.__doc__)
    def from_mudata(
        cls,
        mdata: MuData,
        var_index: dict[str, FieldAttr],
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> MuDataCurator:
        """{}"""  # noqa: D415
        return MuDataCurator(
            mdata=mdata,
            var_index=var_index,
            categoricals=categoricals,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
        )
+
994
+
995
def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
    """Return the registry, bound to another instance when one is requested.

    Args:
        registry: The registry to resolve.
        using_key: Key of the instance to use. ``None`` and ``"default"``
            both leave the registry untouched.

    Returns:
        ``registry`` itself, or the result of ``registry.using(using_key)``.
    """
    stays_on_default = using_key is None or using_key == "default"
    if stays_on_default:
        return registry
    return registry.using(using_key)
+
1001
+
1002
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
    """Make sure the source and organism are saved in the same database as the registry.

    If the registry queries the default database while an ``organism`` or
    ``source`` record in ``kwargs`` belongs to another database, a shallow
    copy of that record is saved and used in the returned kwargs instead.

    Args:
        registry: The registry whose database is inspected.
        kwargs: Filter keyword arguments; not modified.

    Returns:
        A copy of ``kwargs`` with ``organism``/``source`` replaced where needed.
    """
    from lamindb.core._settings import settings

    db = registry.filter().db
    registry_on_default = db is None or db == "default"
    filter_kwargs = kwargs.copy()
    # Read the current verbosity *before* entering the try block: if it were
    # read inside and failed, the finally clause would hit an unbound name.
    verbosity = settings.verbosity
    try:
        # lower the verbosity while the copies are saved
        settings.verbosity = "error"
        for name in ("organism", "source"):
            record = kwargs.get(name)
            if (
                registry_on_default
                and isinstance(record, Record)
                and record._state.db != "default"
            ):
                record_default = copy.copy(record)
                # save the record in the default database
                record_default.save()
                filter_kwargs[name] = record_default
    finally:
        settings.verbosity = verbosity
    return filter_kwargs
+
1029
+
1030
def standardize_and_inspect(
    values: Iterable[str],
    field: FieldAttr,
    registry: type[Record],
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
):
    """Inspect values against a registry, optionally standardizing them first.

    Values listed in ``exclude`` are inspected separately, without the extra
    ``kwargs``. Those that validate are removed from the main inspection and
    added to the validated values of the returned result.

    Args:
        values: The values to inspect.
        field: The registry field to inspect against.
        registry: The registry providing ``inspect`` (and optionally ``standardize``).
        standardize: Whether to standardize the values before inspecting;
            only applied if the registry has ``standardize`` and ``synonyms``.
        exclude: A value or list of values to handle separately.
        **kwargs: Forwarded to ``registry.standardize`` and ``registry.inspect``.

    Returns:
        The inspect result of the registry, adjusted for the excluded values.
    """
    candidates = list(values)
    pre_validated = []
    if exclude is not None:
        excluded = [exclude] if isinstance(exclude, str) else exclude
        # only the excluded values that actually occur are of interest
        present = [item for item in excluded if item in candidates]
        if present:
            # exclude values are validated without source and organism
            excluded_inspection = registry.inspect(present, field=field, mute=True)
            pre_validated = excluded_inspection.validated
            # validated exclude values are dropped from the main inspection
            candidates = [item for item in candidates if item not in pre_validated]

    # https://github.com/laminlabs/lamindb/issues/1685
    if (
        standardize
        and hasattr(registry, "standardize")
        and hasattr(registry, "synonyms")
    ):
        candidates = registry.standardize(candidates, field=field, mute=True, **kwargs)

    result = registry.inspect(candidates, field=field, mute=True, **kwargs)
    result._validated += pre_validated
    result._non_validated = [
        item for item in result.non_validated if item not in pre_validated
    ]
    return result
+
1070
+
1071
def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
    """Return the organism filter for a registry, if the registry needs one.

    Args:
        registry: The registry to check; it needs an organism when it has an
            `organism_id` attribute.
        organism: The organism name; falls back to `bt.settings.organism`.

    Returns:
        `{"organism": <name>}` for registries with an `organism_id` attribute,
        otherwise an empty dict.

    Raises:
        ValueError: If an organism is needed but neither `organism` nor
            `bt.settings.organism` is set.
    """
    if not hasattr(registry, "organism_id"):
        return {}

    import bionty as bt

    if organism is None and bt.settings.organism is None:
        raise ValueError(
            f"{registry.__name__} registry requires an organism!\n"
            " → please pass an organism name via organism="
        )
    organism_name = organism if organism else bt.settings.organism.name
    return {"organism": organism_name}
1083
+
1084
+
1085
def validate_categories(
    values: Iterable[str],
    field: FieldAttr,
    key: str,
    using_key: str | None = None,
    organism: str | None = None,
    source: Record | None = None,
    exclude: str | list | None = None,
    standardize: bool = True,
    validated_hint_print: str | None = None,
) -> tuple[bool, list]:
    """Validate ontology terms in a pandas series using LaminDB registries.

    Values are inspected against the registry of the default instance first.
    Whatever is still non-validated is then inspected against the `using_key`
    instance (if given) and, for registries that expose `public`, looked up via
    `registry.from_values`. Terms found only in these later steps are logged as
    validated terms that still need to be saved.

    Args:
        values: The values to validate.
        field: The field attribute.
        key: The key referencing the slot in the DataFrame.
        using_key: A reference LaminDB instance.
        organism: The organism name.
        source: The source record.
        exclude: Exclude specific values.
        standardize: Standardize the values.
        validated_hint_print: The hint to print for validated values.

    Returns:
        A tuple `(is_validated, non_validated)`. `is_validated` is True only if
        every value validated against the default instance; `non_validated`
        lists the values that were not found anywhere.
    """
    from lamindb._from_values import _print_values
    from lamindb.core._settings import settings

    model_field = f"{field.field.model.__name__}.{field.field.name}"

    def _log_mapping_info():
        # print the header without indent, then indent what follows below it
        logger.indent = ""
        logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}")
        logger.indent = " "

    registry = field.field.model

    kwargs = check_registry_organism(registry, organism)
    kwargs.update({"source": source} if source else {})
    kwargs_current = get_current_filter_kwargs(registry, kwargs)

    # inspect the default instance
    inspect_result = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry,
        standardize=standardize,
        exclude=exclude,
        **kwargs_current,
    )
    non_validated = inspect_result.non_validated

    # inspect the using instance
    values_validated = []
    if using_key is not None and using_key != "default" and non_validated:
        registry_using = get_registry_instance(registry, using_key)
        inspect_result = standardize_and_inspect(
            values=non_validated,
            field=field,
            registry=registry_using,
            standardize=standardize,
            exclude=exclude,
            **kwargs,
        )
        non_validated = inspect_result.non_validated
        values_validated += inspect_result.validated

    # inspect from public (bionty only)
    if hasattr(registry, "public"):
        verbosity = settings.verbosity
        try:
            settings.verbosity = "error"
            public_records = registry.from_values(
                non_validated,
                field=field,
                **kwargs_current,
            )
            values_validated += [getattr(r, field.field.name) for r in public_records]
        finally:
            settings.verbosity = verbosity

    validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
    n_validated = len(values_validated)
    if n_validated > 0:
        _log_mapping_info()
        logger.warning(
            f"found {colors.yellow(n_validated)} validated terms: "
            f"{colors.yellow(values_validated)}\n → save terms via "
            f"{colors.yellow(validated_hint_print)}"
        )

    non_validated_hint_print = f".add_new_from('{key}')"
    non_validated = [i for i in non_validated if i not in values_validated]
    n_non_validated = len(non_validated)
    if n_non_validated == 0:
        if n_validated == 0:
            logger.indent = ""
            logger.success(f"{key} is validated against {colors.italic(model_field)}")
            return True, []
        # validated values still need to be saved to the current instance;
        # _log_mapping_info() ran above, so reset the indent like every other
        # exit does, otherwise the next call skips its mapping header
        logger.indent = ""
        return False, []

    are = "are" if n_non_validated > 1 else "is"
    print_values = _print_values(non_validated)
    warning_message = (
        f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
        f"{colors.red(print_values)}\n → fix typos, remove non-existent values, or save terms via "
        f"{colors.red(non_validated_hint_print)}"
    )
    # an empty indent means the mapping header was not printed yet in this call
    if logger.indent == "":
        _log_mapping_info()
    logger.warning(warning_message)
    logger.indent = ""
    return False, non_validated
1199
+
1200
+
1201
def validate_categories_in_df(
    df: pd.DataFrame,
    fields: dict[str, FieldAttr],
    using_key: str | None = None,
    sources: dict[str, Record] = None,
    exclude: dict | None = None,
    **kwargs,
) -> tuple[bool, dict]:
    """Validate the categories of several DataFrame columns against registries.

    Args:
        df: The DataFrame holding the columns to validate.
        fields: Maps column names to the registry field to validate against.
        using_key: A reference LaminDB instance.
        sources: Maps column names to source records.
        exclude: Maps column names to values to exclude.
        kwargs: Forwarded to `validate_categories`.

    Returns:
        A tuple `(validated, non_validated)` where `non_validated` maps each
        column with failures to its non-validated values.
    """
    if not fields:
        return True, {}

    sources = {} if sources is None else sources
    all_valid = True
    failures = {}
    for column, column_field in fields.items():
        column_ok, column_failures = validate_categories(
            df[column],
            field=column_field,
            key=column,
            using_key=using_key,
            source=sources.get(column),
            exclude=exclude.get(column) if exclude else None,
            **kwargs,
        )
        all_valid &= column_ok
        if len(column_failures) > 0:
            failures[column] = column_failures
    return all_valid, failures
1231
+
1232
+
1233
def save_artifact(
    data: pd.DataFrame | ad.AnnData | MuData,
    fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
    columns_field: FieldAttr | dict[str, FieldAttr],
    description: str | None = None,
    organism: str | None = None,
    adata: ad.AnnData | None = None,
    **kwargs,
) -> Artifact:
    """Create and save an Artifact, then attach its feature sets and labels.

    Args:
        data: The DataFrame or AnnData object to save.
        description: A description of the artifact.
        fields: A dictionary mapping obs_column to registry_field.
        columns_field: The registry field to validate variables index against.
        organism: The organism name.
        adata: The AnnData object to save, must be provided if data is a path.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        The saved Artifact.

    Raises:
        ValueError: If no artifact could be created from `data`.
        NotImplementedError: If the artifact accessor is none of DataFrame,
            AnnData or MuData.
    """
    from ._artifact import data_is_anndata

    artifact = None
    if data_is_anndata(data):
        assert adata is not None  # noqa: S101
        artifact = Artifact.from_anndata(data, description=description, **kwargs)
        artifact.n_observations = adata.shape[0]
        # labels are read from the AnnData object from here on
        data = adata
    elif isinstance(data, pd.DataFrame):
        artifact = Artifact.from_df(data, description=description, **kwargs)
    else:
        try:
            from mudata import MuData

            if isinstance(data, MuData):
                artifact = Artifact.from_mudata(data, description=description, **kwargs)
                artifact.n_observations = data.n_obs
        except ImportError:
            pass
    if artifact is None:
        raise ValueError("data must be a DataFrame, AnnData or MuData object.")
    artifact.save()

    # with a dict of column fields, the first entry determines the registry
    if isinstance(columns_field, dict):
        columns_registry = list(columns_field.values())[0].field.model
    else:
        columns_registry = columns_field.field.model
    feature_kwargs = check_registry_organism(columns_registry, organism)

    feature_manager_accessor = artifact._accessor
    if feature_manager_accessor == "DataFrame":
        artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
    elif feature_manager_accessor == "AnnData":
        artifact.features._add_set_from_anndata(
            var_field=columns_field, **feature_kwargs
        )
    elif feature_manager_accessor == "MuData":
        artifact.features._add_set_from_mudata(
            var_fields=columns_field, **feature_kwargs
        )
    else:
        raise NotImplementedError

    def _add_labels(data, artifact: Artifact, fields: dict[str, FieldAttr]):
        # attach the labels of every annotated column to the artifact
        feature_by_name = Feature.lookup().dict()
        for column, column_field in fields.items():
            feature = feature_by_name.get(column)
            label_registry = column_field.field.model
            organism_kwargs = check_registry_organism(label_registry, organism)
            current_kwargs = get_current_filter_kwargs(label_registry, organism_kwargs)
            frame = data if isinstance(data, pd.DataFrame) else data.obs
            column_labels = label_registry.from_values(
                frame[column],
                field=column_field,
                **current_kwargs,
            )
            artifact.labels.add(column_labels, feature)

    if artifact._accessor != "MuData":
        _add_labels(data, artifact, fields)
    else:
        for modality, modality_fields in fields.items():
            # "obs" refers to the MuData object itself, anything else to a modality
            target = data if modality == "obs" else data[modality]
            _add_labels(target, artifact, modality_fields)

    slug = ln_setup.settings.instance.slug
    if ln_setup.settings.instance.is_remote:  # pragma: no cover
        logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
    return artifact
1330
+
1331
+
1332
def update_registry(
    values: list[str],
    field: FieldAttr,
    key: str,
    save_function: str = "add_new_from",
    using_key: str | None = None,
    validated_only: bool = True,
    df: pd.DataFrame | None = None,
    organism: str | None = None,
    dtype: str | None = None,
    source: Record | None = None,
    standardize: bool = True,
    warning: bool = True,
    exclude: str | list | None = None,
    **kwargs,
) -> list | None:
    """Save features or labels records in the default instance from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        key: The name of the feature to save.
        save_function: The name of the function to save the labels.
        using_key: The name of the instance from which to transfer labels (if applicable).
        validated_only: If True, only save validated labels.
        df: A DataFrame to save labels from.
        organism: The organism name.
        dtype: The type of the feature.
        source: The source record.
        standardize: Whether values are standardized before being inspected.
        warning: Whether non-saved values are logged as a warning rather than as info.
        exclude: Values to inspect without organism and source filters.
        kwargs: Additional keyword arguments to pass to the registry model to create new records.

    Returns:
        The records obtained from `registry.from_values` if, after saving public
        records, every remaining value validates in the current instance;
        otherwise None.
    """
    from lamindb._save import save as ln_save
    from lamindb.core._settings import settings

    registry = field.field.model
    filter_kwargs = check_registry_organism(registry, organism)
    filter_kwargs.update({"source": source} if source else {})

    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"

        # save from public
        filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
        existing_and_public_records = (
            registry.from_values(
                list(values),
                field=field,
                **filter_kwargs_current,
            )
            if values
            else []
        )

        labels_saved: dict = {"from public": [], "without reference": []}

        public_records = [r for r in existing_and_public_records if r._state.adding]
        # here we check to only save the public records if they are from the specified source
        # we check the uid because r.source and source can be from different instances
        if source:
            public_records = [r for r in public_records if r.source.uid == source.uid]
        ln_save(public_records)
        labels_saved["from public"] = [
            getattr(r, field.field.name) for r in public_records
        ]
        non_public_labels = [i for i in values if i not in labels_saved["from public"]]

        # inspect the default instance
        inspect_result_current = standardize_and_inspect(
            values=non_public_labels,
            field=field,
            registry=registry,
            standardize=standardize,
            exclude=exclude,
            **filter_kwargs_current,
        )
        if not inspect_result_current.non_validated:
            # nothing left to transfer or create; the finally clause restores
            # the verbosity
            return registry.from_values(
                inspect_result_current.validated,
                field=field,
                **filter_kwargs_current,
            )

        # inspect the using_key instance
        (
            labels_saved[f"from {using_key}"],
            non_validated_labels,
        ) = update_registry_from_using_instance(
            inspect_result_current.non_validated,
            field=field,
            using_key=using_key,
            exclude=exclude,
            **filter_kwargs,
        )

        labels_saved["without reference"] = [
            i
            for i in non_validated_labels
            if i not in labels_saved[f"from {using_key}"]
        ]

        # save non-validated records
        if not validated_only:
            non_validated_records = []
            if df is not None and registry == Feature:
                non_validated_records = Feature.from_df(df)
            else:
                if "organism" in filter_kwargs:
                    # make sure organism record is saved to the current instance;
                    # use the resolved name, because the `organism` argument is
                    # None when the name came from bt.settings.organism
                    filter_kwargs["organism"] = _save_organism(
                        name=filter_kwargs["organism"]
                    )
                init_kwargs = {}
                for value in labels_saved["without reference"]:
                    init_kwargs[field.field.name] = value
                    if registry == Feature:
                        init_kwargs["dtype"] = "cat" if dtype is None else dtype
                    non_validated_records.append(
                        registry(
                            **init_kwargs,
                            **{k: v for k, v in filter_kwargs.items() if k != "source"},
                            **{k: v for k, v in kwargs.items() if k != "sources"},
                        )
                    )
            ln_save(non_validated_records)

        # save parent labels for ulabels
        if registry == ULabel and field.field.name == "name":
            save_ulabels_with_parent(values, field=field, key=key)
    finally:
        settings.verbosity = verbosity

    log_saved_labels(
        labels_saved,
        key=key,
        save_function=save_function,
        model_field=f"{registry.__name__}.{field.field.name}",
        validated_only=validated_only,
        warning=warning,
    )
    return None
1481
+
1482
+
1483
def log_saved_labels(
    labels_saved: dict,
    key: str,
    save_function: str,
    model_field: str,
    validated_only: bool = True,
    warning: bool = True,
) -> None:
    """Log which labels were saved and which were left unsaved.

    Args:
        labels_saved: Maps an origin (e.g. "from public", "without reference")
            to the list of labels of that origin; empty lists are skipped.
        key: The name of the feature the labels belong to.
        save_function: The name of the function to suggest for saving.
        model_field: The `Registry.field` string to display.
        validated_only: If True, labels "without reference" are reported as not saved.
        warning: Whether the not-saved message is logged as warning or as info.
    """
    from ._from_values import _print_values

    model_field = colors.italic(model_field)
    for origin, labels in labels_saved.items():
        if not labels:
            continue

        not_saved = origin == "without reference" and validated_only
        if not not_saved:
            origin_print = (
                "" if origin == "without reference" else f"{colors.green(origin)} "
            )
            # the term "transferred" stresses that this is always in the context of transferring
            # labels from a public ontology or a different instance to the present instance
            plural = "s" if len(labels) > 1 else ""
            logger.success(
                f"added {len(labels)} record{plural} {origin_print}with {model_field} for {colors.italic(key)}: {_print_values(labels)}"
            )
            continue

        msg = colors.yellow(
            f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
        )
        if key.isidentifier():
            lookup_print = f"lookup().{key}"
        else:
            lookup_print = f".lookup()['{key}']"
        msg += f"\n → to lookup values, use {lookup_print}"

        hint = f".add_new_from('{key}')"
        save_print = hint if save_function == "add_new_from" else save_function
        msg += f"\n → to save, run {colors.yellow(save_print)}"

        log = logger.warning if warning else logger.info
        log(msg)
1526
+
1527
+
1528
def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None:
    """Add the labels for the given values as children of the `is_{key}` label.

    The parent label named `is_{key}` is created and saved if it does not exist yet.

    Args:
        values: The label values to group under the parent.
        field: The field attribute; its model must be ULabel.
        key: The name used to build the parent label name.
    """
    registry = field.field.model
    assert registry == ULabel  # noqa: S101
    child_records = registry.from_values(list(values), field=field)
    parent_name = f"is_{key}"
    parent = registry.filter(name=parent_name).one_or_none()
    if parent is None:
        parent = registry(name=parent_name)
        parent.save()
    parent.children.add(*child_records)
1538
+
1539
+
1540
def update_registry_from_using_instance(
    values: list[str],
    field: FieldAttr,
    using_key: str | None = None,
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
) -> tuple[list[str], list[str]]:
    """Save features or labels records from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        using_key: The name of the instance from which to transfer labels (if applicable).
        standardize: Whether to also standardize the values.
        exclude: Values to inspect without the extra filter `kwargs`.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        A tuple containing the list of saved labels and the list of non-saved labels.
    """
    # without a dedicated using instance there is nothing to transfer
    if using_key is None or using_key == "default":
        return [], values

    field_name = field.field.name
    registry_using = get_registry_instance(field.field.model, using_key)
    using_result = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry_using,
        standardize=standardize,
        exclude=exclude,
        **kwargs,
    )
    lookup = {f"{field_name}__in": using_result.validated}
    transferred = []
    for record in registry_using.filter(**lookup).all():
        record.save()
        transferred.append(getattr(record, field.field.name))
    return transferred, using_result.non_validated
1583
+
1584
+
1585
def _save_organism(name: str):  # pragma: no cover
    """Return the organism record with the given name, saving it if needed.

    An existing record is returned as is; otherwise the record is created via
    `bt.Organism.from_source` and saved.

    Raises:
        ValueError: If the organism is neither in the registry nor found by
            `bt.Organism.from_source`.
    """
    import bionty as bt

    existing = bt.Organism.filter(name=name).one_or_none()
    if existing is not None:
        return existing

    organism = bt.Organism.from_source(name=name)
    if organism is None:
        raise ValueError(
            f"Organism '{name}' not found\n"
            f" → please save it: bt.Organism(name='{name}').save()"
        )
    organism.save()
    return organism
1599
+
1600
+
1601
# Alias kept for backward compatibility: code referring to `Curate` gets `Curator`.
Curate = Curator  # backward compat