lamindb 0.76.8__py3-none-any.whl → 0.76.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. lamindb/__init__.py +114 -113
  2. lamindb/_artifact.py +1206 -1205
  3. lamindb/_can_validate.py +621 -579
  4. lamindb/_collection.py +390 -387
  5. lamindb/_curate.py +1603 -1601
  6. lamindb/_feature.py +155 -155
  7. lamindb/_feature_set.py +244 -242
  8. lamindb/_filter.py +23 -23
  9. lamindb/_finish.py +250 -256
  10. lamindb/_from_values.py +403 -382
  11. lamindb/_is_versioned.py +40 -40
  12. lamindb/_parents.py +476 -476
  13. lamindb/_query_manager.py +125 -125
  14. lamindb/_query_set.py +364 -362
  15. lamindb/_record.py +668 -649
  16. lamindb/_run.py +60 -57
  17. lamindb/_save.py +310 -308
  18. lamindb/_storage.py +14 -14
  19. lamindb/_transform.py +130 -127
  20. lamindb/_ulabel.py +56 -56
  21. lamindb/_utils.py +9 -9
  22. lamindb/_view.py +72 -72
  23. lamindb/core/__init__.py +94 -94
  24. lamindb/core/_context.py +590 -574
  25. lamindb/core/_data.py +510 -438
  26. lamindb/core/_django.py +209 -0
  27. lamindb/core/_feature_manager.py +994 -867
  28. lamindb/core/_label_manager.py +289 -253
  29. lamindb/core/_mapped_collection.py +631 -597
  30. lamindb/core/_settings.py +188 -187
  31. lamindb/core/_sync_git.py +138 -138
  32. lamindb/core/_track_environment.py +27 -27
  33. lamindb/core/datasets/__init__.py +59 -59
  34. lamindb/core/datasets/_core.py +581 -571
  35. lamindb/core/datasets/_fake.py +36 -36
  36. lamindb/core/exceptions.py +90 -90
  37. lamindb/core/fields.py +12 -12
  38. lamindb/core/loaders.py +164 -164
  39. lamindb/core/schema.py +56 -56
  40. lamindb/core/storage/__init__.py +25 -25
  41. lamindb/core/storage/_anndata_accessor.py +741 -740
  42. lamindb/core/storage/_anndata_sizes.py +41 -41
  43. lamindb/core/storage/_backed_access.py +98 -98
  44. lamindb/core/storage/_tiledbsoma.py +204 -204
  45. lamindb/core/storage/_valid_suffixes.py +21 -21
  46. lamindb/core/storage/_zarr.py +110 -110
  47. lamindb/core/storage/objects.py +62 -62
  48. lamindb/core/storage/paths.py +172 -172
  49. lamindb/core/subsettings/__init__.py +12 -12
  50. lamindb/core/subsettings/_creation_settings.py +38 -38
  51. lamindb/core/subsettings/_transform_settings.py +21 -21
  52. lamindb/core/types.py +19 -19
  53. lamindb/core/versioning.py +146 -158
  54. lamindb/integrations/__init__.py +12 -12
  55. lamindb/integrations/_vitessce.py +107 -107
  56. lamindb/setup/__init__.py +14 -14
  57. lamindb/setup/core/__init__.py +4 -4
  58. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/LICENSE +201 -201
  59. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/METADATA +8 -8
  60. lamindb-0.76.10.dist-info/RECORD +61 -0
  61. {lamindb-0.76.8.dist-info → lamindb-0.76.10.dist-info}/WHEEL +1 -1
  62. lamindb-0.76.8.dist-info/RECORD +0 -60
lamindb/_curate.py CHANGED
@@ -1,1601 +1,1603 @@
1
- from __future__ import annotations
2
-
3
- import copy
4
- from typing import TYPE_CHECKING, Iterable
5
-
6
- import anndata as ad
7
- import lamindb_setup as ln_setup
8
- import pandas as pd
9
- from lamin_utils import colors, logger
10
- from lamindb_setup.core._docs import doc_args
11
- from lnschema_core import (
12
- Artifact,
13
- Feature,
14
- Record,
15
- Run,
16
- ULabel,
17
- )
18
-
19
- from .core.exceptions import ValidationError
20
-
21
- if TYPE_CHECKING:
22
- from lamindb_setup.core.types import UPathStr
23
- from lnschema_core.types import FieldAttr
24
- from mudata import MuData
25
-
26
-
27
- class CurateLookup:
28
- """Lookup categories from the reference instance."""
29
-
30
- def __init__(
31
- self,
32
- categoricals: dict[str, FieldAttr],
33
- slots: dict[str, FieldAttr] = None,
34
- using_key: str | None = None,
35
- ) -> None:
36
- if slots is None:
37
- slots = {}
38
- self._fields = {**categoricals, **slots}
39
- self._using_key = None if using_key == "default" else using_key
40
- self._using_key_name = self._using_key or ln_setup.settings.instance.slug
41
- debug_message = (
42
- f"Lookup objects from the " f"{colors.italic(self._using_key_name)}"
43
- )
44
- logger.debug(debug_message)
45
-
46
- def __getattr__(self, name):
47
- if name in self._fields:
48
- registry = self._fields[name].field.model
49
- if self._using_key == "public":
50
- return registry.public().lookup()
51
- else:
52
- return get_registry_instance(registry, self._using_key).lookup()
53
- raise AttributeError(
54
- f"'{self.__class__.__name__}' object has no attribute '{name}'"
55
- )
56
-
57
- def __getitem__(self, name):
58
- if name in self._fields:
59
- registry = self._fields[name].field.model
60
- if self._using_key == "public":
61
- return registry.public().lookup()
62
- else:
63
- return get_registry_instance(registry, self._using_key).lookup()
64
- raise AttributeError(
65
- f"'{self.__class__.__name__}' object has no attribute '{name}'"
66
- )
67
-
68
- def __repr__(self) -> str:
69
- if len(self._fields) > 0:
70
- getattr_keys = "\n ".join(
71
- [f".{key}" for key in self._fields if key.isidentifier()]
72
- )
73
- getitem_keys = "\n ".join(
74
- [str([key]) for key in self._fields if not key.isidentifier()]
75
- )
76
- return (
77
- f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
78
- f"{colors.green(getattr_keys)}\n "
79
- f"{colors.green(getitem_keys)}\n\n"
80
- "Example:\n → categories = validator.lookup().cell_type\n"
81
- " → categories.alveolar_type_1_fibroblast_cell"
82
- )
83
- else: # pragma: no cover
84
- return colors.warning("No fields are found!")
85
-
86
-
87
- class BaseCurator:
88
- """Curate a dataset."""
89
-
90
- def validate(self) -> bool:
91
- """Validate dataset.
92
-
93
- Returns:
94
- Boolean indicating whether the dataset is validated.
95
- """
96
- pass
97
-
98
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
99
- """Save the dataset as artifact.
100
-
101
- Args:
102
- description: Description of the DataFrame object.
103
- **kwargs: Object level metadata.
104
-
105
- Returns:
106
- A saved artifact record.
107
- """
108
- pass
109
-
110
-
111
- class DataFrameCurator(BaseCurator):
112
- """Curation flow for a DataFrame object.
113
-
114
- See also :class:`~lamindb.Curator`.
115
-
116
- Args:
117
- df: The DataFrame object to curate.
118
- columns: The field attribute for the feature column.
119
- categoricals: A dictionary mapping column names to registry_field.
120
- using_key: The reference instance containing registries to validate against.
121
- verbosity: The verbosity level.
122
- organism: The organism name.
123
- sources: A dictionary mapping column names to Source records.
124
- exclude: A dictionary mapping column names to values to exclude.
125
-
126
- Examples:
127
- >>> import bionty as bt
128
- >>> curate = ln.Curator.from_df(
129
- ... df,
130
- ... categoricals={
131
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
132
- ... "donor_id": ln.ULabel.name
133
- ... }
134
- ... )
135
- """
136
-
137
- def __init__(
138
- self,
139
- df: pd.DataFrame,
140
- columns: FieldAttr = Feature.name,
141
- categoricals: dict[str, FieldAttr] | None = None,
142
- using_key: str | None = None,
143
- verbosity: str = "hint",
144
- organism: str | None = None,
145
- sources: dict[str, Record] | None = None,
146
- exclude: dict | None = None,
147
- check_valid_keys: bool = True,
148
- ) -> None:
149
- from lamindb.core._settings import settings
150
-
151
- self._df = df
152
- self._fields = categoricals or {}
153
- self._columns_field = columns
154
- self._using_key = using_key
155
- settings.verbosity = verbosity
156
- self._artifact = None
157
- self._collection = None
158
- self._validated = False
159
- self._kwargs = {"organism": organism} if organism else {}
160
- if sources is None:
161
- sources = {}
162
- self._sources = sources
163
- if exclude is None:
164
- exclude = {}
165
- self._exclude = exclude
166
- self._non_validated = None
167
- if check_valid_keys:
168
- self._check_valid_keys()
169
- self._save_columns()
170
-
171
- @property
172
- def non_validated(self) -> list:
173
- """Return the non-validated features and labels."""
174
- if self._non_validated is None:
175
- raise ValueError("Please run validate() first!")
176
- return self._non_validated
177
-
178
- @property
179
- def fields(self) -> dict:
180
- """Return the columns fields to validate against."""
181
- return self._fields
182
-
183
- def lookup(self, using_key: str | None = None) -> CurateLookup:
184
- """Lookup categories.
185
-
186
- Args:
187
- using_key: The instance where the lookup is performed.
188
- if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
189
- if "public", the lookup is performed on the public reference.
190
- """
191
- return CurateLookup(
192
- categoricals=self._fields,
193
- slots={"columns": self._columns_field},
194
- using_key=using_key or self._using_key,
195
- )
196
-
197
- def _check_valid_keys(self, extra: set = None) -> None:
198
- if extra is None:
199
- extra = set()
200
- for name, d in {
201
- "categoricals": self._fields,
202
- "sources": self._sources,
203
- "exclude": self._exclude,
204
- }.items():
205
- if not isinstance(d, dict):
206
- raise TypeError(f"{name} must be a dictionary!")
207
- valid_keys = set(self._df.columns) | {"columns"} | extra
208
- nonval_keys = [key for key in d.keys() if key not in valid_keys]
209
- if len(nonval_keys) > 0:
210
- raise ValueError(
211
- f"the following keys passed to {name} are not allowed: {nonval_keys}"
212
- )
213
-
214
- def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
215
- """Save column name records."""
216
- # Always save features specified as the fields keys
217
- update_registry(
218
- values=list(self.fields.keys()),
219
- field=self._columns_field,
220
- key="columns",
221
- save_function="add_new_from_columns",
222
- using_key=self._using_key,
223
- validated_only=False,
224
- source=self._sources.get("columns"),
225
- exclude=self._exclude.get("columns"),
226
- **kwargs,
227
- )
228
-
229
- # Save the rest of the columns based on validated_only
230
- additional_columns = set(self._df.columns) - set(self.fields.keys())
231
- if additional_columns:
232
- update_registry(
233
- values=list(additional_columns),
234
- field=self._columns_field,
235
- key="columns",
236
- save_function="add_new_from_columns",
237
- using_key=self._using_key,
238
- validated_only=validated_only,
239
- df=self._df, # Get the Feature type from df
240
- source=self._sources.get("columns"),
241
- exclude=self._exclude.get("columns"),
242
- warning=False, # Do not warn about missing columns, just an info message
243
- **kwargs,
244
- )
245
-
246
- def add_validated_from(self, key: str, organism: str | None = None):
247
- """Add validated categories.
248
-
249
- Args:
250
- key: The key referencing the slot in the DataFrame.
251
- organism: The organism name.
252
- """
253
- self._kwargs.update({"organism": organism} if organism else {})
254
- self._update_registry(key, validated_only=True, **self._kwargs)
255
-
256
- def add_new_from(self, key: str, organism: str | None = None, **kwargs):
257
- """Add validated & new categories.
258
-
259
- Args:
260
- key: The key referencing the slot in the DataFrame from which to draw terms.
261
- organism: The organism name.
262
- **kwargs: Additional keyword arguments to pass to the registry model.
263
- """
264
- if len(kwargs) > 0 and key == "all":
265
- raise ValueError("Cannot pass additional arguments to 'all' key!")
266
- self._kwargs.update({"organism": organism} if organism else {})
267
- self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
268
-
269
- def add_new_from_columns(self, organism: str | None = None, **kwargs):
270
- """Add validated & new column names to its registry.
271
-
272
- Args:
273
- organism: The organism name.
274
- **kwargs: Additional keyword arguments to pass to the registry model.
275
- """
276
- self._kwargs.update({"organism": organism} if organism else {})
277
- self._save_columns(validated_only=False, **self._kwargs, **kwargs)
278
-
279
- def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
280
- if categorical == "all":
281
- self._update_registry_all(validated_only=validated_only, **kwargs)
282
- elif categorical == "columns":
283
- self._save_columns(validated_only=validated_only, **kwargs)
284
- else:
285
- if categorical not in self.fields:
286
- raise ValueError(f"Feature {categorical} is not part of the fields!")
287
- update_registry(
288
- values=self._df[categorical].unique().tolist(),
289
- field=self.fields[categorical],
290
- key=categorical,
291
- using_key=self._using_key,
292
- validated_only=validated_only,
293
- source=self._sources.get(categorical),
294
- exclude=self._exclude.get(categorical),
295
- **kwargs,
296
- )
297
-
298
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
299
- """Save labels for all features."""
300
- for name in self.fields.keys():
301
- logger.info(f"saving labels for '{name}'")
302
- self._update_registry(name, validated_only=validated_only, **kwargs)
303
-
304
- def validate(self, organism: str | None = None) -> bool:
305
- """Validate variables and categorical observations.
306
-
307
- Args:
308
- organism: The organism name.
309
-
310
- Returns:
311
- Whether the DataFrame is validated.
312
- """
313
- self._kwargs.update({"organism": organism} if organism else {})
314
- self._validated, self._non_validated = validate_categories_in_df( # type: ignore
315
- self._df,
316
- fields=self.fields,
317
- using_key=self._using_key,
318
- sources=self._sources,
319
- exclude=self._exclude,
320
- **self._kwargs,
321
- )
322
- return self._validated
323
-
324
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
325
- """Save the validated DataFrame and metadata.
326
-
327
- Args:
328
- description: Description of the DataFrame object.
329
- **kwargs: Object level metadata.
330
-
331
- Returns:
332
- A saved artifact record.
333
- """
334
- from lamindb.core._settings import settings
335
-
336
- if not self._validated:
337
- self.validate()
338
- if not self._validated:
339
- raise ValidationError("Dataset does not validate. Please curate.")
340
-
341
- # Make sure all labels are saved in the current instance
342
- verbosity = settings.verbosity
343
- try:
344
- settings.verbosity = "warning"
345
- # save all validated records to the current instance
346
- self.add_validated_from("all")
347
-
348
- self._artifact = save_artifact(
349
- self._df,
350
- description=description,
351
- fields=self.fields,
352
- columns_field=self._columns_field,
353
- **kwargs,
354
- **self._kwargs,
355
- )
356
- finally:
357
- settings.verbosity = verbosity
358
-
359
- return self._artifact
360
-
361
- def clean_up_failed_runs(self):
362
- """Clean up previous failed runs that don't save any outputs."""
363
- from lamindb.core._context import context
364
-
365
- if context.run is not None:
366
- Run.filter(transform=context.run.transform, output_artifacts=None).exclude(
367
- uid=context.run.uid
368
- ).delete()
369
-
370
-
371
- class AnnDataCurator(DataFrameCurator):
372
- """Curation flow for ``AnnData``.
373
-
374
- See also :class:`~lamindb.Curator`.
375
-
376
- Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.
377
-
378
- See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.
379
-
380
- Args:
381
- data: The AnnData object or an AnnData-like path.
382
- var_index: The registry field for mapping the ``.var`` index.
383
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
384
- using_key: A reference LaminDB instance.
385
- verbosity: The verbosity level.
386
- organism: The organism name.
387
- sources: A dictionary mapping ``.obs.columns`` to Source records.
388
- exclude: A dictionary mapping column names to values to exclude.
389
-
390
- Examples:
391
- >>> import bionty as bt
392
- >>> curate = ln.Curator.from_anndata(
393
- ... adata,
394
- ... var_index=bt.Gene.ensembl_gene_id,
395
- ... categoricals={
396
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
397
- ... "donor_id": ln.ULabel.name
398
- ... },
399
- ... organism="human",
400
- ... )
401
- """
402
-
403
- def __init__(
404
- self,
405
- data: ad.AnnData | UPathStr,
406
- var_index: FieldAttr,
407
- categoricals: dict[str, FieldAttr] | None = None,
408
- obs_columns: FieldAttr = Feature.name,
409
- using_key: str = "default",
410
- verbosity: str = "hint",
411
- organism: str | None = None,
412
- sources: dict[str, Record] | None = None,
413
- exclude: dict | None = None,
414
- ) -> None:
415
- from lamindb_setup.core import upath
416
-
417
- from ._artifact import data_is_anndata
418
-
419
- if sources is None:
420
- sources = {}
421
- if not data_is_anndata(data):
422
- raise ValueError(
423
- "data has to be an AnnData object or a path to AnnData-like"
424
- )
425
- if isinstance(data, ad.AnnData):
426
- self._adata = data
427
- else: # pragma: no cover
428
- from lamindb.core.storage._backed_access import backed_access
429
-
430
- self._adata = backed_access(upath.create_path(data))
431
-
432
- self._data = data
433
- self._var_field = var_index
434
- super().__init__(
435
- df=self._adata.obs,
436
- categoricals=categoricals,
437
- columns=obs_columns,
438
- using_key=using_key,
439
- verbosity=verbosity,
440
- organism=organism,
441
- sources=sources,
442
- exclude=exclude,
443
- check_valid_keys=False,
444
- )
445
- self._obs_fields = categoricals or {}
446
- self._check_valid_keys(extra={"var_index"})
447
-
448
- @property
449
- def var_index(self) -> FieldAttr:
450
- """Return the registry field to validate variables index against."""
451
- return self._var_field
452
-
453
- @property
454
- def categoricals(self) -> dict:
455
- """Return the obs fields to validate against."""
456
- return self._obs_fields
457
-
458
- def lookup(self, using_key: str | None = None) -> CurateLookup:
459
- """Lookup categories.
460
-
461
- Args:
462
- using_key: The instance where the lookup is performed.
463
- if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
464
- if "public", the lookup is performed on the public reference.
465
- """
466
- return CurateLookup(
467
- categoricals=self._obs_fields,
468
- slots={"columns": self._columns_field, "var_index": self._var_field},
469
- using_key=using_key or self._using_key,
470
- )
471
-
472
- def _save_from_var_index(
473
- self, validated_only: bool = True, organism: str | None = None
474
- ):
475
- """Save variable records."""
476
- update_registry(
477
- values=list(self._adata.var.index),
478
- field=self.var_index,
479
- key="var_index",
480
- save_function="add_new_from_var_index",
481
- using_key=self._using_key,
482
- validated_only=validated_only,
483
- organism=organism,
484
- source=self._sources.get("var_index"),
485
- exclude=self._exclude.get("var_index"),
486
- )
487
-
488
- def _update_registry_all(self, validated_only: bool = True, **kwargs):
489
- """Save labels for all features."""
490
- for name in self.fields.keys():
491
- logger.info(f"saving labels for '{name}'")
492
- if name == "var_index":
493
- self._save_from_var_index(validated_only=validated_only, **kwargs)
494
- else:
495
- self._update_registry(name, validated_only=validated_only, **kwargs)
496
-
497
- def add_new_from_var_index(self, organism: str | None = None, **kwargs):
498
- """Update variable records.
499
-
500
- Args:
501
- organism: The organism name.
502
- **kwargs: Additional keyword arguments to pass to the registry model.
503
- """
504
- self._kwargs.update({"organism": organism} if organism else {})
505
- self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
506
-
507
- def add_validated_from_var_index(self, organism: str | None = None):
508
- """Add validated variable records.
509
-
510
- Args:
511
- organism: The organism name.
512
- """
513
- self._kwargs.update({"organism": organism} if organism else {})
514
- self._save_from_var_index(validated_only=True, **self._kwargs)
515
-
516
- def validate(self, organism: str | None = None) -> bool:
517
- """Validate categories.
518
-
519
- Args:
520
- organism: The organism name.
521
-
522
- Returns:
523
- Whether the AnnData object is validated.
524
- """
525
- self._kwargs.update({"organism": organism} if organism else {})
526
- if self._using_key is not None and self._using_key != "default":
527
- logger.important(
528
- f"validating metadata using registries of instance {colors.italic(self._using_key)}"
529
- )
530
-
531
- validated_var, non_validated_var = validate_categories(
532
- self._adata.var.index,
533
- field=self._var_field,
534
- key="var_index",
535
- using_key=self._using_key,
536
- source=self._sources.get("var_index"),
537
- validated_hint_print=".add_validated_from_var_index()",
538
- exclude=self._exclude.get("var_index"),
539
- **self._kwargs, # type: ignore
540
- )
541
- validated_obs, non_validated_obs = validate_categories_in_df(
542
- self._adata.obs,
543
- fields=self.categoricals,
544
- using_key=self._using_key,
545
- sources=self._sources,
546
- exclude=self._exclude,
547
- **self._kwargs,
548
- )
549
- self._non_validated = non_validated_obs # type: ignore
550
- if len(non_validated_var) > 0:
551
- self._non_validated["var_index"] = non_validated_var # type: ignore
552
- self._validated = validated_var and validated_obs
553
- return self._validated
554
-
555
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
556
- """Save the validated ``AnnData`` and metadata.
557
-
558
- Args:
559
- description: Description of the ``AnnData`` object.
560
- **kwargs: Object level metadata.
561
-
562
- Returns:
563
- A saved artifact record.
564
- """
565
- if not self._validated:
566
- self.validate()
567
- if not self._validated:
568
- raise ValidationError("Dataset does not validate. Please curate.")
569
-
570
- self._artifact = save_artifact(
571
- self._data,
572
- adata=self._adata,
573
- description=description,
574
- columns_field=self.var_index,
575
- fields=self.categoricals,
576
- **self._kwargs,
577
- **kwargs,
578
- )
579
- return self._artifact
580
-
581
-
582
- class MuDataCurator:
583
- """Curation flow for a ``MuData`` object.
584
-
585
- See also :class:`~lamindb.Curator`.
586
-
587
- Note that if genes or other measurements are removed from the MuData object,
588
- the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
589
-
590
- Args:
591
- mdata: The MuData object to curate.
592
- var_index: The registry field for mapping the ``.var`` index for each modality.
593
- For example:
594
- ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
595
- categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
596
- Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
597
- using_key: A reference LaminDB instance.
598
- verbosity: The verbosity level.
599
- organism: The organism name.
600
- sources: A dictionary mapping ``.obs.columns`` to Source records.
601
- exclude: A dictionary mapping column names to values to exclude.
602
-
603
- Examples:
604
- >>> import bionty as bt
605
- >>> curate = ln.Curator.from_mudata(
606
- ... mdata,
607
- ... var_index={
608
- ... "rna": bt.Gene.ensembl_gene_id,
609
- ... "adt": ln.CellMarker.name
610
- ... },
611
- ... categoricals={
612
- ... "cell_type_ontology_id": bt.CellType.ontology_id,
613
- ... "donor_id": ln.ULabel.name
614
- ... },
615
- ... organism="human",
616
- ... )
617
- """
618
-
619
- def __init__(
620
- self,
621
- mdata: MuData,
622
- var_index: dict[str, dict[str, FieldAttr]],
623
- categoricals: dict[str, FieldAttr] | None = None,
624
- using_key: str = "default",
625
- verbosity: str = "hint",
626
- organism: str | None = None,
627
- sources: dict[str, Record] | None = None,
628
- exclude: dict | None = None,
629
- ) -> None:
630
- if sources is None:
631
- sources = {}
632
- self._sources = sources
633
- if exclude is None:
634
- exclude = {}
635
- self._exclude = exclude
636
- self._mdata = mdata
637
- self._kwargs = {"organism": organism} if organism else {}
638
- self._var_fields = var_index
639
- self._verify_modality(self._var_fields.keys())
640
- self._obs_fields = self._parse_categoricals(categoricals)
641
- self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
642
- self._using_key = using_key
643
- self._verbosity = verbosity
644
- self._df_annotators = {
645
- modality: DataFrameCurator(
646
- df=mdata[modality].obs if modality != "obs" else mdata.obs,
647
- categoricals=self._obs_fields.get(modality, {}),
648
- using_key=using_key,
649
- verbosity=verbosity,
650
- sources=self._sources.get(modality),
651
- exclude=self._exclude.get(modality),
652
- check_valid_keys=False,
653
- **self._kwargs,
654
- )
655
- for modality in self._modalities
656
- }
657
- for modality in self._var_fields.keys():
658
- self._save_from_var_index_modality(
659
- modality=modality, validated_only=True, **self._kwargs
660
- )
661
-
662
- @property
663
- def var_index(self) -> FieldAttr:
664
- """Return the registry field to validate variables index against."""
665
- return self._var_fields
666
-
667
- @property
668
- def categoricals(self) -> dict:
669
- """Return the obs fields to validate against."""
670
- return self._obs_fields
671
-
672
- def _verify_modality(self, modalities: Iterable[str]):
673
- """Verify the modality exists."""
674
- for modality in modalities:
675
- if modality not in self._mdata.mod.keys():
676
- raise ValueError(f"modality '{modality}' does not exist!")
677
-
678
- def _save_from_var_index_modality(
679
- self, modality: str, validated_only: bool = True, **kwargs
680
- ):
681
- """Save variable records."""
682
- update_registry(
683
- values=list(self._mdata[modality].var.index),
684
- field=self._var_fields[modality],
685
- key="var_index",
686
- save_function="add_new_from_var_index",
687
- using_key=self._using_key,
688
- validated_only=validated_only,
689
- dtype="number",
690
- source=self._sources.get(modality, {}).get("var_index"),
691
- exclude=self._exclude.get(modality, {}).get("var_index"),
692
- **kwargs,
693
- )
694
-
695
- def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
696
- """Parse the categorical fields."""
697
- prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
698
- obs_fields: dict[str, dict[str, FieldAttr]] = {}
699
- for k, v in categoricals.items():
700
- if k not in self._mdata.obs.columns:
701
- raise ValueError(f"column '{k}' does not exist in mdata.obs!")
702
- if any(k.startswith(prefix) for prefix in prefixes):
703
- modality, col = k.split(":")[0], k.split(":")[1]
704
- if modality not in obs_fields.keys():
705
- obs_fields[modality] = {}
706
- obs_fields[modality][col] = v
707
- else:
708
- if "obs" not in obs_fields.keys():
709
- obs_fields["obs"] = {}
710
- obs_fields["obs"][k] = v
711
- return obs_fields
712
-
713
- def lookup(self, using_key: str | None = None) -> CurateLookup:
714
- """Lookup categories.
715
-
716
- Args:
717
- using_key: The instance where the lookup is performed.
718
- if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
719
- if "public", the lookup is performed on the public reference.
720
- """
721
- return CurateLookup(
722
- categoricals=self._obs_fields,
723
- slots={
724
- **self._obs_fields,
725
- **{f"{k}_var_index": v for k, v in self._var_fields.items()},
726
- },
727
- using_key=using_key or self._using_key,
728
- )
729
-
730
- def add_new_from_columns(
731
- self,
732
- modality: str,
733
- column_names: list[str] | None = None,
734
- organism: str | None = None,
735
- **kwargs,
736
- ):
737
- """Update columns records.
738
-
739
- Args:
740
- modality: The modality name.
741
- column_names: The column names to save.
742
- organism: The organism name.
743
- **kwargs: Additional keyword arguments to pass to the registry model.
744
- """
745
- self._kwargs.update({"organism": organism} if organism else {})
746
- values = column_names or self._mdata[modality].obs.columns
747
- update_registry(
748
- values=list(values),
749
- field=Feature.name,
750
- key=f"{modality} obs columns",
751
- using_key=self._using_key,
752
- validated_only=False,
753
- df=self._mdata[modality].obs,
754
- source=self._sources.get(modality, {}).get("columns"),
755
- exclude=self._exclude.get(modality, {}).get("columns"),
756
- **self._kwargs, # type: ignore
757
- **kwargs,
758
- )
759
-
760
- def add_new_from_var_index(
761
- self, modality: str, organism: str | None = None, **kwargs
762
- ):
763
- """Update variable records.
764
-
765
- Args:
766
- modality: The modality name.
767
- organism: The organism name.
768
- **kwargs: Additional keyword arguments to pass to the registry model.
769
- """
770
- self._kwargs.update({"organism": organism} if organism else {})
771
- self._save_from_var_index_modality(
772
- modality=modality, validated_only=False, **self._kwargs, **kwargs
773
- )
774
-
775
- def add_validated_from_var_index(self, modality: str, organism: str | None = None):
776
- """Add validated variable records.
777
-
778
- Args:
779
- modality: The modality name.
780
- organism: The organism name.
781
- """
782
- self._kwargs.update({"organism": organism} if organism else {})
783
- self._save_from_var_index_modality(
784
- modality=modality, validated_only=True, **self._kwargs
785
- )
786
-
787
- def add_validated_from(
788
- self, key: str, modality: str | None = None, organism: str | None = None
789
- ):
790
- """Add validated categories.
791
-
792
- Args:
793
- key: The key referencing the slot in the DataFrame.
794
- modality: The modality name.
795
- organism: The organism name.
796
- """
797
- self._kwargs.update({"organism": organism} if organism else {})
798
- modality = modality or "obs"
799
- if modality in self._df_annotators:
800
- df_annotator = self._df_annotators[modality]
801
- df_annotator.add_validated_from(key=key, **self._kwargs)
802
-
803
- def add_new_from(
804
- self,
805
- key: str,
806
- modality: str | None = None,
807
- organism: str | None = None,
808
- **kwargs,
809
- ):
810
- """Add validated & new categories.
811
-
812
- Args:
813
- key: The key referencing the slot in the DataFrame.
814
- modality: The modality name.
815
- organism: The organism name.
816
- **kwargs: Additional keyword arguments to pass to the registry model.
817
- """
818
- if len(kwargs) > 0 and key == "all":
819
- raise ValueError("Cannot pass additional arguments to 'all' key!")
820
- self._kwargs.update({"organism": organism} if organism else {})
821
- modality = modality or "obs"
822
- if modality in self._df_annotators:
823
- df_annotator = self._df_annotators[modality]
824
- df_annotator.add_new_from(key=key, **self._kwargs, **kwargs)
825
-
826
- def validate(self, organism: str | None = None) -> bool:
827
- """Validate categories."""
828
- self._kwargs.update({"organism": organism} if organism else {})
829
- if self._using_key is not None and self._using_key != "default":
830
- logger.important(
831
- f"validating metadata using registries of instance {colors.italic(self._using_key)}"
832
- )
833
- validated_var = True
834
- non_validated_var_modality = {}
835
- for modality, var_field in self._var_fields.items():
836
- is_validated_var, non_validated_var = validate_categories(
837
- self._mdata[modality].var.index,
838
- field=var_field,
839
- key=f"{modality}_var_index",
840
- using_key=self._using_key,
841
- source=self._sources.get(modality, {}).get("var_index"),
842
- exclude=self._exclude.get(modality, {}).get("var_index"),
843
- **self._kwargs, # type: ignore
844
- )
845
- validated_var &= is_validated_var
846
- if len(non_validated_var) > 0:
847
- non_validated_var_modality[modality] = non_validated_var
848
-
849
- validated_obs = True
850
- non_validated_obs_modality = {}
851
- for modality, fields in self._obs_fields.items():
852
- if modality == "obs":
853
- obs = self._mdata.obs
854
- else:
855
- obs = self._mdata[modality].obs
856
- is_validated_obs, non_validated_obs = validate_categories_in_df(
857
- obs,
858
- fields=fields,
859
- using_key=self._using_key,
860
- sources=self._sources.get(modality),
861
- exclude=self._exclude.get(modality),
862
- **self._kwargs,
863
- )
864
- validated_obs &= is_validated_obs
865
- non_validated_obs_modality[modality] = non_validated_obs
866
- if modality in non_validated_var_modality:
867
- non_validated_obs_modality[modality]["var_index"] = (
868
- non_validated_var_modality[modality]
869
- )
870
- if len(non_validated_obs_modality[modality]) > 0:
871
- self._non_validated = non_validated_obs_modality[modality]
872
- self._validated = validated_var and validated_obs
873
- return self._validated
874
-
875
- def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
876
- """Save the validated ``MuData`` and metadata.
877
-
878
- Args:
879
- description: Description of the ``MuData`` object.
880
- **kwargs: Object level metadata.
881
-
882
- Returns:
883
- A saved artifact record.
884
- """
885
- if not self._validated:
886
- raise ValidationError("Please run `validate()` first!")
887
-
888
- self._artifact = save_artifact(
889
- self._mdata,
890
- description=description,
891
- columns_field=self.var_index,
892
- fields=self.categoricals,
893
- **self._kwargs,
894
- **kwargs,
895
- )
896
- return self._artifact
897
-
898
-
899
- class Curator(BaseCurator):
900
- """Dataset curator.
901
-
902
- Data curation entails accurately labeling datasets with standardized metadata
903
- to facilitate data integration, interpretation and analysis.
904
-
905
- The curation flow has several steps:
906
-
907
- 1. Instantiate `Curator` from one of the following dataset objects:
908
-
909
- - :meth:`~lamindb.Curator.from_df`
910
- - :meth:`~lamindb.Curator.from_anndata`
911
- - :meth:`~lamindb.Curator.from_mudata`
912
-
913
- During object creation, any passed categoricals found in the object will be saved.
914
-
915
- 2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:
916
-
917
- - Values that can successfully validated and already exist in the registry.
918
- - Values which are new and not yet validated or potentially problematic values.
919
-
920
- 3. Determine how to handle validated and non-validated values:
921
-
922
- - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
923
- - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
924
- - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
925
- """
926
-
927
- @classmethod
928
- @doc_args(DataFrameCurator.__doc__)
929
- def from_df(
930
- cls,
931
- df: pd.DataFrame,
932
- categoricals: dict[str, FieldAttr] | None = None,
933
- columns: FieldAttr = Feature.name,
934
- using_key: str | None = None,
935
- verbosity: str = "hint",
936
- organism: str | None = None,
937
- ) -> DataFrameCurator:
938
- """{}""" # noqa: D415
939
- return DataFrameCurator(
940
- df=df,
941
- categoricals=categoricals,
942
- columns=columns,
943
- using_key=using_key,
944
- verbosity=verbosity,
945
- organism=organism,
946
- )
947
-
948
- @classmethod
949
- @doc_args(AnnDataCurator.__doc__)
950
- def from_anndata(
951
- cls,
952
- data: ad.AnnData | UPathStr,
953
- var_index: FieldAttr,
954
- categoricals: dict[str, FieldAttr] | None = None,
955
- obs_columns: FieldAttr = Feature.name,
956
- using_key: str = "default",
957
- verbosity: str = "hint",
958
- organism: str | None = None,
959
- sources: dict[str, Record] | None = None,
960
- ) -> AnnDataCurator:
961
- """{}""" # noqa: D415
962
- return AnnDataCurator(
963
- data=data,
964
- var_index=var_index,
965
- categoricals=categoricals,
966
- obs_columns=obs_columns,
967
- using_key=using_key,
968
- verbosity=verbosity,
969
- organism=organism,
970
- sources=sources,
971
- )
972
-
973
- @classmethod
974
- @doc_args(MuDataCurator.__doc__)
975
- def from_mudata(
976
- cls,
977
- mdata: MuData,
978
- var_index: dict[str, dict[str, FieldAttr]],
979
- categoricals: dict[str, FieldAttr] | None = None,
980
- using_key: str = "default",
981
- verbosity: str = "hint",
982
- organism: str | None = None,
983
- ) -> MuDataCurator:
984
- """{}""" # noqa: D415
985
- return MuDataCurator(
986
- mdata=mdata,
987
- var_index=var_index,
988
- categoricals=categoricals,
989
- using_key=using_key,
990
- verbosity=verbosity,
991
- organism=organism,
992
- )
993
-
994
-
995
- def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
996
- """Get a registry instance using a specific instance."""
997
- if using_key is not None and using_key != "default":
998
- return registry.using(using_key)
999
- return registry
1000
-
1001
-
1002
- def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
1003
- """Make sure the source and organism are saved in the same database as the registry."""
1004
- from lamindb.core._settings import settings
1005
-
1006
- db = registry.filter().db
1007
- source = kwargs.get("source")
1008
- organism = kwargs.get("organism")
1009
- filter_kwargs = kwargs.copy()
1010
- try:
1011
- verbosity = settings.verbosity
1012
- settings.verbosity = "error"
1013
- if isinstance(organism, Record) and organism._state.db != "default":
1014
- if db is None or db == "default":
1015
- organism_default = copy.copy(organism)
1016
- # save the organism record in the default database
1017
- organism_default.save()
1018
- filter_kwargs["organism"] = organism_default
1019
- if isinstance(source, Record) and source._state.db != "default":
1020
- if db is None or db == "default":
1021
- source_default = copy.copy(source)
1022
- # save the source record in the default database
1023
- source_default.save()
1024
- filter_kwargs["source"] = source_default
1025
- finally:
1026
- settings.verbosity = verbosity
1027
- return filter_kwargs
1028
-
1029
-
1030
- def standardize_and_inspect(
1031
- values: Iterable[str],
1032
- field: FieldAttr,
1033
- registry: type[Record],
1034
- standardize: bool = False,
1035
- exclude: str | list | None = None,
1036
- **kwargs,
1037
- ):
1038
- """Standardize and inspect values using a registry."""
1039
- # inspect exclude values in the default instance
1040
- values = list(values)
1041
- include_validated = []
1042
- if exclude is not None:
1043
- exclude = [exclude] if isinstance(exclude, str) else exclude
1044
- exclude = [i for i in exclude if i in values]
1045
- if len(exclude) > 0:
1046
- # exclude values are validated without source and organism
1047
- inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
1048
- # if exclude values are validated, remove them from the values
1049
- values = [i for i in values if i not in inspect_result_exclude.validated]
1050
- include_validated = inspect_result_exclude.validated
1051
-
1052
- if standardize:
1053
- if hasattr(registry, "standardize") and hasattr(
1054
- registry,
1055
- "synonyms", # https://github.com/laminlabs/lamindb/issues/1685
1056
- ):
1057
- standardized_values = registry.standardize(
1058
- values, field=field, mute=True, **kwargs
1059
- )
1060
- values = standardized_values
1061
-
1062
- inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
1063
- inspect_result._validated += include_validated
1064
- inspect_result._non_validated = [
1065
- i for i in inspect_result.non_validated if i not in include_validated
1066
- ]
1067
-
1068
- return inspect_result
1069
-
1070
-
1071
- def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
1072
- """Check if a registry needs an organism and return the organism name."""
1073
- if hasattr(registry, "organism_id"):
1074
- import bionty as bt
1075
-
1076
- if organism is None and bt.settings.organism is None:
1077
- raise ValueError(
1078
- f"{registry.__name__} registry requires an organism!\n"
1079
- " → please pass an organism name via organism="
1080
- )
1081
- return {"organism": organism or bt.settings.organism.name}
1082
- return {}
1083
-
1084
-
1085
- def validate_categories(
1086
- values: Iterable[str],
1087
- field: FieldAttr,
1088
- key: str,
1089
- using_key: str | None = None,
1090
- organism: str | None = None,
1091
- source: Record | None = None,
1092
- exclude: str | list | None = None,
1093
- standardize: bool = True,
1094
- validated_hint_print: str | None = None,
1095
- ) -> tuple[bool, list]:
1096
- """Validate ontology terms in a pandas series using LaminDB registries.
1097
-
1098
- Args:
1099
- values: The values to validate.
1100
- field: The field attribute.
1101
- key: The key referencing the slot in the DataFrame.
1102
- using_key: A reference LaminDB instance.
1103
- organism: The organism name.
1104
- source: The source record.
1105
- exclude: Exclude specific values.
1106
- standardize: Standardize the values.
1107
- validated_hint_print: The hint to print for validated values.
1108
- """
1109
- from lamindb._from_values import _print_values
1110
- from lamindb.core._settings import settings
1111
-
1112
- model_field = f"{field.field.model.__name__}.{field.field.name}"
1113
-
1114
- def _log_mapping_info():
1115
- logger.indent = ""
1116
- logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}")
1117
- logger.indent = " "
1118
-
1119
- registry = field.field.model
1120
-
1121
- kwargs = check_registry_organism(registry, organism)
1122
- kwargs.update({"source": source} if source else {})
1123
- kwargs_current = get_current_filter_kwargs(registry, kwargs)
1124
-
1125
- # inspect the default instance
1126
- inspect_result = standardize_and_inspect(
1127
- values=values,
1128
- field=field,
1129
- registry=registry,
1130
- standardize=standardize,
1131
- exclude=exclude,
1132
- **kwargs_current,
1133
- )
1134
- non_validated = inspect_result.non_validated
1135
-
1136
- # inspect the using instance
1137
- values_validated = []
1138
- if using_key is not None and using_key != "default" and non_validated:
1139
- registry_using = get_registry_instance(registry, using_key)
1140
- inspect_result = standardize_and_inspect(
1141
- values=non_validated,
1142
- field=field,
1143
- registry=registry_using,
1144
- standardize=standardize,
1145
- exclude=exclude,
1146
- **kwargs,
1147
- )
1148
- non_validated = inspect_result.non_validated
1149
- values_validated += inspect_result.validated
1150
-
1151
- # inspect from public (bionty only)
1152
- if hasattr(registry, "public"):
1153
- verbosity = settings.verbosity
1154
- try:
1155
- settings.verbosity = "error"
1156
- public_records = registry.from_values(
1157
- non_validated,
1158
- field=field,
1159
- **kwargs_current,
1160
- )
1161
- values_validated += [getattr(r, field.field.name) for r in public_records]
1162
- finally:
1163
- settings.verbosity = verbosity
1164
-
1165
- validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
1166
- n_validated = len(values_validated)
1167
- if n_validated > 0:
1168
- _log_mapping_info()
1169
- logger.warning(
1170
- f"found {colors.yellow(n_validated)} validated terms: "
1171
- f"{colors.yellow(values_validated)}\n → save terms via "
1172
- f"{colors.yellow(validated_hint_print)}"
1173
- )
1174
-
1175
- non_validated_hint_print = f".add_new_from('{key}')"
1176
- non_validated = [i for i in non_validated if i not in values_validated]
1177
- n_non_validated = len(non_validated)
1178
- if n_non_validated == 0:
1179
- if n_validated == 0:
1180
- logger.indent = ""
1181
- logger.success(f"{key} is validated against {colors.italic(model_field)}")
1182
- return True, []
1183
- else:
1184
- # validated values still need to be saved to the current instance
1185
- return False, []
1186
- else:
1187
- are = "are" if n_non_validated > 1 else "is"
1188
- print_values = _print_values(non_validated)
1189
- warning_message = (
1190
- f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
1191
- f"{colors.red(print_values)}\n → fix typos, remove non-existent values, or save terms via "
1192
- f"{colors.red(non_validated_hint_print)}"
1193
- )
1194
- if logger.indent == "":
1195
- _log_mapping_info()
1196
- logger.warning(warning_message)
1197
- logger.indent = ""
1198
- return False, non_validated
1199
-
1200
-
1201
- def validate_categories_in_df(
1202
- df: pd.DataFrame,
1203
- fields: dict[str, FieldAttr],
1204
- using_key: str | None = None,
1205
- sources: dict[str, Record] = None,
1206
- exclude: dict | None = None,
1207
- **kwargs,
1208
- ) -> tuple[bool, dict]:
1209
- """Validate categories in DataFrame columns using LaminDB registries."""
1210
- if not fields:
1211
- return True, {}
1212
-
1213
- if sources is None:
1214
- sources = {}
1215
- validated = True
1216
- non_validated = {}
1217
- for key, field in fields.items():
1218
- is_val, non_val = validate_categories(
1219
- df[key],
1220
- field=field,
1221
- key=key,
1222
- using_key=using_key,
1223
- source=sources.get(key),
1224
- exclude=exclude.get(key) if exclude else None,
1225
- **kwargs,
1226
- )
1227
- validated &= is_val
1228
- if len(non_val) > 0:
1229
- non_validated[key] = non_val
1230
- return validated, non_validated
1231
-
1232
-
1233
- def save_artifact(
1234
- data: pd.DataFrame | ad.AnnData | MuData,
1235
- fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
1236
- columns_field: FieldAttr | dict[str, FieldAttr],
1237
- description: str | None = None,
1238
- organism: str | None = None,
1239
- adata: ad.AnnData | None = None,
1240
- **kwargs,
1241
- ) -> Artifact:
1242
- """Save all metadata with an Artifact.
1243
-
1244
- Args:
1245
- data: The DataFrame or AnnData object to save.
1246
- description: A description of the artifact.
1247
- fields: A dictionary mapping obs_column to registry_field.
1248
- columns_field: The registry field to validate variables index against.
1249
- organism: The organism name.
1250
- adata: The AnnData object to save, must be provided if data is a path.
1251
- kwargs: Additional keyword arguments to pass to the registry model.
1252
-
1253
- Returns:
1254
- The saved Artifact.
1255
- """
1256
- from ._artifact import data_is_anndata
1257
-
1258
- artifact = None
1259
- if data_is_anndata(data):
1260
- assert adata is not None # noqa: S101
1261
- artifact = Artifact.from_anndata(data, description=description, **kwargs)
1262
- artifact.n_observations = adata.shape[0]
1263
- data = adata
1264
-
1265
- elif isinstance(data, pd.DataFrame):
1266
- artifact = Artifact.from_df(data, description=description, **kwargs)
1267
- else:
1268
- try:
1269
- from mudata import MuData
1270
-
1271
- if isinstance(data, MuData):
1272
- artifact = Artifact.from_mudata(data, description=description, **kwargs)
1273
- artifact.n_observations = data.n_obs
1274
- except ImportError:
1275
- pass
1276
- if artifact is None:
1277
- raise ValueError("data must be a DataFrame, AnnData or MuData object.")
1278
- artifact.save()
1279
-
1280
- feature_kwargs = check_registry_organism(
1281
- (
1282
- list(columns_field.values())[0].field.model
1283
- if isinstance(columns_field, dict)
1284
- else columns_field.field.model
1285
- ),
1286
- organism,
1287
- )
1288
-
1289
- if artifact._accessor == "DataFrame":
1290
- artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
1291
- elif artifact._accessor == "AnnData":
1292
- artifact.features._add_set_from_anndata(
1293
- var_field=columns_field, **feature_kwargs
1294
- )
1295
- elif artifact._accessor == "MuData":
1296
- artifact.features._add_set_from_mudata(
1297
- var_fields=columns_field, **feature_kwargs
1298
- )
1299
- else:
1300
- raise NotImplementedError
1301
-
1302
- def _add_labels(data, artifact: Artifact, fields: dict[str, FieldAttr]):
1303
- features = Feature.lookup().dict()
1304
- for key, field in fields.items():
1305
- feature = features.get(key)
1306
- registry = field.field.model
1307
- filter_kwargs = check_registry_organism(registry, organism)
1308
- filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
1309
- df = data if isinstance(data, pd.DataFrame) else data.obs
1310
- labels = registry.from_values(
1311
- df[key],
1312
- field=field,
1313
- **filter_kwargs_current,
1314
- )
1315
- artifact.labels.add(labels, feature)
1316
-
1317
- if artifact._accessor == "MuData":
1318
- for modality, modality_fields in fields.items():
1319
- if modality == "obs":
1320
- _add_labels(data, artifact, modality_fields)
1321
- else:
1322
- _add_labels(data[modality], artifact, modality_fields)
1323
- else:
1324
- _add_labels(data, artifact, fields)
1325
-
1326
- slug = ln_setup.settings.instance.slug
1327
- if ln_setup.settings.instance.is_remote: # pragma: no cover
1328
- logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
1329
- return artifact
1330
-
1331
-
1332
- def update_registry(
1333
- values: list[str],
1334
- field: FieldAttr,
1335
- key: str,
1336
- save_function: str = "add_new_from",
1337
- using_key: str | None = None,
1338
- validated_only: bool = True,
1339
- df: pd.DataFrame | None = None,
1340
- organism: str | None = None,
1341
- dtype: str | None = None,
1342
- source: Record | None = None,
1343
- standardize: bool = True,
1344
- warning: bool = True,
1345
- exclude: str | list | None = None,
1346
- **kwargs,
1347
- ) -> None:
1348
- """Save features or labels records in the default instance from the using_key instance.
1349
-
1350
- Args:
1351
- values: A list of values to be saved as labels.
1352
- field: The FieldAttr object representing the field for which labels are being saved.
1353
- key: The name of the feature to save.
1354
- save_function: The name of the function to save the labels.
1355
- using_key: The name of the instance from which to transfer labels (if applicable).
1356
- validated_only: If True, only save validated labels.
1357
- df: A DataFrame to save labels from.
1358
- organism: The organism name.
1359
- dtype: The type of the feature.
1360
- source: The source record.
1361
- kwargs: Additional keyword arguments to pass to the registry model to create new records.
1362
- """
1363
- from lamindb._save import save as ln_save
1364
- from lamindb.core._settings import settings
1365
-
1366
- registry = field.field.model
1367
- filter_kwargs = check_registry_organism(registry, organism)
1368
- filter_kwargs.update({"source": source} if source else {})
1369
-
1370
- verbosity = settings.verbosity
1371
- try:
1372
- settings.verbosity = "error"
1373
-
1374
- # save from public
1375
- filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
1376
- existing_and_public_records = (
1377
- registry.from_values(
1378
- list(values),
1379
- field=field,
1380
- **filter_kwargs_current,
1381
- )
1382
- if values
1383
- else []
1384
- )
1385
-
1386
- labels_saved: dict = {"from public": [], "without reference": []}
1387
-
1388
- public_records = [r for r in existing_and_public_records if r._state.adding]
1389
- # here we check to only save the public records if they are from the specified source
1390
- # we check the uid because r.source and soruce can be from different instances
1391
- if source:
1392
- public_records = [r for r in public_records if r.source.uid == source.uid]
1393
- ln_save(public_records)
1394
- labels_saved["from public"] = [
1395
- getattr(r, field.field.name) for r in public_records
1396
- ]
1397
- non_public_labels = [i for i in values if i not in labels_saved["from public"]]
1398
-
1399
- # inspect the default instance
1400
- inspect_result_current = standardize_and_inspect(
1401
- values=non_public_labels,
1402
- field=field,
1403
- registry=registry,
1404
- standardize=standardize,
1405
- exclude=exclude,
1406
- **filter_kwargs_current,
1407
- )
1408
- if not inspect_result_current.non_validated:
1409
- all_labels = registry.from_values(
1410
- inspect_result_current.validated,
1411
- field=field,
1412
- **filter_kwargs_current,
1413
- )
1414
- settings.verbosity = verbosity
1415
- return all_labels
1416
-
1417
- # inspect the using_key instance
1418
- (
1419
- labels_saved[f"from {using_key}"],
1420
- non_validated_labels,
1421
- ) = update_registry_from_using_instance(
1422
- inspect_result_current.non_validated,
1423
- field=field,
1424
- using_key=using_key,
1425
- exclude=exclude,
1426
- **filter_kwargs,
1427
- )
1428
-
1429
- labels_saved["without reference"] = [
1430
- i
1431
- for i in non_validated_labels
1432
- if i not in labels_saved[f"from {using_key}"]
1433
- ]
1434
-
1435
- # save non-validated records
1436
- if not validated_only:
1437
- non_validated_records = []
1438
- if df is not None and registry == Feature:
1439
- non_validated_records = Feature.from_df(df)
1440
- else:
1441
- if "organism" in filter_kwargs:
1442
- # make sure organism record is saved to the current instance
1443
- filter_kwargs["organism"] = _save_organism(name=organism)
1444
- init_kwargs = {}
1445
- for value in labels_saved["without reference"]:
1446
- init_kwargs[field.field.name] = value
1447
- if registry == Feature:
1448
- init_kwargs["dtype"] = "cat" if dtype is None else dtype
1449
- non_validated_records.append(
1450
- registry(
1451
- **init_kwargs,
1452
- **{k: v for k, v in filter_kwargs.items() if k != "source"},
1453
- **{k: v for k, v in kwargs.items() if k != "sources"},
1454
- )
1455
- )
1456
- ln_save(non_validated_records)
1457
-
1458
- # save parent labels for ulabels
1459
- if registry == ULabel and field.field.name == "name":
1460
- save_ulabels_with_parent(values, field=field, key=key)
1461
-
1462
- # # get all records that are now validated in the current instance
1463
- # all_labels = registry.from_values(
1464
- # inspect_result_current.validated + inspect_result_current.non_validated,
1465
- # field=field,
1466
- # **get_current_filter_kwargs(registry, filter_kwargs),
1467
- # )
1468
- finally:
1469
- settings.verbosity = verbosity
1470
-
1471
- log_saved_labels(
1472
- labels_saved,
1473
- key=key,
1474
- save_function=save_function,
1475
- model_field=f"{registry.__name__}.{field.field.name}",
1476
- validated_only=validated_only,
1477
- warning=warning,
1478
- )
1479
-
1480
- # return all_labels
1481
-
1482
-
1483
- def log_saved_labels(
1484
- labels_saved: dict,
1485
- key: str,
1486
- save_function: str,
1487
- model_field: str,
1488
- validated_only: bool = True,
1489
- warning: bool = True,
1490
- ) -> None:
1491
- """Log the saved labels."""
1492
- from ._from_values import _print_values
1493
-
1494
- model_field = colors.italic(model_field)
1495
- for k, labels in labels_saved.items():
1496
- if not labels:
1497
- continue
1498
-
1499
- if k == "without reference" and validated_only:
1500
- msg = colors.yellow(
1501
- f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
1502
- )
1503
- lookup_print = (
1504
- f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
1505
- )
1506
-
1507
- hint = f".add_new_from('{key}')"
1508
- msg += f"\n → to lookup values, use {lookup_print}"
1509
- msg += (
1510
- f"\n → to save, run {colors.yellow(hint)}"
1511
- if save_function == "add_new_from"
1512
- else f"\n → to save, run {colors.yellow(save_function)}"
1513
- )
1514
- if warning:
1515
- logger.warning(msg)
1516
- else:
1517
- logger.info(msg)
1518
- else:
1519
- k = "" if k == "without reference" else f"{colors.green(k)} "
1520
- # the term "transferred" stresses that this is always in the context of transferring
1521
- # labels from a public ontology or a different instance to the present instance
1522
- s = "s" if len(labels) > 1 else ""
1523
- logger.success(
1524
- f"added {len(labels)} record{s} {k}with {model_field} for {colors.italic(key)}: {_print_values(labels)}"
1525
- )
1526
-
1527
-
1528
- def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None:
1529
- """Save a parent label for the given labels."""
1530
- registry = field.field.model
1531
- assert registry == ULabel # noqa: S101
1532
- all_records = registry.from_values(list(values), field=field)
1533
- is_feature = registry.filter(name=f"is_{key}").one_or_none()
1534
- if is_feature is None:
1535
- is_feature = registry(name=f"is_{key}")
1536
- is_feature.save()
1537
- is_feature.children.add(*all_records)
1538
-
1539
-
1540
- def update_registry_from_using_instance(
1541
- values: list[str],
1542
- field: FieldAttr,
1543
- using_key: str | None = None,
1544
- standardize: bool = False,
1545
- exclude: str | list | None = None,
1546
- **kwargs,
1547
- ) -> tuple[list[str], list[str]]:
1548
- """Save features or labels records from the using_key instance.
1549
-
1550
- Args:
1551
- values: A list of values to be saved as labels.
1552
- field: The FieldAttr object representing the field for which labels are being saved.
1553
- using_key: The name of the instance from which to transfer labels (if applicable).
1554
- standardize: Whether to also standardize the values.
1555
- kwargs: Additional keyword arguments to pass to the registry model.
1556
-
1557
- Returns:
1558
- A tuple containing the list of saved labels and the list of non-saved labels.
1559
- """
1560
- labels_saved = []
1561
- not_saved = values
1562
-
1563
- if using_key is not None and using_key != "default":
1564
- registry_using = get_registry_instance(field.field.model, using_key)
1565
-
1566
- inspect_result_using = standardize_and_inspect(
1567
- values=values,
1568
- field=field,
1569
- registry=registry_using,
1570
- standardize=standardize,
1571
- exclude=exclude,
1572
- **kwargs,
1573
- )
1574
- labels_using = registry_using.filter(
1575
- **{f"{field.field.name}__in": inspect_result_using.validated}
1576
- ).all()
1577
- for label_using in labels_using:
1578
- label_using.save()
1579
- labels_saved.append(getattr(label_using, field.field.name))
1580
- not_saved = inspect_result_using.non_validated
1581
-
1582
- return labels_saved, not_saved
1583
-
1584
-
1585
- def _save_organism(name: str): # pragma: no cover
1586
- """Save an organism record."""
1587
- import bionty as bt
1588
-
1589
- organism = bt.Organism.filter(name=name).one_or_none()
1590
- if organism is None:
1591
- organism = bt.Organism.from_source(name=name)
1592
- if organism is None:
1593
- raise ValueError(
1594
- f"Organism '{name}' not found\n"
1595
- f" → please save it: bt.Organism(name='{name}').save()"
1596
- )
1597
- organism.save()
1598
- return organism
1599
-
1600
-
1601
- Curate = Curator # backward compat
1
+ from __future__ import annotations
2
+
3
+ import copy
4
+ from typing import TYPE_CHECKING
5
+
6
+ import anndata as ad
7
+ import lamindb_setup as ln_setup
8
+ import pandas as pd
9
+ from lamin_utils import colors, logger
10
+ from lamindb_setup.core._docs import doc_args
11
+ from lnschema_core import (
12
+ Artifact,
13
+ Feature,
14
+ Record,
15
+ Run,
16
+ ULabel,
17
+ )
18
+
19
+ from .core.exceptions import ValidationError
20
+
21
+ if TYPE_CHECKING:
22
+ from collections.abc import Iterable
23
+
24
+ from lamindb_setup.core.types import UPathStr
25
+ from lnschema_core.types import FieldAttr
26
+ from mudata import MuData
27
+
28
+
29
+ class CurateLookup:
30
+ """Lookup categories from the reference instance."""
31
+
32
+ def __init__(
33
+ self,
34
+ categoricals: dict[str, FieldAttr],
35
+ slots: dict[str, FieldAttr] = None,
36
+ using_key: str | None = None,
37
+ ) -> None:
38
+ if slots is None:
39
+ slots = {}
40
+ self._fields = {**categoricals, **slots}
41
+ self._using_key = None if using_key == "default" else using_key
42
+ self._using_key_name = self._using_key or ln_setup.settings.instance.slug
43
+ debug_message = (
44
+ f"Lookup objects from the " f"{colors.italic(self._using_key_name)}"
45
+ )
46
+ logger.debug(debug_message)
47
+
48
+ def __getattr__(self, name):
49
+ if name in self._fields:
50
+ registry = self._fields[name].field.model
51
+ if self._using_key == "public":
52
+ return registry.public().lookup()
53
+ else:
54
+ return get_registry_instance(registry, self._using_key).lookup()
55
+ raise AttributeError(
56
+ f"'{self.__class__.__name__}' object has no attribute '{name}'"
57
+ )
58
+
59
+ def __getitem__(self, name):
60
+ if name in self._fields:
61
+ registry = self._fields[name].field.model
62
+ if self._using_key == "public":
63
+ return registry.public().lookup()
64
+ else:
65
+ return get_registry_instance(registry, self._using_key).lookup()
66
+ raise AttributeError(
67
+ f"'{self.__class__.__name__}' object has no attribute '{name}'"
68
+ )
69
+
70
+ def __repr__(self) -> str:
71
+ if len(self._fields) > 0:
72
+ getattr_keys = "\n ".join(
73
+ [f".{key}" for key in self._fields if key.isidentifier()]
74
+ )
75
+ getitem_keys = "\n ".join(
76
+ [str([key]) for key in self._fields if not key.isidentifier()]
77
+ )
78
+ return (
79
+ f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
80
+ f"{colors.green(getattr_keys)}\n "
81
+ f"{colors.green(getitem_keys)}\n\n"
82
+ "Example:\n → categories = validator.lookup().cell_type\n"
83
+ " → categories.alveolar_type_1_fibroblast_cell"
84
+ )
85
+ else: # pragma: no cover
86
+ return colors.warning("No fields are found!")
87
+
88
+
89
+ class BaseCurator:
90
+ """Curate a dataset."""
91
+
92
+ def validate(self) -> bool:
93
+ """Validate dataset.
94
+
95
+ Returns:
96
+ Boolean indicating whether the dataset is validated.
97
+ """
98
+ pass
99
+
100
+ def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
101
+ """Save the dataset as artifact.
102
+
103
+ Args:
104
+ description: Description of the DataFrame object.
105
+ **kwargs: Object level metadata.
106
+
107
+ Returns:
108
+ A saved artifact record.
109
+ """
110
+ pass
111
+
112
+
113
class DataFrameCurator(BaseCurator):
    """Curation flow for a DataFrame object.

    See also :class:`~lamindb.Curator`.

    Args:
        df: The DataFrame object to curate.
        columns: The field attribute for the feature column.
        categoricals: A dictionary mapping column names to registry_field.
        using_key: The reference instance containing registries to validate against.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping column names to Source records.
        exclude: A dictionary mapping column names to values to exclude.
        check_valid_keys: Whether the keys of ``categoricals``, ``sources`` and
            ``exclude`` are checked against the DataFrame columns on creation.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_df(
        ...     df,
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     }
        ... )
    """

    def __init__(
        self,
        df: pd.DataFrame,
        columns: FieldAttr = Feature.name,
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
        check_valid_keys: bool = True,
    ) -> None:
        from lamindb.core._settings import settings

        # The verbosity is written to the shared settings object, not kept per curator.
        settings.verbosity = verbosity

        self._df = df
        self._columns_field = columns
        self._fields = categoricals or {}
        self._using_key = using_key
        # Only remember the organism when one was actually given.
        self._kwargs = {"organism": organism} if organism else {}
        self._sources = {} if sources is None else sources
        self._exclude = {} if exclude is None else exclude

        # Bookkeeping that validate() / save_artifact() fill in later.
        self._artifact = None
        self._collection = None
        self._validated = False
        self._non_validated = None

        if check_valid_keys:
            self._check_valid_keys()
        self._save_columns()

    @property
    def non_validated(self) -> list:
        """Return the non-validated features and labels.

        Raises:
            ValueError: If no validation result is available yet.
        """
        result = self._non_validated
        if result is None:
            raise ValueError("Please run validate() first!")
        return result

    @property
    def fields(self) -> dict:
        """Return the columns fields to validate against."""
        return self._fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        target_key = using_key or self._using_key
        return CurateLookup(
            categoricals=self._fields,
            slots={"columns": self._columns_field},
            using_key=target_key,
        )

    def _check_valid_keys(self, extra: set = None) -> None:
        """Reject keys that are neither DataFrame columns, ``"columns"`` nor in ``extra``.

        Raises:
            TypeError: If categoricals, sources or exclude is not a dictionary.
            ValueError: If one of them contains a key that is not allowed.
        """
        extra_keys = set() if extra is None else extra
        checked = {
            "categoricals": self._fields,
            "sources": self._sources,
            "exclude": self._exclude,
        }
        for name, mapping in checked.items():
            if not isinstance(mapping, dict):
                raise TypeError(f"{name} must be a dictionary!")
            valid_keys = set(self._df.columns) | {"columns"} | extra_keys
            nonval_keys = [key for key in mapping if key not in valid_keys]
            if nonval_keys:
                raise ValueError(
                    f"the following keys passed to {name} are not allowed: {nonval_keys}"
                )

    def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
        """Save column name records."""
        column_source = self._sources.get("columns")
        column_exclude = self._exclude.get("columns")

        # Columns declared as categoricals are always saved, whatever validated_only says.
        declared_columns = list(self.fields.keys())
        update_registry(
            values=declared_columns,
            field=self._columns_field,
            key="columns",
            save_function="add_new_from_columns",
            using_key=self._using_key,
            validated_only=False,
            source=column_source,
            exclude=column_exclude,
            **kwargs,
        )

        # The remaining columns follow validated_only.
        additional_columns = set(self._df.columns) - set(self.fields.keys())
        if not additional_columns:
            return
        update_registry(
            values=list(additional_columns),
            field=self._columns_field,
            key="columns",
            save_function="add_new_from_columns",
            using_key=self._using_key,
            validated_only=validated_only,
            df=self._df,  # Get the Feature type from df
            source=column_source,
            exclude=column_exclude,
            warning=False,  # Do not warn about missing columns, just an info message
            **kwargs,
        )

    def add_validated_from(self, key: str, organism: str | None = None):
        """Add validated categories.

        Args:
            key: The key referencing the slot in the DataFrame.
            organism: The organism name.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._update_registry(key, validated_only=True, **self._kwargs)

    def add_new_from(self, key: str, organism: str | None = None, **kwargs):
        """Add validated & new categories.

        Args:
            key: The key referencing the slot in the DataFrame from which to draw terms.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.

        Raises:
            ValueError: If extra keyword arguments are combined with the ``"all"`` key.
        """
        if key == "all" and len(kwargs) > 0:
            raise ValueError("Cannot pass additional arguments to 'all' key!")
        if organism:
            self._kwargs["organism"] = organism
        self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)

    def add_new_from_columns(self, organism: str | None = None, **kwargs):
        """Add validated & new column names to its registry.

        Args:
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._save_columns(validated_only=False, **self._kwargs, **kwargs)

    def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
        """Dispatch a registry update for ``"all"``, ``"columns"`` or one categorical."""
        if categorical == "all":
            self._update_registry_all(validated_only=validated_only, **kwargs)
            return
        if categorical == "columns":
            self._save_columns(validated_only=validated_only, **kwargs)
            return
        if categorical not in self.fields:
            raise ValueError(f"Feature {categorical} is not part of the fields!")
        unique_values = self._df[categorical].unique().tolist()
        update_registry(
            values=unique_values,
            field=self.fields[categorical],
            key=categorical,
            using_key=self._using_key,
            validated_only=validated_only,
            source=self._sources.get(categorical),
            exclude=self._exclude.get(categorical),
            **kwargs,
        )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Save labels for all features."""
        for feature_name in self.fields.keys():
            logger.info(f"saving labels for '{feature_name}'")
            self._update_registry(
                feature_name, validated_only=validated_only, **kwargs
            )

    def validate(self, organism: str | None = None) -> bool:
        """Validate variables and categorical observations.

        Args:
            organism: The organism name.

        Returns:
            Whether the DataFrame is validated.
        """
        if organism:
            self._kwargs["organism"] = organism
        outcome = validate_categories_in_df(  # type: ignore
            self._df,
            fields=self.fields,
            using_key=self._using_key,
            sources=self._sources,
            exclude=self._exclude,
            **self._kwargs,
        )
        self._validated, self._non_validated = outcome
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated DataFrame and metadata.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If the dataset still does not validate after
                running :meth:`validate`.
        """
        from lamindb.core._settings import settings

        # Validate lazily; give up only when that validation run fails as well.
        if not self._validated:
            self.validate()
            if not self._validated:
                raise ValidationError("Dataset does not validate. Please curate.")

        # Make sure all labels are saved in the current instance
        previous_verbosity = settings.verbosity
        try:
            settings.verbosity = "warning"
            # save all validated records to the current instance
            self.add_validated_from("all")

            self._artifact = save_artifact(
                self._df,
                description=description,
                fields=self.fields,
                columns_field=self._columns_field,
                **kwargs,
                **self._kwargs,
            )
        finally:
            # Restore the caller's verbosity even when saving fails.
            settings.verbosity = previous_verbosity

        return self._artifact

    def clean_up_failed_runs(self):
        """Clean up previous failed runs that don't save any outputs."""
        from lamindb.core._context import context

        if context.run is None:
            return
        stale_runs = Run.filter(
            transform=context.run.transform, output_artifacts=None
        ).exclude(uid=context.run.uid)
        stale_runs.delete()
371
+
372
+
373
class AnnDataCurator(DataFrameCurator):
    """Curation flow for ``AnnData``.

    See also :class:`~lamindb.Curator`.

    Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.

    See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.

    Args:
        data: The AnnData object or an AnnData-like path.
        var_index: The registry field for mapping the ``.var`` index.
        categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
        obs_columns: The registry field for mapping the ``.obs`` column names.
        using_key: A reference LaminDB instance.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping ``.obs.columns`` to Source records.
        exclude: A dictionary mapping column names to values to exclude.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_anndata(
        ...     adata,
        ...     var_index=bt.Gene.ensembl_gene_id,
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     },
        ...     organism="human",
        ... )
    """

    def __init__(
        self,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> None:
        from lamindb_setup.core import upath

        from ._artifact import data_is_anndata

        if sources is None:
            sources = {}
        if not data_is_anndata(data):
            raise ValueError(
                "data has to be an AnnData object or a path to AnnData-like"
            )
        if isinstance(data, ad.AnnData):
            self._adata = data
        else:  # pragma: no cover
            from lamindb.core.storage._backed_access import backed_access

            # A path was given: work on the object returned by backed_access.
            self._adata = backed_access(upath.create_path(data))

        # Keep the original argument; it is what save_artifact() hands on.
        self._data = data
        self._var_field = var_index
        # Key checking is deferred so that "var_index" can be allowed as a key below.
        super().__init__(
            df=self._adata.obs,
            categoricals=categoricals,
            columns=obs_columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
            check_valid_keys=False,
        )
        self._obs_fields = categoricals or {}
        self._check_valid_keys(extra={"var_index"})

    @property
    def var_index(self) -> FieldAttr:
        """Return the registry field to validate variables index against."""
        return self._var_field

    @property
    def categoricals(self) -> dict:
        """Return the obs fields to validate against."""
        return self._obs_fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        return CurateLookup(
            categoricals=self._obs_fields,
            slots={"columns": self._columns_field, "var_index": self._var_field},
            using_key=using_key or self._using_key,
        )

    def _save_from_var_index(
        self, validated_only: bool = True, organism: str | None = None, **kwargs
    ):
        """Save variable records.

        Args:
            validated_only: Forwarded to ``update_registry``.
            organism: The organism name.
            **kwargs: Additional keyword arguments forwarded to ``update_registry``.
                Accepting them keeps :meth:`add_new_from_var_index`, which
                forwards its own ``**kwargs`` here, from failing with a
                ``TypeError``.
        """
        update_registry(
            values=list(self._adata.var.index),
            field=self.var_index,
            key="var_index",
            save_function="add_new_from_var_index",
            using_key=self._using_key,
            validated_only=validated_only,
            organism=organism,
            source=self._sources.get("var_index"),
            exclude=self._exclude.get("var_index"),
            **kwargs,
        )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Save labels for all features."""
        for name in self.fields.keys():
            logger.info(f"saving labels for '{name}'")
            if name == "var_index":
                # The variable index lives on ``.var``, not in an ``.obs`` column.
                self._save_from_var_index(validated_only=validated_only, **kwargs)
            else:
                self._update_registry(name, validated_only=validated_only, **kwargs)

    def add_new_from_var_index(self, organism: str | None = None, **kwargs):
        """Update variable records.

        Args:
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)

    def add_validated_from_var_index(self, organism: str | None = None):
        """Add validated variable records.

        Args:
            organism: The organism name.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        self._save_from_var_index(validated_only=True, **self._kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate categories.

        Args:
            organism: The organism name.

        Returns:
            Whether the AnnData object is validated.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        if self._using_key is not None and self._using_key != "default":
            logger.important(
                f"validating metadata using registries of instance {colors.italic(self._using_key)}"
            )

        validated_var, non_validated_var = validate_categories(
            self._adata.var.index,
            field=self._var_field,
            key="var_index",
            using_key=self._using_key,
            source=self._sources.get("var_index"),
            validated_hint_print=".add_validated_from_var_index()",
            exclude=self._exclude.get("var_index"),
            **self._kwargs,  # type: ignore
        )
        validated_obs, non_validated_obs = validate_categories_in_df(
            self._adata.obs,
            fields=self.categoricals,
            using_key=self._using_key,
            sources=self._sources,
            exclude=self._exclude,
            **self._kwargs,
        )
        # Report the obs result and add the var index under its own key when needed.
        self._non_validated = non_validated_obs  # type: ignore
        if len(non_validated_var) > 0:
            self._non_validated["var_index"] = non_validated_var  # type: ignore
        self._validated = validated_var and validated_obs
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated ``AnnData`` and metadata.

        Args:
            description: Description of the ``AnnData`` object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If the dataset still does not validate after
                running :meth:`validate`.
        """
        if not self._validated:
            self.validate()
            if not self._validated:
                raise ValidationError("Dataset does not validate. Please curate.")

        self._artifact = save_artifact(
            self._data,
            adata=self._adata,
            description=description,
            columns_field=self.var_index,
            fields=self.categoricals,
            **self._kwargs,
            **kwargs,
        )
        return self._artifact
582
+
583
+
584
class MuDataCurator:
    """Curation flow for a ``MuData`` object.

    See also :class:`~lamindb.Curator`.

    Note that if genes or other measurements are removed from the MuData object,
    the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.

    Args:
        mdata: The MuData object to curate.
        var_index: The registry field for mapping the ``.var`` index for each modality.
            For example:
            ``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
        categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
            Use modality keys to specify categoricals for MuData slots such as ``"rna:cell_type": bt.CellType.name``.
        using_key: A reference LaminDB instance.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping ``.obs.columns`` to Source records.
        exclude: A dictionary mapping column names to values to exclude.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_mudata(
        ...     mdata,
        ...     var_index={
        ...         "rna": bt.Gene.ensembl_gene_id,
        ...         "adt": ln.CellMarker.name
        ...     },
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     },
        ...     organism="human",
        ... )
    """

    def __init__(
        self,
        mdata: MuData,
        var_index: dict[str, dict[str, FieldAttr]],
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> None:
        if sources is None:
            sources = {}
        self._sources = sources
        if exclude is None:
            exclude = {}
        self._exclude = exclude
        self._mdata = mdata
        self._kwargs = {"organism": organism} if organism else {}
        self._var_fields = var_index
        self._verify_modality(self._var_fields.keys())
        self._obs_fields = self._parse_categoricals(categoricals)
        self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
        self._using_key = using_key
        self._verbosity = verbosity
        # Initialise the state read by save_artifact()/validate() so that calling
        # save_artifact() first raises the intended ValidationError rather than
        # an AttributeError.
        self._artifact = None
        self._validated = False
        self._non_validated = None
        # One DataFrameCurator per modality; the key "obs" stands for the global ``mdata.obs``.
        self._df_annotators = {
            modality: DataFrameCurator(
                df=mdata[modality].obs if modality != "obs" else mdata.obs,
                categoricals=self._obs_fields.get(modality, {}),
                using_key=using_key,
                verbosity=verbosity,
                sources=self._sources.get(modality),
                exclude=self._exclude.get(modality),
                check_valid_keys=False,
                **self._kwargs,
            )
            for modality in self._modalities
        }
        for modality in self._var_fields.keys():
            self._save_from_var_index_modality(
                modality=modality, validated_only=True, **self._kwargs
            )

    @property
    def var_index(self) -> dict[str, FieldAttr]:
        """Return the per-modality registry fields to validate the variables index against."""
        return self._var_fields

    @property
    def categoricals(self) -> dict:
        """Return the obs fields to validate against."""
        return self._obs_fields

    def _verify_modality(self, modalities: Iterable[str]):
        """Verify the modality exists.

        Raises:
            ValueError: If a modality is not a key of ``mdata.mod``.
        """
        for modality in modalities:
            if modality not in self._mdata.mod.keys():
                raise ValueError(f"modality '{modality}' does not exist!")

    def _save_from_var_index_modality(
        self, modality: str, validated_only: bool = True, **kwargs
    ):
        """Save variable records."""
        update_registry(
            values=list(self._mdata[modality].var.index),
            field=self._var_fields[modality],
            key="var_index",
            save_function="add_new_from_var_index",
            using_key=self._using_key,
            validated_only=validated_only,
            dtype="number",
            source=self._sources.get(modality, {}).get("var_index"),
            exclude=self._exclude.get(modality, {}).get("var_index"),
            **kwargs,
        )

    def _parse_categoricals(
        self, categoricals: dict[str, FieldAttr] | None
    ) -> dict:
        """Parse the categorical fields.

        Groups the flat ``categoricals`` mapping by modality: keys of the form
        ``"<modality>:<column>"`` go under ``<modality>``, all other keys go
        under ``"obs"``.

        Args:
            categoricals: Mapping of ``mdata.obs`` column names to registry
                fields; ``None`` is treated like an empty mapping.

        Returns:
            A dictionary ``{modality_or_"obs": {column: field}}``.

        Raises:
            ValueError: If a key is not a column of ``mdata.obs``.
        """
        obs_fields: dict[str, dict[str, FieldAttr]] = {}
        # ``categoricals`` is optional in __init__; nothing to parse in that case.
        if not categoricals:
            return obs_fields
        prefixes = tuple(f"{k}:" for k in self._mdata.mod.keys())
        for k, v in categoricals.items():
            if k not in self._mdata.obs.columns:
                raise ValueError(f"column '{k}' does not exist in mdata.obs!")
            if k.startswith(prefixes):
                # Split on the first ":" only so that column names containing
                # ":" are kept intact.
                modality, col = k.split(":", 1)
                obs_fields.setdefault(modality, {})[col] = v
            else:
                obs_fields.setdefault("obs", {})[k] = v
        return obs_fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        return CurateLookup(
            categoricals=self._obs_fields,
            slots={
                **self._obs_fields,
                **{f"{k}_var_index": v for k, v in self._var_fields.items()},
            },
            using_key=using_key or self._using_key,
        )

    def add_new_from_columns(
        self,
        modality: str,
        column_names: list[str] | None = None,
        organism: str | None = None,
        **kwargs,
    ):
        """Update columns records.

        Args:
            modality: The modality name.
            column_names: The column names to save.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        # Fall back to every obs column of the modality when none are named.
        values = column_names or self._mdata[modality].obs.columns
        update_registry(
            values=list(values),
            field=Feature.name,
            key=f"{modality} obs columns",
            using_key=self._using_key,
            validated_only=False,
            df=self._mdata[modality].obs,
            source=self._sources.get(modality, {}).get("columns"),
            exclude=self._exclude.get(modality, {}).get("columns"),
            **self._kwargs,  # type: ignore
            **kwargs,
        )

    def add_new_from_var_index(
        self, modality: str, organism: str | None = None, **kwargs
    ):
        """Update variable records.

        Args:
            modality: The modality name.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        self._save_from_var_index_modality(
            modality=modality, validated_only=False, **self._kwargs, **kwargs
        )

    def add_validated_from_var_index(self, modality: str, organism: str | None = None):
        """Add validated variable records.

        Args:
            modality: The modality name.
            organism: The organism name.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        self._save_from_var_index_modality(
            modality=modality, validated_only=True, **self._kwargs
        )

    def add_validated_from(
        self, key: str, modality: str | None = None, organism: str | None = None
    ):
        """Add validated categories.

        Args:
            key: The key referencing the slot in the DataFrame.
            modality: The modality name.
            organism: The organism name.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        modality = modality or "obs"
        # A modality without a curator is skipped without raising.
        if modality in self._df_annotators:
            df_annotator = self._df_annotators[modality]
            df_annotator.add_validated_from(key=key, **self._kwargs)

    def add_new_from(
        self,
        key: str,
        modality: str | None = None,
        organism: str | None = None,
        **kwargs,
    ):
        """Add validated & new categories.

        Args:
            key: The key referencing the slot in the DataFrame.
            modality: The modality name.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.

        Raises:
            ValueError: If extra keyword arguments are combined with the ``"all"`` key.
        """
        if len(kwargs) > 0 and key == "all":
            raise ValueError("Cannot pass additional arguments to 'all' key!")
        self._kwargs.update({"organism": organism} if organism else {})
        modality = modality or "obs"
        # A modality without a curator is skipped without raising.
        if modality in self._df_annotators:
            df_annotator = self._df_annotators[modality]
            df_annotator.add_new_from(key=key, **self._kwargs, **kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate categories.

        Args:
            organism: The organism name.

        Returns:
            Whether all variable indices and all obs categoricals validated.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        if self._using_key is not None and self._using_key != "default":
            logger.important(
                f"validating metadata using registries of instance {colors.italic(self._using_key)}"
            )
        validated_var = True
        non_validated_var_modality = {}
        for modality, var_field in self._var_fields.items():
            is_validated_var, non_validated_var = validate_categories(
                self._mdata[modality].var.index,
                field=var_field,
                key=f"{modality}_var_index",
                using_key=self._using_key,
                source=self._sources.get(modality, {}).get("var_index"),
                exclude=self._exclude.get(modality, {}).get("var_index"),
                **self._kwargs,  # type: ignore
            )
            validated_var &= is_validated_var
            if len(non_validated_var) > 0:
                non_validated_var_modality[modality] = non_validated_var

        validated_obs = True
        non_validated_obs_modality = {}
        for modality, fields in self._obs_fields.items():
            if modality == "obs":
                obs = self._mdata.obs
            else:
                obs = self._mdata[modality].obs
            is_validated_obs, non_validated_obs = validate_categories_in_df(
                obs,
                fields=fields,
                using_key=self._using_key,
                sources=self._sources.get(modality),
                exclude=self._exclude.get(modality),
                **self._kwargs,
            )
            validated_obs &= is_validated_obs
            non_validated_obs_modality[modality] = non_validated_obs
            if modality in non_validated_var_modality:
                non_validated_obs_modality[modality]["var_index"] = (
                    non_validated_var_modality[modality]
                )
            # NOTE(review): each modality overwrites the previous value, so only
            # the last modality with non-validated entries is kept - confirm
            # whether this is intended.
            if len(non_validated_obs_modality[modality]) > 0:
                self._non_validated = non_validated_obs_modality[modality]
        self._validated = validated_var and validated_obs
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated ``MuData`` and metadata.

        Args:
            description: Description of the ``MuData`` object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If :meth:`validate` has not succeeded yet.
        """
        if not self._validated:
            raise ValidationError("Please run `validate()` first!")

        self._artifact = save_artifact(
            self._mdata,
            description=description,
            columns_field=self.var_index,
            fields=self.categoricals,
            **self._kwargs,
            **kwargs,
        )
        return self._artifact
899
+
900
+
901
class Curator(BaseCurator):
    """Dataset curator.

    Data curation entails accurately labeling datasets with standardized metadata
    to facilitate data integration, interpretation and analysis.

    The curation flow has several steps:

    1. Instantiate `Curator` from one of the following dataset objects:

    - :meth:`~lamindb.Curator.from_df`
    - :meth:`~lamindb.Curator.from_anndata`
    - :meth:`~lamindb.Curator.from_mudata`

    During object creation, any passed categoricals found in the object will be saved.

    2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:

    - Values that can be successfully validated and already exist in the registry.
    - Values which are new and not yet validated or potentially problematic values.

    3. Determine how to handle validated and non-validated values:

    - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
    - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
    - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
    """

    # The factory docstrings are injected from the curator classes via ``doc_args``.
    # Those docstrings list ``sources`` and ``exclude``, so every factory accepts
    # and forwards both; they are trailing optional parameters, so existing
    # positional calls are unaffected.

    @classmethod
    @doc_args(DataFrameCurator.__doc__)
    def from_df(
        cls,
        df: pd.DataFrame,
        categoricals: dict[str, FieldAttr] | None = None,
        columns: FieldAttr = Feature.name,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> DataFrameCurator:
        """{}"""  # noqa: D415
        return DataFrameCurator(
            df=df,
            categoricals=categoricals,
            columns=columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
        )

    @classmethod
    @doc_args(AnnDataCurator.__doc__)
    def from_anndata(
        cls,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> AnnDataCurator:
        """{}"""  # noqa: D415
        return AnnDataCurator(
            data=data,
            var_index=var_index,
            categoricals=categoricals,
            obs_columns=obs_columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
        )

    @classmethod
    @doc_args(MuDataCurator.__doc__)
    def from_mudata(
        cls,
        mdata: MuData,
        var_index: dict[str, dict[str, FieldAttr]],
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> MuDataCurator:
        """{}"""  # noqa: D415
        return MuDataCurator(
            mdata=mdata,
            var_index=var_index,
            categoricals=categoricals,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
        )
995
+
996
+
997
def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
    """Return the registry to query for a given instance key.

    Args:
        registry: The registry to resolve.
        using_key: The instance key; ``None`` and ``"default"`` leave the
            registry untouched.

    Returns:
        ``registry`` itself, or the result of ``registry.using(using_key)`` for
        any other key.
    """
    if using_key is None or using_key == "default":
        return registry
    return registry.using(using_key)
1002
+
1003
+
1004
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
    """Make sure the source and organism are saved in the same database as the registry.

    Args:
        registry: The registry whose query database (``registry.filter().db``)
            decides whether records have to be copied.
        kwargs: Filter keyword arguments; only ``"organism"`` and ``"source"``
            are inspected. The dictionary itself is not modified.

    Returns:
        A shallow copy of ``kwargs``. When the registry queries the default
        database, any ``organism``/``source`` record whose ``_state.db`` is not
        ``"default"`` is replaced by a copy on which ``save()`` was called.
    """
    from lamindb.core._settings import settings

    db = registry.filter().db
    registry_uses_default_db = db is None or db == "default"
    filter_kwargs = kwargs.copy()
    # Capture the verbosity before entering ``try`` so that ``finally`` always
    # has a value to restore and cannot fail on an unbound local.
    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"
        # Organism first, then source - same order as the records are looked up.
        for key in ("organism", "source"):
            record = kwargs.get(key)
            if not isinstance(record, Record) or record._state.db == "default":
                continue
            if registry_uses_default_db:
                record_default = copy.copy(record)
                # save the record in the default database
                record_default.save()
                filter_kwargs[key] = record_default
    finally:
        settings.verbosity = verbosity
    return filter_kwargs
1030
+
1031
+
1032
def standardize_and_inspect(
    values: Iterable[str],
    field: FieldAttr,
    registry: type[Record],
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
):
    """Standardize and inspect values using a registry.

    Args:
        values: The values to inspect.
        field: The field attribute passed to the registry calls.
        registry: The registry providing ``inspect`` and, optionally,
            ``standardize``.
        standardize: Whether to standardize the values before inspecting them.
        exclude: A value or list of values that is inspected separately,
            without ``**kwargs``.
        **kwargs: Forwarded to ``registry.standardize`` and to the main
            ``registry.inspect`` call.

    Returns:
        The result of the main ``registry.inspect`` call, with the validated
        excluded values added to ``_validated`` and removed from
        ``_non_validated``.
    """
    remaining = list(values)
    include_validated = []

    # inspect exclude values in the default instance
    if exclude is not None:
        requested = [exclude] if isinstance(exclude, str) else exclude
        present = [item for item in requested if item in remaining]
        if len(present) > 0:
            # exclude values are validated without source and organism
            excluded_result = registry.inspect(present, field=field, mute=True)
            # if exclude values are validated, remove them from the values
            remaining = [
                item for item in remaining if item not in excluded_result.validated
            ]
            include_validated = excluded_result.validated

    # https://github.com/laminlabs/lamindb/issues/1685
    if (
        standardize
        and hasattr(registry, "standardize")
        and hasattr(registry, "synonyms")
    ):
        remaining = registry.standardize(remaining, field=field, mute=True, **kwargs)

    inspect_result = registry.inspect(remaining, field=field, mute=True, **kwargs)
    # Fold the separately validated excluded values back into the result.
    inspect_result._validated += include_validated
    inspect_result._non_validated = [
        item for item in inspect_result.non_validated if item not in include_validated
    ]
    return inspect_result
1071
+
1072
+
1073
def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
    """Return the organism filter a registry needs, if it needs one.

    Args:
        registry: The registry to check; it is organism-aware when it has an
            ``organism_id`` attribute.
        organism: The organism name; falls back to ``bt.settings.organism``.

    Returns:
        ``{"organism": <name>}`` for organism-aware registries, otherwise ``{}``.

    Raises:
        ValueError: If the registry needs an organism and none is passed or
            configured in ``bt.settings``.
    """
    if not hasattr(registry, "organism_id"):
        return {}

    # imported lazily so bionty is only required for organism-aware registries
    import bionty as bt

    if organism is None and bt.settings.organism is None:
        raise ValueError(
            f"{registry.__name__} registry requires an organism!\n"
            " please pass an organism name via organism="
        )
    return {"organism": organism or bt.settings.organism.name}
1085
+
1086
+
1087
def validate_categories(
    values: Iterable[str],
    field: FieldAttr,
    key: str,
    using_key: str | None = None,
    organism: str | None = None,
    source: Record | None = None,
    exclude: str | list | None = None,
    standardize: bool = True,
    validated_hint_print: str | None = None,
) -> tuple[bool, list]:
    """Validate values against a LaminDB registry field.

    Values are inspected in the default instance first. Whatever does not
    validate there is inspected in the ``using_key`` instance (if given) and
    then looked up in the public source (only for registries that have a
    ``public`` attribute).

    Args:
        values: The values to validate.
        field: The field attribute.
        key: The key referencing the slot in the DataFrame.
        using_key: A reference LaminDB instance.
        organism: The organism name.
        source: The source record.
        exclude: Exclude specific values.
        standardize: Standardize the values.
        validated_hint_print: The hint to print for validated values.

    Returns:
        A tuple ``(is_validated, non_validated)``:

        - ``(True, [])`` if every value validates in the default instance.
        - ``(False, [])`` if every value validates, but some only via the
          ``using_key`` instance or the public source and still need saving.
        - ``(False, non_validated)`` if some values validate nowhere.
    """
    from lamindb._from_values import _print_values
    from lamindb.core._settings import settings

    model_field = f"{field.field.model.__name__}.{field.field.name}"

    def _log_mapping_info():
        # print the header un-indented, then indent the messages below it
        logger.indent = ""
        logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}")
        logger.indent = " "

    registry = field.field.model

    kwargs = check_registry_organism(registry, organism)
    kwargs.update({"source": source} if source else {})
    kwargs_current = get_current_filter_kwargs(registry, kwargs)

    # inspect the default instance
    inspect_result = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry,
        standardize=standardize,
        exclude=exclude,
        **kwargs_current,
    )
    non_validated = inspect_result.non_validated

    # inspect the using instance
    values_validated = []
    if using_key is not None and using_key != "default" and non_validated:
        registry_using = get_registry_instance(registry, using_key)
        inspect_result = standardize_and_inspect(
            values=non_validated,
            field=field,
            registry=registry_using,
            standardize=standardize,
            exclude=exclude,
            **kwargs,
        )
        non_validated = inspect_result.non_validated
        values_validated += inspect_result.validated

    # inspect from public (bionty only); skipped when nothing is left to look up
    if hasattr(registry, "public") and non_validated:
        verbosity = settings.verbosity
        try:
            settings.verbosity = "error"
            public_records = registry.from_values(
                non_validated,
                field=field,
                **kwargs_current,
            )
            values_validated += [getattr(r, field.field.name) for r in public_records]
        finally:
            settings.verbosity = verbosity

    validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
    n_validated = len(values_validated)
    if n_validated > 0:
        _log_mapping_info()
        logger.warning(
            f"found {colors.yellow(n_validated)} validated terms: "
            f"{colors.yellow(values_validated)}\n → save terms via "
            f"{colors.yellow(validated_hint_print)}"
        )

    non_validated_hint_print = f".add_new_from('{key}')"
    non_validated = [i for i in non_validated if i not in values_validated]
    n_non_validated = len(non_validated)
    if n_non_validated == 0:
        # reset the indent on every exit path: a leftover indent would make the
        # `logger.indent == ""` check below skip the header for the next key
        logger.indent = ""
        if n_validated == 0:
            logger.success(f"{key} is validated against {colors.italic(model_field)}")
            return True, []
        # validated values still need to be saved to the current instance
        return False, []

    are = "are" if n_non_validated > 1 else "is"
    print_values = _print_values(non_validated)
    warning_message = (
        f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
        f"{colors.red(print_values)}\n → fix typos, remove non-existent values, or save terms via "
        f"{colors.red(non_validated_hint_print)}"
    )
    # the header is only missing if it was not printed for validated terms above
    if logger.indent == "":
        _log_mapping_info()
    logger.warning(warning_message)
    logger.indent = ""
    return False, non_validated
1201
+
1202
+
1203
def validate_categories_in_df(
    df: pd.DataFrame,
    fields: dict[str, FieldAttr],
    using_key: str | None = None,
    sources: dict[str, Record] = None,
    exclude: dict | None = None,
    **kwargs,
) -> tuple[bool, dict]:
    """Validate several DataFrame columns against their registry fields.

    Args:
        df: The DataFrame holding the columns to validate.
        fields: Mapping of column name to the registry field to validate against.
        using_key: A reference LaminDB instance.
        sources: Optional mapping of column name to source record.
        exclude: Optional mapping of column name to values to exclude.
        kwargs: Forwarded to ``validate_categories`` for every column.

    Returns:
        A tuple of the overall validation status and a dict mapping each column
        with non-validated values to those values.
    """
    # nothing to validate
    if not fields:
        return True, {}

    source_by_key = {} if sources is None else sources
    all_valid = True
    failures = {}
    for column, column_field in fields.items():
        column_exclude = exclude.get(column) if exclude else None
        column_valid, column_failures = validate_categories(
            df[column],
            field=column_field,
            key=column,
            using_key=using_key,
            source=source_by_key.get(column),
            exclude=column_exclude,
            **kwargs,
        )
        all_valid &= column_valid
        if len(column_failures) > 0:
            failures[column] = column_failures
    return all_valid, failures
1233
+
1234
+
1235
def save_artifact(
    data: pd.DataFrame | ad.AnnData | MuData,
    fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
    columns_field: FieldAttr | dict[str, FieldAttr],
    description: str | None = None,
    organism: str | None = None,
    adata: ad.AnnData | None = None,
    **kwargs,
) -> Artifact:
    """Save all metadata with an Artifact.

    Args:
        data: The DataFrame, AnnData or MuData object to save.
        fields: A dictionary mapping obs_column to registry_field. For MuData,
            a dictionary mapping each modality (or ``"obs"``) to such a dictionary.
        columns_field: The registry field to validate variables index against.
        description: A description of the artifact.
        organism: The organism name.
        adata: The AnnData object to save, must be provided if data is a path.
        kwargs: Additional keyword arguments passed to the ``Artifact.from_*``
            constructor.

    Returns:
        The saved Artifact.

    Raises:
        ValueError: If ``data`` is AnnData-like and ``adata`` is missing, or if
            ``data`` is not a DataFrame, AnnData or MuData object.
        NotImplementedError: If the artifact accessor is not supported.
    """
    from ._artifact import data_is_anndata

    artifact = None
    if data_is_anndata(data):
        # explicit check instead of `assert`: asserts are stripped under -O, and
        # the check has to run before the artifact is constructed
        if adata is None:
            raise ValueError("adata must be provided to save AnnData-like data.")
        artifact = Artifact.from_anndata(data, description=description, **kwargs)
        artifact.n_observations = adata.shape[0]
        data = adata

    elif isinstance(data, pd.DataFrame):
        artifact = Artifact.from_df(data, description=description, **kwargs)
    else:
        # mudata is optional; only the import itself may fail silently
        try:
            from mudata import MuData
        except ImportError:
            MuData = None
        if MuData is not None and isinstance(data, MuData):
            artifact = Artifact.from_mudata(data, description=description, **kwargs)
            artifact.n_observations = data.n_obs
    if artifact is None:
        raise ValueError("data must be a DataFrame, AnnData or MuData object.")
    artifact.save()

    feature_kwargs = check_registry_organism(
        (
            list(columns_field.values())[0].field.model
            if isinstance(columns_field, dict)
            else columns_field.field.model
        ),
        organism,
    )

    if artifact._accessor == "DataFrame":
        artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
    elif artifact._accessor == "AnnData":
        artifact.features._add_set_from_anndata(
            var_field=columns_field, **feature_kwargs
        )
    elif artifact._accessor == "MuData":
        artifact.features._add_set_from_mudata(
            var_fields=columns_field, **feature_kwargs
        )
    else:
        raise NotImplementedError

    # looked up once and shared by all `_add_labels` calls (one per modality)
    features = Feature.lookup().dict()

    def _add_labels(data, artifact: Artifact, fields: dict[str, FieldAttr]):
        # labels live in the frame itself or in `.obs` of an AnnData/MuData
        df = data if isinstance(data, pd.DataFrame) else data.obs
        for key, field in fields.items():
            feature = features.get(key)
            registry = field.field.model
            filter_kwargs = check_registry_organism(registry, organism)
            filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
            labels = registry.from_values(
                df[key],
                field=field,
                **filter_kwargs_current,
            )
            artifact.labels.add(labels, feature)

    if artifact._accessor == "MuData":
        for modality, modality_fields in fields.items():
            if modality == "obs":
                _add_labels(data, artifact, modality_fields)
            else:
                _add_labels(data[modality], artifact, modality_fields)
    else:
        _add_labels(data, artifact, fields)

    slug = ln_setup.settings.instance.slug
    if ln_setup.settings.instance.is_remote:  # pragma: no cover
        logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
    return artifact
1332
+
1333
+
1334
def update_registry(
    values: list[str],
    field: FieldAttr,
    key: str,
    save_function: str = "add_new_from",
    using_key: str | None = None,
    validated_only: bool = True,
    df: pd.DataFrame | None = None,
    organism: str | None = None,
    dtype: str | None = None,
    source: Record | None = None,
    standardize: bool = True,
    warning: bool = True,
    exclude: str | list | None = None,
    **kwargs,
) -> None:
    """Save features or labels records in the default instance from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        key: The name of the feature to save.
        save_function: The name of the function to save the labels.
        using_key: The name of the instance from which to transfer labels (if applicable).
        validated_only: If True, only save validated labels.
        df: A DataFrame to save labels from.
        organism: The organism name.
        dtype: The type of the feature.
        source: The source record.
        standardize: Standardize the values before inspecting the default instance.
        warning: Log non-saved values as a warning (True) or as info (False).
        exclude: Values that are inspected without source and organism filters.
        kwargs: Additional keyword arguments to pass to the registry model to create new records.

    Returns:
        Nothing in the general case.
        NOTE(review): if every value that is not a new public record already
        validates in the default instance, the function returns early with the
        result of ``registry.from_values`` and logs nothing; this early return
        is kept for backward compatibility despite the ``None`` annotation.
    """
    from lamindb._save import save as ln_save
    from lamindb.core._settings import settings

    registry = field.field.model
    filter_kwargs = check_registry_organism(registry, organism)
    filter_kwargs.update({"source": source} if source else {})

    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"

        # save from public
        filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
        existing_and_public_records = (
            registry.from_values(
                list(values),
                field=field,
                **filter_kwargs_current,
            )
            if values
            else []
        )

        labels_saved: dict = {"from public": [], "without reference": []}

        # records that are still "adding" are not yet in the current instance
        public_records = [r for r in existing_and_public_records if r._state.adding]
        # only save the public records if they are from the specified source;
        # compare uids because r.source and source can be from different instances
        if source:
            public_records = [r for r in public_records if r.source.uid == source.uid]
        ln_save(public_records)
        labels_saved["from public"] = [
            getattr(r, field.field.name) for r in public_records
        ]
        non_public_labels = [i for i in values if i not in labels_saved["from public"]]

        # inspect the default instance
        inspect_result_current = standardize_and_inspect(
            values=non_public_labels,
            field=field,
            registry=registry,
            standardize=standardize,
            exclude=exclude,
            **filter_kwargs_current,
        )
        if not inspect_result_current.non_validated:
            # verbosity is restored by the `finally` clause below
            return registry.from_values(
                inspect_result_current.validated,
                field=field,
                **filter_kwargs_current,
            )

        # inspect the using_key instance
        (
            labels_saved[f"from {using_key}"],
            non_validated_labels,
        ) = update_registry_from_using_instance(
            inspect_result_current.non_validated,
            field=field,
            using_key=using_key,
            exclude=exclude,
            **filter_kwargs,
        )

        labels_saved["without reference"] = [
            i
            for i in non_validated_labels
            if i not in labels_saved[f"from {using_key}"]
        ]

        # save non-validated records
        if not validated_only:
            non_validated_records = []
            if df is not None and registry == Feature:
                non_validated_records = Feature.from_df(df)
            else:
                if "organism" in filter_kwargs:
                    # make sure organism record is saved to the current instance;
                    # use the resolved name, because `organism` itself is None
                    # when the organism came from bt.settings
                    filter_kwargs["organism"] = _save_organism(
                        name=filter_kwargs["organism"]
                    )
                # loop-invariant parts of the constructor arguments
                registry_kwargs = {
                    k: v for k, v in filter_kwargs.items() if k != "source"
                }
                extra_kwargs = {k: v for k, v in kwargs.items() if k != "sources"}
                for value in labels_saved["without reference"]:
                    init_kwargs = {field.field.name: value}
                    if registry == Feature:
                        init_kwargs["dtype"] = "cat" if dtype is None else dtype
                    non_validated_records.append(
                        registry(**init_kwargs, **registry_kwargs, **extra_kwargs)
                    )
            ln_save(non_validated_records)

        # save parent labels for ulabels
        if registry == ULabel and field.field.name == "name":
            save_ulabels_with_parent(values, field=field, key=key)
    finally:
        settings.verbosity = verbosity

    log_saved_labels(
        labels_saved,
        key=key,
        save_function=save_function,
        model_field=f"{registry.__name__}.{field.field.name}",
        validated_only=validated_only,
        warning=warning,
    )
1483
+
1484
+
1485
def log_saved_labels(
    labels_saved: dict,
    key: str,
    save_function: str,
    model_field: str,
    validated_only: bool = True,
    warning: bool = True,
) -> None:
    """Log the saved labels.

    Args:
        labels_saved: Mapping of an origin (e.g. ``"from public"``,
            ``"without reference"``) to the labels of that origin.
        key: The name of the feature the labels belong to.
        save_function: The name of the function to suggest for saving
            non-validated values.
        model_field: The ``Registry.field`` string shown in the messages.
        validated_only: If True, ``"without reference"`` labels were not saved
            and are reported together with hints on how to save them.
        warning: Report non-saved values as a warning (True) or as info (False).
    """
    from ._from_values import _print_values

    model_field = colors.italic(model_field)
    for k, labels in labels_saved.items():
        # nothing to report for this origin
        if not labels:
            continue

        if k == "without reference" and validated_only:
            msg = colors.yellow(
                f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
            )
            # both forms start with a dot, like the other method hints
            lookup_print = (
                f".lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
            )

            hint = f".add_new_from('{key}')"
            msg += f"\n → to lookup values, use {lookup_print}"
            msg += (
                f"\n → to save, run {colors.yellow(hint)}"
                if save_function == "add_new_from"
                else f"\n → to save, run {colors.yellow(save_function)}"
            )
            if warning:
                logger.warning(msg)
            else:
                logger.info(msg)
        else:
            k = "" if k == "without reference" else f"{colors.green(k)} "
            # the term "transferred" stresses that this is always in the context of transferring
            # labels from a public ontology or a different instance to the present instance
            s = "s" if len(labels) > 1 else ""
            logger.success(
                f"added {len(labels)} record{s} {k}with {model_field} for {colors.italic(key)}: {_print_values(labels)}"
            )
1528
+
1529
+
1530
def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None:
    """Attach the labels for ``values`` as children of the ``is_<key>`` ULabel.

    The parent label is created and saved if it does not exist yet.

    Args:
        values: The label values to group under the parent.
        field: The ULabel field the values belong to.
        key: The feature name; the parent label is named ``is_<key>``.
    """
    model = field.field.model
    assert model == ULabel  # noqa: S101
    children = model.from_values(list(values), field=field)
    parent_name = f"is_{key}"
    parent = model.filter(name=parent_name).one_or_none()
    if parent is None:
        parent = model(name=parent_name)
        parent.save()
    parent.children.add(*children)
1540
+
1541
+
1542
def update_registry_from_using_instance(
    values: list[str],
    field: FieldAttr,
    using_key: str | None = None,
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
) -> tuple[list[str], list[str]]:
    """Save features or labels records from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        using_key: The name of the instance from which to transfer labels (if applicable).
        standardize: Whether to also standardize the values.
        exclude: Values that are inspected without the additional filters.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        A tuple containing the list of saved labels and the list of non-saved labels.
        Without a using instance, nothing is saved and ``values`` is returned as is.
    """
    # no instance to transfer from
    if using_key is None or using_key == "default":
        return [], values

    registry_using = get_registry_instance(field.field.model, using_key)
    inspection = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry_using,
        standardize=standardize,
        exclude=exclude,
        **kwargs,
    )

    field_name = field.field.name
    matches = registry_using.filter(
        **{f"{field_name}__in": inspection.validated}
    ).all()

    transferred = []
    for record in matches:
        record.save()
        transferred.append(getattr(record, field_name))

    return transferred, inspection.non_validated
1585
+
1586
+
1587
def _save_organism(name: str):  # pragma: no cover
    """Return the organism record for ``name``, saving it from its source if needed.

    Args:
        name: The organism name.

    Returns:
        The organism record: the existing one, or the one newly saved from source.

    Raises:
        ValueError: If the organism is neither in the registry nor available
            from source.
    """
    import bionty as bt

    record = bt.Organism.filter(name=name).one_or_none()
    if record is not None:
        return record

    # not in the registry yet: try to create it from source
    record = bt.Organism.from_source(name=name)
    if record is None:
        raise ValueError(
            f"Organism '{name}' not found\n"
            f" → please save it: bt.Organism(name='{name}').save()"
        )
    record.save()
    return record
1601
+
1602
+
1603
+ Curate = Curator # backward compat