lamindb 0.76.7__py3-none-any.whl → 0.76.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +113 -113
- lamindb/_artifact.py +1205 -1178
- lamindb/_can_validate.py +579 -579
- lamindb/_collection.py +387 -387
- lamindb/_curate.py +1601 -1601
- lamindb/_feature.py +155 -155
- lamindb/_feature_set.py +242 -242
- lamindb/_filter.py +23 -23
- lamindb/_finish.py +256 -256
- lamindb/_from_values.py +382 -382
- lamindb/_is_versioned.py +40 -40
- lamindb/_parents.py +476 -476
- lamindb/_query_manager.py +125 -125
- lamindb/_query_set.py +362 -362
- lamindb/_record.py +649 -649
- lamindb/_run.py +57 -57
- lamindb/_save.py +308 -295
- lamindb/_storage.py +14 -14
- lamindb/_transform.py +127 -127
- lamindb/_ulabel.py +56 -56
- lamindb/_utils.py +9 -9
- lamindb/_view.py +72 -72
- lamindb/core/__init__.py +94 -94
- lamindb/core/_context.py +574 -574
- lamindb/core/_data.py +438 -438
- lamindb/core/_feature_manager.py +867 -867
- lamindb/core/_label_manager.py +253 -253
- lamindb/core/_mapped_collection.py +597 -597
- lamindb/core/_settings.py +187 -187
- lamindb/core/_sync_git.py +138 -138
- lamindb/core/_track_environment.py +27 -27
- lamindb/core/datasets/__init__.py +59 -59
- lamindb/core/datasets/_core.py +571 -571
- lamindb/core/datasets/_fake.py +36 -36
- lamindb/core/exceptions.py +90 -77
- lamindb/core/fields.py +12 -12
- lamindb/core/loaders.py +164 -164
- lamindb/core/schema.py +56 -56
- lamindb/core/storage/__init__.py +25 -25
- lamindb/core/storage/_anndata_accessor.py +740 -740
- lamindb/core/storage/_anndata_sizes.py +41 -41
- lamindb/core/storage/_backed_access.py +98 -98
- lamindb/core/storage/_tiledbsoma.py +204 -204
- lamindb/core/storage/_valid_suffixes.py +21 -21
- lamindb/core/storage/_zarr.py +110 -110
- lamindb/core/storage/objects.py +62 -62
- lamindb/core/storage/paths.py +172 -141
- lamindb/core/subsettings/__init__.py +12 -12
- lamindb/core/subsettings/_creation_settings.py +38 -38
- lamindb/core/subsettings/_transform_settings.py +21 -21
- lamindb/core/types.py +19 -19
- lamindb/core/versioning.py +158 -158
- lamindb/integrations/__init__.py +12 -12
- lamindb/integrations/_vitessce.py +107 -107
- lamindb/setup/__init__.py +14 -14
- lamindb/setup/core/__init__.py +4 -4
- {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/LICENSE +201 -201
- {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/METADATA +3 -3
- lamindb-0.76.8.dist-info/RECORD +60 -0
- {lamindb-0.76.7.dist-info → lamindb-0.76.8.dist-info}/WHEEL +1 -1
- lamindb-0.76.7.dist-info/RECORD +0 -60
lamindb/_curate.py
CHANGED
@@ -1,1601 +1,1601 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import copy
|
4
|
-
from typing import TYPE_CHECKING, Iterable
|
5
|
-
|
6
|
-
import anndata as ad
|
7
|
-
import lamindb_setup as ln_setup
|
8
|
-
import pandas as pd
|
9
|
-
from lamin_utils import colors, logger
|
10
|
-
from lamindb_setup.core._docs import doc_args
|
11
|
-
from lnschema_core import (
|
12
|
-
Artifact,
|
13
|
-
Feature,
|
14
|
-
Record,
|
15
|
-
Run,
|
16
|
-
ULabel,
|
17
|
-
)
|
18
|
-
|
19
|
-
from .core.exceptions import ValidationError
|
20
|
-
|
21
|
-
if TYPE_CHECKING:
|
22
|
-
from lamindb_setup.core.types import UPathStr
|
23
|
-
from lnschema_core.types import FieldAttr
|
24
|
-
from mudata import MuData
|
25
|
-
|
26
|
-
|
27
|
-
class CurateLookup:
    """Lookup categories from the reference instance.

    Every key of ``categoricals`` and ``slots`` is reachable both as an
    attribute (``lookup.cell_type``) and as an item (``lookup["cell type"]``).
    Accessing a key returns ``.lookup()`` of the registry behind that field.

    Args:
        categoricals: Mapping of keys to registry fields.
        slots: Extra mapping of slot names to registry fields. On a key
            collision the entry from ``slots`` wins.
        using_key: Instance whose registries are looked up. ``"default"`` is
            normalized to ``None``; ``"public"`` looks up ``registry.public()``.
    """

    def __init__(
        self,
        categoricals: dict[str, FieldAttr],
        slots: dict[str, FieldAttr] | None = None,
        using_key: str | None = None,
    ) -> None:
        if slots is None:
            slots = {}
        self._fields = {**categoricals, **slots}
        self._using_key = None if using_key == "default" else using_key
        # Name shown in messages: the explicit key, else the current instance slug.
        self._using_key_name = self._using_key or ln_setup.settings.instance.slug
        logger.debug(f"Lookup objects from the {colors.italic(self._using_key_name)}")

    def _lookup(self, name):
        """Return the lookup object registered under ``name``.

        Raises:
            AttributeError: If ``name`` is not one of the registered keys.
        """
        # Read state through ``__dict__`` instead of ``self._fields``: when the
        # attribute is not set yet (instance created via ``__new__``, e.g. while
        # ``copy`` or ``pickle`` rebuild it), ``self._fields`` would re-enter
        # ``__getattr__`` and recurse until ``RecursionError``.
        state = self.__dict__
        fields = state.get("_fields", {})
        if name in fields:
            registry = fields[name].field.model
            using_key = state.get("_using_key")
            if using_key == "public":
                return registry.public().lookup()
            return get_registry_instance(registry, using_key).lookup()
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __getattr__(self, name):
        """Resolve ``lookup.<name>``; only called when normal lookup fails."""
        return self._lookup(name)

    def __getitem__(self, name):
        """Resolve ``lookup[<name>]`` for keys that are not valid identifiers.

        Raises ``AttributeError`` (not ``KeyError``) for unknown keys, which is
        the exception type this method has always raised.
        """
        return self._lookup(name)

    def __repr__(self) -> str:
        """List the available keys, split by attribute and item access."""
        if len(self._fields) > 0:
            getattr_keys = "\n ".join(
                [f".{key}" for key in self._fields if key.isidentifier()]
            )
            getitem_keys = "\n ".join(
                [str([key]) for key in self._fields if not key.isidentifier()]
            )
            return (
                f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
                f"{colors.green(getattr_keys)}\n "
                f"{colors.green(getitem_keys)}\n\n"
                "Example:\n → categories = validator.lookup().cell_type\n"
                " → categories.alveolar_type_1_fibroblast_cell"
            )
        else:  # pragma: no cover
            return colors.warning("No fields are found!")
|
85
|
-
|
86
|
-
|
87
|
-
class BaseCurator:
    """Curate a dataset.

    Both methods are no-op placeholders here and return ``None``.
    """

    def validate(self) -> bool:
        """Validate dataset.

        Returns:
            Boolean indicating whether the dataset is validated.
        """
        return None

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the dataset as artifact.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.
        """
        return None
|
109
|
-
|
110
|
-
|
111
|
-
class DataFrameCurator(BaseCurator):
    """Curation flow for a DataFrame object.

    See also :class:`~lamindb.Curator`.

    Args:
        df: The DataFrame object to curate.
        columns: The field attribute for the feature column.
        categoricals: A dictionary mapping column names to registry_field.
        using_key: The reference instance containing registries to validate against.
        verbosity: The verbosity level; assigned to ``settings.verbosity``.
        organism: The organism name.
        sources: A dictionary mapping column names to Source records.
        exclude: A dictionary mapping column names to values to exclude.
        check_valid_keys: Whether to check the keys of ``categoricals``,
            ``sources`` and ``exclude`` against the DataFrame columns.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_df(
        ...     df,
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     }
        ... )
    """

    def __init__(
        self,
        df: pd.DataFrame,
        columns: FieldAttr = Feature.name,
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
        check_valid_keys: bool = True,
    ) -> None:
        from lamindb.core._settings import settings

        # Inputs.
        self._df = df
        self._fields = categoricals or {}
        self._columns_field = columns
        self._using_key = using_key
        settings.verbosity = verbosity
        # State filled in by validate() / save_artifact().
        self._artifact = None
        self._collection = None
        self._validated = False
        self._kwargs = {"organism": organism} if organism else {}
        self._sources = {} if sources is None else sources
        self._exclude = {} if exclude is None else exclude
        self._non_validated = None
        if check_valid_keys:
            self._check_valid_keys()
        self._save_columns()

    @property
    def non_validated(self) -> list:
        """Return the non-validated features and labels."""
        result = self._non_validated
        if result is None:
            raise ValueError("Please run validate() first!")
        return result

    @property
    def fields(self) -> dict:
        """Return the columns fields to validate against."""
        return self._fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        effective_key = using_key or self._using_key
        return CurateLookup(
            categoricals=self._fields,
            slots={"columns": self._columns_field},
            using_key=effective_key,
        )

    def _check_valid_keys(self, extra: set = None) -> None:
        """Check that ``categoricals``, ``sources`` and ``exclude`` only use allowed keys.

        Allowed keys are the DataFrame columns, ``"columns"`` and ``extra``.

        Raises:
            TypeError: If one of the three mappings is not a dictionary.
            ValueError: If a mapping contains a key that is not allowed.
        """
        if extra is None:
            extra = set()
        named_mappings = {
            "categoricals": self._fields,
            "sources": self._sources,
            "exclude": self._exclude,
        }
        for name, mapping in named_mappings.items():
            if not isinstance(mapping, dict):
                raise TypeError(f"{name} must be a dictionary!")
            valid_keys = set(self._df.columns) | {"columns"} | extra
            nonval_keys = [key for key in mapping.keys() if key not in valid_keys]
            if len(nonval_keys) > 0:
                raise ValueError(
                    f"the following keys passed to {name} are not allowed: {nonval_keys}"
                )

    def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
        """Save column name records."""
        columns_source = self._sources.get("columns")
        columns_exclude = self._exclude.get("columns")

        # Column names used as categoricals keys are always saved.
        update_registry(
            values=list(self.fields.keys()),
            field=self._columns_field,
            key="columns",
            save_function="add_new_from_columns",
            using_key=self._using_key,
            validated_only=False,
            source=columns_source,
            exclude=columns_exclude,
            **kwargs,
        )

        # The remaining columns are saved according to validated_only.
        additional_columns = set(self._df.columns) - set(self.fields.keys())
        if not additional_columns:
            return
        update_registry(
            values=list(additional_columns),
            field=self._columns_field,
            key="columns",
            save_function="add_new_from_columns",
            using_key=self._using_key,
            validated_only=validated_only,
            df=self._df,  # Get the Feature type from df
            source=columns_source,
            exclude=columns_exclude,
            warning=False,  # Do not warn about missing columns, just an info message
            **kwargs,
        )

    def add_validated_from(self, key: str, organism: str | None = None):
        """Add validated categories.

        Args:
            key: The key referencing the slot in the DataFrame.
            organism: The organism name.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._update_registry(key, validated_only=True, **self._kwargs)

    def add_new_from(self, key: str, organism: str | None = None, **kwargs):
        """Add validated & new categories.

        Args:
            key: The key referencing the slot in the DataFrame from which to draw terms.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if key == "all" and len(kwargs) > 0:
            raise ValueError("Cannot pass additional arguments to 'all' key!")
        if organism:
            self._kwargs["organism"] = organism
        self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)

    def add_new_from_columns(self, organism: str | None = None, **kwargs):
        """Add validated & new column names to its registry.

        Args:
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._save_columns(validated_only=False, **self._kwargs, **kwargs)

    def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
        """Save records for one key: ``"all"``, ``"columns"`` or a categorical.

        Raises:
            ValueError: If ``categorical`` is none of the above.
        """
        if categorical == "all":
            self._update_registry_all(validated_only=validated_only, **kwargs)
            return
        if categorical == "columns":
            self._save_columns(validated_only=validated_only, **kwargs)
            return
        if categorical not in self.fields:
            raise ValueError(f"Feature {categorical} is not part of the fields!")
        unique_values = self._df[categorical].unique().tolist()
        update_registry(
            values=unique_values,
            field=self.fields[categorical],
            key=categorical,
            using_key=self._using_key,
            validated_only=validated_only,
            source=self._sources.get(categorical),
            exclude=self._exclude.get(categorical),
            **kwargs,
        )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Save labels for all features."""
        for feature_name in self.fields.keys():
            logger.info(f"saving labels for '{feature_name}'")
            self._update_registry(feature_name, validated_only=validated_only, **kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate variables and categorical observations.

        Args:
            organism: The organism name.

        Returns:
            Whether the DataFrame is validated.
        """
        if organism:
            self._kwargs["organism"] = organism
        outcome = validate_categories_in_df(  # type: ignore
            self._df,
            fields=self.fields,
            using_key=self._using_key,
            sources=self._sources,
            exclude=self._exclude,
            **self._kwargs,
        )
        self._validated, self._non_validated = outcome
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated DataFrame and metadata.

        Runs ``validate()`` first if the dataset is not validated yet.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If the dataset still does not validate.
        """
        from lamindb.core._settings import settings

        if not self._validated:
            self.validate()
            if not self._validated:
                raise ValidationError("Dataset does not validate. Please curate.")

        # Make sure all labels are saved in the current instance; the previous
        # verbosity is restored even if saving fails.
        previous_verbosity = settings.verbosity
        try:
            settings.verbosity = "warning"
            # save all validated records to the current instance
            self.add_validated_from("all")

            self._artifact = save_artifact(
                self._df,
                description=description,
                fields=self.fields,
                columns_field=self._columns_field,
                **kwargs,
                **self._kwargs,
            )
        finally:
            settings.verbosity = previous_verbosity

        return self._artifact

    def clean_up_failed_runs(self):
        """Clean up previous failed runs that don't save any outputs."""
        from lamindb.core._context import context

        current_run = context.run
        if current_run is None:
            return
        stale_runs = Run.filter(
            transform=current_run.transform, output_artifacts=None
        ).exclude(uid=current_run.uid)
        stale_runs.delete()
|
369
|
-
|
370
|
-
|
371
|
-
class AnnDataCurator(DataFrameCurator):
    """Curation flow for ``AnnData``.

    See also :class:`~lamindb.Curator`.

    Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.

    See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.

    Args:
        data: The AnnData object or an AnnData-like path.
        var_index: The registry field for mapping the ``.var`` index.
        categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
        obs_columns: The registry field for the ``.obs`` column names.
        using_key: A reference LaminDB instance.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping ``.obs.columns`` to Source records.
        exclude: A dictionary mapping column names to values to exclude.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_anndata(
        ...     adata,
        ...     var_index=bt.Gene.ensembl_gene_id,
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     },
        ...     organism="human",
        ... )
    """

    def __init__(
        self,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> None:
        from lamindb_setup.core import upath

        from ._artifact import data_is_anndata

        if sources is None:
            sources = {}
        if not data_is_anndata(data):
            raise ValueError(
                "data has to be an AnnData object or a path to AnnData-like"
            )
        if isinstance(data, ad.AnnData):
            self._adata = data
        else:  # pragma: no cover
            from lamindb.core.storage._backed_access import backed_access

            self._adata = backed_access(upath.create_path(data))

        self._data = data
        self._var_field = var_index
        # Key checking is deferred: the parent check would reject "var_index",
        # so it is re-run below with that key allowed.
        super().__init__(
            df=self._adata.obs,
            categoricals=categoricals,
            columns=obs_columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
            check_valid_keys=False,
        )
        self._obs_fields = categoricals or {}
        self._check_valid_keys(extra={"var_index"})

    @property
    def var_index(self) -> FieldAttr:
        """Return the registry field to validate variables index against."""
        return self._var_field

    @property
    def categoricals(self) -> dict:
        """Return the obs fields to validate against."""
        return self._obs_fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        return CurateLookup(
            categoricals=self._obs_fields,
            slots={"columns": self._columns_field, "var_index": self._var_field},
            using_key=using_key or self._using_key,
        )

    def _save_from_var_index(
        self, validated_only: bool = True, organism: str | None = None, **kwargs
    ):
        """Save variable records.

        Args:
            validated_only: Forwarded to ``update_registry``.
            organism: The organism name.
            **kwargs: Additional keyword arguments forwarded to
                ``update_registry``. Accepted so that the ``**kwargs`` passed by
                ``add_new_from_var_index`` and ``_update_registry_all`` no
                longer raise ``TypeError``.
        """
        update_registry(
            values=list(self._adata.var.index),
            field=self.var_index,
            key="var_index",
            save_function="add_new_from_var_index",
            using_key=self._using_key,
            validated_only=validated_only,
            organism=organism,
            source=self._sources.get("var_index"),
            exclude=self._exclude.get("var_index"),
            **kwargs,
        )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Save labels for all features."""
        for name in self.fields.keys():
            logger.info(f"saving labels for '{name}'")
            if name == "var_index":
                self._save_from_var_index(validated_only=validated_only, **kwargs)
            else:
                self._update_registry(name, validated_only=validated_only, **kwargs)

    def add_new_from_var_index(self, organism: str | None = None, **kwargs):
        """Update variable records.

        Args:
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)

    def add_validated_from_var_index(self, organism: str | None = None):
        """Add validated variable records.

        Args:
            organism: The organism name.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        self._save_from_var_index(validated_only=True, **self._kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate categories.

        Validates the ``.var`` index and the ``.obs`` categoricals; the result
        is the conjunction of both.

        Args:
            organism: The organism name.

        Returns:
            Whether the AnnData object is validated.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        if self._using_key is not None and self._using_key != "default":
            logger.important(
                f"validating metadata using registries of instance {colors.italic(self._using_key)}"
            )

        validated_var, non_validated_var = validate_categories(
            self._adata.var.index,
            field=self._var_field,
            key="var_index",
            using_key=self._using_key,
            source=self._sources.get("var_index"),
            validated_hint_print=".add_validated_from_var_index()",
            exclude=self._exclude.get("var_index"),
            **self._kwargs,  # type: ignore
        )
        validated_obs, non_validated_obs = validate_categories_in_df(
            self._adata.obs,
            fields=self.categoricals,
            using_key=self._using_key,
            sources=self._sources,
            exclude=self._exclude,
            **self._kwargs,
        )
        self._non_validated = non_validated_obs  # type: ignore
        # Non-validated var entries are reported under the "var_index" key.
        if len(non_validated_var) > 0:
            self._non_validated["var_index"] = non_validated_var  # type: ignore
        self._validated = validated_var and validated_obs
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated ``AnnData`` and metadata.

        Runs ``validate()`` first if the object is not validated yet.

        Args:
            description: Description of the ``AnnData`` object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If the dataset still does not validate.
        """
        if not self._validated:
            self.validate()
            if not self._validated:
                raise ValidationError("Dataset does not validate. Please curate.")

        self._artifact = save_artifact(
            self._data,
            adata=self._adata,
            description=description,
            columns_field=self.var_index,
            fields=self.categoricals,
            **self._kwargs,
            **kwargs,
        )
        return self._artifact
|
580
|
-
|
581
|
-
|
582
|
-
class MuDataCurator:
|
583
|
-
"""Curation flow for a ``MuData`` object.
|
584
|
-
|
585
|
-
See also :class:`~lamindb.Curator`.
|
586
|
-
|
587
|
-
Note that if genes or other measurements are removed from the MuData object,
|
588
|
-
the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
|
589
|
-
|
590
|
-
Args:
|
591
|
-
mdata: The MuData object to curate.
|
592
|
-
var_index: The registry field for mapping the ``.var`` index for each modality.
|
593
|
-
For example:
|
594
|
-
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
|
595
|
-
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
596
|
-
Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
|
597
|
-
using_key: A reference LaminDB instance.
|
598
|
-
verbosity: The verbosity level.
|
599
|
-
organism: The organism name.
|
600
|
-
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
601
|
-
exclude: A dictionary mapping column names to values to exclude.
|
602
|
-
|
603
|
-
Examples:
|
604
|
-
>>> import bionty as bt
|
605
|
-
>>> curate = ln.Curator.from_mudata(
|
606
|
-
... mdata,
|
607
|
-
... var_index={
|
608
|
-
... "rna": bt.Gene.ensembl_gene_id,
|
609
|
-
... "adt": ln.CellMarker.name
|
610
|
-
... },
|
611
|
-
... categoricals={
|
612
|
-
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
613
|
-
... "donor_id": ln.ULabel.name
|
614
|
-
... },
|
615
|
-
... organism="human",
|
616
|
-
... )
|
617
|
-
"""
|
618
|
-
|
619
|
-
def __init__(
|
620
|
-
self,
|
621
|
-
mdata: MuData,
|
622
|
-
var_index: dict[str, dict[str, FieldAttr]],
|
623
|
-
categoricals: dict[str, FieldAttr] | None = None,
|
624
|
-
using_key: str = "default",
|
625
|
-
verbosity: str = "hint",
|
626
|
-
organism: str | None = None,
|
627
|
-
sources: dict[str, Record] | None = None,
|
628
|
-
exclude: dict | None = None,
|
629
|
-
) -> None:
|
630
|
-
if sources is None:
|
631
|
-
sources = {}
|
632
|
-
self._sources = sources
|
633
|
-
if exclude is None:
|
634
|
-
exclude = {}
|
635
|
-
self._exclude = exclude
|
636
|
-
self._mdata = mdata
|
637
|
-
self._kwargs = {"organism": organism} if organism else {}
|
638
|
-
self._var_fields = var_index
|
639
|
-
self._verify_modality(self._var_fields.keys())
|
640
|
-
self._obs_fields = self._parse_categoricals(categoricals)
|
641
|
-
self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
|
642
|
-
self._using_key = using_key
|
643
|
-
self._verbosity = verbosity
|
644
|
-
self._df_annotators = {
|
645
|
-
modality: DataFrameCurator(
|
646
|
-
df=mdata[modality].obs if modality != "obs" else mdata.obs,
|
647
|
-
categoricals=self._obs_fields.get(modality, {}),
|
648
|
-
using_key=using_key,
|
649
|
-
verbosity=verbosity,
|
650
|
-
sources=self._sources.get(modality),
|
651
|
-
exclude=self._exclude.get(modality),
|
652
|
-
check_valid_keys=False,
|
653
|
-
**self._kwargs,
|
654
|
-
)
|
655
|
-
for modality in self._modalities
|
656
|
-
}
|
657
|
-
for modality in self._var_fields.keys():
|
658
|
-
self._save_from_var_index_modality(
|
659
|
-
modality=modality, validated_only=True, **self._kwargs
|
660
|
-
)
|
661
|
-
|
662
|
-
@property
|
663
|
-
def var_index(self) -> FieldAttr:
|
664
|
-
"""Return the registry field to validate variables index against."""
|
665
|
-
return self._var_fields
|
666
|
-
|
667
|
-
@property
|
668
|
-
def categoricals(self) -> dict:
|
669
|
-
"""Return the obs fields to validate against."""
|
670
|
-
return self._obs_fields
|
671
|
-
|
672
|
-
def _verify_modality(self, modalities: Iterable[str]):
|
673
|
-
"""Verify the modality exists."""
|
674
|
-
for modality in modalities:
|
675
|
-
if modality not in self._mdata.mod.keys():
|
676
|
-
raise ValueError(f"modality '{modality}' does not exist!")
|
677
|
-
|
678
|
-
def _save_from_var_index_modality(
|
679
|
-
self, modality: str, validated_only: bool = True, **kwargs
|
680
|
-
):
|
681
|
-
"""Save variable records."""
|
682
|
-
update_registry(
|
683
|
-
values=list(self._mdata[modality].var.index),
|
684
|
-
field=self._var_fields[modality],
|
685
|
-
key="var_index",
|
686
|
-
save_function="add_new_from_var_index",
|
687
|
-
using_key=self._using_key,
|
688
|
-
validated_only=validated_only,
|
689
|
-
dtype="number",
|
690
|
-
source=self._sources.get(modality, {}).get("var_index"),
|
691
|
-
exclude=self._exclude.get(modality, {}).get("var_index"),
|
692
|
-
**kwargs,
|
693
|
-
)
|
694
|
-
|
695
|
-
def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
|
696
|
-
"""Parse the categorical fields."""
|
697
|
-
prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
|
698
|
-
obs_fields: dict[str, dict[str, FieldAttr]] = {}
|
699
|
-
for k, v in categoricals.items():
|
700
|
-
if k not in self._mdata.obs.columns:
|
701
|
-
raise ValueError(f"column '{k}' does not exist in mdata.obs!")
|
702
|
-
if any(k.startswith(prefix) for prefix in prefixes):
|
703
|
-
modality, col = k.split(":")[0], k.split(":")[1]
|
704
|
-
if modality not in obs_fields.keys():
|
705
|
-
obs_fields[modality] = {}
|
706
|
-
obs_fields[modality][col] = v
|
707
|
-
else:
|
708
|
-
if "obs" not in obs_fields.keys():
|
709
|
-
obs_fields["obs"] = {}
|
710
|
-
obs_fields["obs"][k] = v
|
711
|
-
return obs_fields
|
712
|
-
|
713
|
-
def lookup(self, using_key: str | None = None) -> CurateLookup:
|
714
|
-
"""Lookup categories.
|
715
|
-
|
716
|
-
Args:
|
717
|
-
using_key: The instance where the lookup is performed.
|
718
|
-
if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
|
719
|
-
if "public", the lookup is performed on the public reference.
|
720
|
-
"""
|
721
|
-
return CurateLookup(
|
722
|
-
categoricals=self._obs_fields,
|
723
|
-
slots={
|
724
|
-
**self._obs_fields,
|
725
|
-
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
726
|
-
},
|
727
|
-
using_key=using_key or self._using_key,
|
728
|
-
)
|
729
|
-
|
730
|
-
def add_new_from_columns(
|
731
|
-
self,
|
732
|
-
modality: str,
|
733
|
-
column_names: list[str] | None = None,
|
734
|
-
organism: str | None = None,
|
735
|
-
**kwargs,
|
736
|
-
):
|
737
|
-
"""Update columns records.
|
738
|
-
|
739
|
-
Args:
|
740
|
-
modality: The modality name.
|
741
|
-
column_names: The column names to save.
|
742
|
-
organism: The organism name.
|
743
|
-
**kwargs: Additional keyword arguments to pass to the registry model.
|
744
|
-
"""
|
745
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
746
|
-
values = column_names or self._mdata[modality].obs.columns
|
747
|
-
update_registry(
|
748
|
-
values=list(values),
|
749
|
-
field=Feature.name,
|
750
|
-
key=f"{modality} obs columns",
|
751
|
-
using_key=self._using_key,
|
752
|
-
validated_only=False,
|
753
|
-
df=self._mdata[modality].obs,
|
754
|
-
source=self._sources.get(modality, {}).get("columns"),
|
755
|
-
exclude=self._exclude.get(modality, {}).get("columns"),
|
756
|
-
**self._kwargs, # type: ignore
|
757
|
-
**kwargs,
|
758
|
-
)
|
759
|
-
|
760
|
-
def add_new_from_var_index(
    self, modality: str, organism: str | None = None, **kwargs
):
    """Save validated and new variable records of one modality.

    Args:
        modality: The modality name.
        organism: The organism name; remembered on the curator when given.
        **kwargs: Additional keyword arguments to pass to the registry model.
    """
    if organism:
        self._kwargs["organism"] = organism
    # validated_only=False: non-validated variables are saved as well
    forwarded = {**self._kwargs, **kwargs} if not (self._kwargs.keys() & kwargs.keys()) else None
    if forwarded is None:
        # overlapping keys: keep the original call shape so the same TypeError surfaces
        self._save_from_var_index_modality(
            modality=modality, validated_only=False, **self._kwargs, **kwargs
        )
        return
    self._save_from_var_index_modality(
        modality=modality, validated_only=False, **forwarded
    )
|
774
|
-
|
775
|
-
def add_validated_from_var_index(self, modality: str, organism: str | None = None):
    """Save only the validated variable records of one modality.

    Args:
        modality: The modality name.
        organism: The organism name; remembered on the curator when given.
    """
    if organism:
        self._kwargs["organism"] = organism
    save_kwargs = dict(self._kwargs)
    self._save_from_var_index_modality(
        modality=modality, validated_only=True, **save_kwargs
    )
|
786
|
-
|
787
|
-
def add_validated_from(
    self, key: str, modality: str | None = None, organism: str | None = None
):
    """Add validated categories via the DataFrame annotator of a modality.

    Args:
        key: The key referencing the slot in the DataFrame.
        modality: The modality name; falls back to "obs" when not given.
        organism: The organism name; remembered on the curator when given.
    """
    if organism:
        self._kwargs["organism"] = organism
    target = modality if modality else "obs"
    # nothing happens for a modality without a registered annotator
    if target not in self._df_annotators:
        return
    self._df_annotators[target].add_validated_from(key=key, **self._kwargs)
|
802
|
-
|
803
|
-
def add_new_from(
    self,
    key: str,
    modality: str | None = None,
    organism: str | None = None,
    **kwargs,
):
    """Add validated & new categories via the DataFrame annotator of a modality.

    Args:
        key: The key referencing the slot in the DataFrame.
        modality: The modality name; falls back to "obs" when not given.
        organism: The organism name; remembered on the curator when given.
        **kwargs: Additional keyword arguments to pass to the registry model.

    Raises:
        ValueError: If extra keyword arguments are combined with ``key="all"``.
    """
    if kwargs and key == "all":
        raise ValueError("Cannot pass additional arguments to 'all' key!")
    if organism:
        self._kwargs["organism"] = organism
    target = modality if modality else "obs"
    # nothing happens for a modality without a registered annotator
    if target not in self._df_annotators:
        return
    self._df_annotators[target].add_new_from(key=key, **self._kwargs, **kwargs)
|
825
|
-
|
826
|
-
def validate(self, organism: str | None = None) -> bool:
    """Validate the var index and the obs categories of every modality.

    Args:
        organism: The organism name; remembered on the curator when given.

    Returns:
        True only if every var index and every obs group validated.
    """
    if organism:
        self._kwargs["organism"] = organism
    if not (self._using_key is None or self._using_key == "default"):
        logger.important(
            f"validating metadata using registries of instance {colors.italic(self._using_key)}"
        )

    # 1) var index of each modality
    all_var_valid = True
    var_issues = {}
    for modality, var_field in self._var_fields.items():
        var_ok, var_missing = validate_categories(
            self._mdata[modality].var.index,
            field=var_field,
            key=f"{modality}_var_index",
            using_key=self._using_key,
            source=self._sources.get(modality, {}).get("var_index"),
            exclude=self._exclude.get(modality, {}).get("var_index"),
            **self._kwargs,  # type: ignore
        )
        all_var_valid &= var_ok
        if len(var_missing) > 0:
            var_issues[modality] = var_missing

    # 2) obs columns; the "obs" pseudo-modality is the top-level obs table
    all_obs_valid = True
    for modality, fields in self._obs_fields.items():
        if modality == "obs":
            obs_frame = self._mdata.obs
        else:
            obs_frame = self._mdata[modality].obs
        obs_ok, obs_missing = validate_categories_in_df(
            obs_frame,
            fields=fields,
            using_key=self._using_key,
            sources=self._sources.get(modality),
            exclude=self._exclude.get(modality),
            **self._kwargs,
        )
        all_obs_valid &= obs_ok
        if modality in var_issues:
            obs_missing["var_index"] = var_issues[modality]
        # NOTE(review): this stores only the dict of the last modality that has
        # non-validated entries, and var-index issues of modalities without obs
        # fields never reach it -- confirm against the `non_validated` consumer.
        if len(obs_missing) > 0:
            self._non_validated = obs_missing

    self._validated = all_var_valid and all_obs_valid
    return self._validated
|
874
|
-
|
875
|
-
def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
    """Save the validated ``MuData`` and its metadata as an artifact.

    Args:
        description: Description of the ``MuData`` object.
        **kwargs: Object level metadata.

    Returns:
        A saved artifact record.

    Raises:
        ValidationError: If the last validation did not succeed.
    """
    if not self._validated:
        raise ValidationError("Please run `validate()` first!")

    # keyword arguments stored on the curator go first, then the per-call ones
    save_options = dict(
        description=description,
        columns_field=self.var_index,
        fields=self.categoricals,
    )
    self._artifact = save_artifact(
        self._mdata,
        **save_options,
        **self._kwargs,
        **kwargs,
    )
    return self._artifact
|
897
|
-
|
898
|
-
|
899
|
-
class Curator(BaseCurator):
    """Dataset curator.

    Data curation entails accurately labeling datasets with standardized metadata
    to facilitate data integration, interpretation and analysis.

    The curation flow has several steps:

    1. Instantiate `Curator` from one of the following dataset objects:

    - :meth:`~lamindb.Curator.from_df`
    - :meth:`~lamindb.Curator.from_anndata`
    - :meth:`~lamindb.Curator.from_mudata`

    During object creation, any passed categoricals found in the object will be saved.

    2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:

    - Values that can be successfully validated and already exist in the registry.
    - Values which are new and not yet validated or potentially problematic values.

    3. Determine how to handle validated and non-validated values:

    - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
    - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
    - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
    """

    # The factory docstrings below are "{}" templates that `doc_args` is given
    # the concrete curator's docstring for.

    @classmethod
    @doc_args(DataFrameCurator.__doc__)
    def from_df(
        cls,
        df: pd.DataFrame,
        categoricals: dict[str, FieldAttr] | None = None,
        columns: FieldAttr = Feature.name,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
    ) -> DataFrameCurator:
        """{}"""  # noqa: D415
        curator = DataFrameCurator(
            df=df,
            categoricals=categoricals,
            columns=columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
        )
        return curator

    @classmethod
    @doc_args(AnnDataCurator.__doc__)
    def from_anndata(
        cls,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
    ) -> AnnDataCurator:
        """{}"""  # noqa: D415
        curator = AnnDataCurator(
            data=data,
            var_index=var_index,
            categoricals=categoricals,
            obs_columns=obs_columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
        )
        return curator

    @classmethod
    @doc_args(MuDataCurator.__doc__)
    def from_mudata(
        cls,
        mdata: MuData,
        var_index: dict[str, dict[str, FieldAttr]],
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
    ) -> MuDataCurator:
        """{}"""  # noqa: D415
        curator = MuDataCurator(
            mdata=mdata,
            var_index=var_index,
            categoricals=categoricals,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
        )
        return curator
|
993
|
-
|
994
|
-
|
995
|
-
def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
    """Return ``registry``, bound via ``.using()`` when another instance is named.

    Args:
        registry: The registry to query.
        using_key: Instance key; ``None`` and ``"default"`` leave the registry as is.
    """
    if using_key is None or using_key == "default":
        return registry
    return registry.using(using_key)
|
1000
|
-
|
1001
|
-
|
1002
|
-
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
    """Make sure the source and organism are saved in the same database as the registry.

    Args:
        registry: The registry whose database is targeted.
        kwargs: Filter keyword arguments; only the ``"organism"`` and
            ``"source"`` entries are inspected. The dict itself is not modified.

    Returns:
        A shallow copy of ``kwargs``. If the registry queries the default
        database and an ``organism``/``source`` record lives in a different
        database, that entry is replaced by a copy saved to the default database.
    """
    from lamindb.core._settings import settings

    db = registry.filter().db
    targets_default_db = db is None or db == "default"
    filter_kwargs = kwargs.copy()

    # Capture the verbosity *before* entering the try block so that `finally`
    # can always restore it (a failure while reading it would otherwise be
    # masked by a NameError raised from the `finally` clause).
    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"
        # organism first, then source
        for name in ("organism", "source"):
            record = kwargs.get(name)
            if not isinstance(record, Record) or record._state.db == "default":
                continue
            if targets_default_db:
                record_default = copy.copy(record)
                # save the record in the default database
                record_default.save()
                filter_kwargs[name] = record_default
    finally:
        settings.verbosity = verbosity
    return filter_kwargs
|
1028
|
-
|
1029
|
-
|
1030
|
-
def standardize_and_inspect(
    values: Iterable[str],
    field: FieldAttr,
    registry: type[Record],
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
):
    """Standardize and inspect values using a registry.

    Args:
        values: The values to inspect.
        field: The field the values are matched against.
        registry: The registry providing ``inspect`` (and optionally ``standardize``).
        standardize: Whether to standardize the values before inspecting them;
            only applied if the registry has both ``standardize`` and ``synonyms``.
        exclude: Value(s) that are inspected separately, without ``kwargs``
            (i.e. without source and organism).
        **kwargs: Passed on to ``registry.standardize`` and ``registry.inspect``.

    Returns:
        The inspect result of the registry, with the validated ``exclude``
        values appended to its validated list and removed from its
        non-validated list.
    """
    values = list(values)
    include_validated: list = []
    # set used for O(1) membership tests; filled together with include_validated
    include_validated_lookup: set = set()

    # inspect exclude values in the default instance
    if exclude is not None:
        exclude = [exclude] if isinstance(exclude, str) else exclude
        present = set(values)
        exclude = [i for i in exclude if i in present]
        if exclude:
            # exclude values are validated without source and organism
            inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
            include_validated = inspect_result_exclude.validated
            include_validated_lookup = set(include_validated)
            # if exclude values are validated, remove them from the values
            values = [i for i in values if i not in include_validated_lookup]

    if (
        standardize
        and hasattr(registry, "standardize")
        # https://github.com/laminlabs/lamindb/issues/1685
        and hasattr(registry, "synonyms")
    ):
        values = registry.standardize(values, field=field, mute=True, **kwargs)

    inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
    inspect_result._validated += include_validated
    inspect_result._non_validated = [
        i for i in inspect_result.non_validated if i not in include_validated_lookup
    ]

    return inspect_result
|
1069
|
-
|
1070
|
-
|
1071
|
-
def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
    """Return the organism filter for registries that have an ``organism_id``.

    Args:
        registry: The registry to check.
        organism: The organism name; falls back to ``bionty.settings.organism``.

    Returns:
        ``{"organism": <name>}`` if the registry has an ``organism_id``
        attribute, otherwise an empty dict.

    Raises:
        ValueError: If an organism is needed but neither passed nor configured.
    """
    if not hasattr(registry, "organism_id"):
        return {}

    # imported lazily: only registries with an organism need bionty
    import bionty as bt

    if organism is None and bt.settings.organism is None:
        raise ValueError(
            f"{registry.__name__} registry requires an organism!\n"
            " → please pass an organism name via organism="
        )
    return {"organism": organism or bt.settings.organism.name}
|
1083
|
-
|
1084
|
-
|
1085
|
-
def validate_categories(
    values: Iterable[str],
    field: FieldAttr,
    key: str,
    using_key: str | None = None,
    organism: str | None = None,
    source: Record | None = None,
    exclude: str | list | None = None,
    standardize: bool = True,
    validated_hint_print: str | None = None,
) -> tuple[bool, list]:
    """Validate ontology terms in a pandas series using LaminDB registries.

    The values are checked against the current instance first; what remains
    is checked against the ``using_key`` instance (if any) and, for registries
    that have a ``public`` attribute, resolved via ``registry.from_values``.

    Args:
        values: The values to validate.
        field: The field attribute.
        key: The key referencing the slot in the DataFrame.
        using_key: A reference LaminDB instance.
        organism: The organism name.
        source: The source record.
        exclude: Exclude specific values.
        standardize: Standardize the values.
        validated_hint_print: The hint to print for validated values.

    Returns:
        ``(True, [])`` if everything is validated in the current instance,
        ``(False, [])`` if all remaining values were found elsewhere but still
        need to be saved, else ``(False, non_validated_values)``.
    """
    from lamindb._from_values import _print_values
    from lamindb.core._settings import settings

    registry = field.field.model
    model_field = f"{registry.__name__}.{field.field.name}"

    def _log_mapping_info():
        # header line without indent, everything after it indented
        logger.indent = ""
        logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}")
        logger.indent = " "

    kwargs = check_registry_organism(registry, organism)
    if source:
        kwargs["source"] = source
    kwargs_current = get_current_filter_kwargs(registry, kwargs)

    # inspect the default instance
    inspect_result = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry,
        standardize=standardize,
        exclude=exclude,
        **kwargs_current,
    )
    non_validated = inspect_result.non_validated

    # inspect the using instance
    values_validated = []
    if using_key is not None and using_key != "default" and non_validated:
        registry_using = get_registry_instance(registry, using_key)
        inspect_result = standardize_and_inspect(
            values=non_validated,
            field=field,
            registry=registry_using,
            standardize=standardize,
            exclude=exclude,
            **kwargs,
        )
        non_validated = inspect_result.non_validated
        values_validated += inspect_result.validated

    # inspect from public (bionty only)
    if hasattr(registry, "public"):
        verbosity = settings.verbosity
        try:
            settings.verbosity = "error"
            public_records = registry.from_values(
                non_validated,
                field=field,
                **kwargs_current,
            )
            values_validated += [getattr(r, field.field.name) for r in public_records]
        finally:
            settings.verbosity = verbosity

    validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
    n_validated = len(values_validated)
    if n_validated > 0:
        _log_mapping_info()
        logger.warning(
            f"found {colors.yellow(n_validated)} validated terms: "
            f"{colors.yellow(values_validated)}\n → save terms via "
            f"{colors.yellow(validated_hint_print)}"
        )

    non_validated_hint_print = f".add_new_from('{key}')"
    non_validated = [i for i in non_validated if i not in values_validated]
    n_non_validated = len(non_validated)
    if n_non_validated == 0:
        if n_validated == 0:
            logger.indent = ""
            logger.success(f"{key} is validated against {colors.italic(model_field)}")
            return True, []
        # validated values still need to be saved to the current instance;
        # reset the indent like every other exit does, otherwise the next call
        # sees a non-empty indent and skips its own "mapping ..." header
        logger.indent = ""
        return False, []

    are = "are" if n_non_validated > 1 else "is"
    print_values = _print_values(non_validated)
    warning_message = (
        f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
        f"{colors.red(print_values)}\n → fix typos, remove non-existent values, or save terms via "
        f"{colors.red(non_validated_hint_print)}"
    )
    # an empty indent means the mapping header was not logged yet for this key
    if logger.indent == "":
        _log_mapping_info()
    logger.warning(warning_message)
    logger.indent = ""
    return False, non_validated
|
1199
|
-
|
1200
|
-
|
1201
|
-
def validate_categories_in_df(
    df: pd.DataFrame,
    fields: dict[str, FieldAttr],
    using_key: str | None = None,
    sources: dict[str, Record] | None = None,
    exclude: dict | None = None,
    **kwargs,
) -> tuple[bool, dict]:
    """Validate categories in DataFrame columns using LaminDB registries.

    Args:
        df: The DataFrame holding one column per key of ``fields``.
        fields: Maps a column name to the registry field it is validated against.
        using_key: A reference LaminDB instance.
        sources: Maps a column name to its source record.
        exclude: Maps a column name to the value(s) to exclude.
        **kwargs: Passed on to ``validate_categories`` for every column.

    Returns:
        Whether all columns validated, and a dict mapping each column with
        non-validated values to those values.
    """
    if not fields:
        return True, {}

    source_by_key = {} if sources is None else sources
    all_valid = True
    missing_by_key = {}
    for column, column_field in fields.items():
        column_exclude = exclude.get(column) if exclude else None
        column_ok, column_missing = validate_categories(
            df[column],
            field=column_field,
            key=column,
            using_key=using_key,
            source=source_by_key.get(column),
            exclude=column_exclude,
            **kwargs,
        )
        all_valid &= column_ok
        if len(column_missing) > 0:
            missing_by_key[column] = column_missing
    return all_valid, missing_by_key
|
1231
|
-
|
1232
|
-
|
1233
|
-
def save_artifact(
    data: pd.DataFrame | ad.AnnData | MuData,
    fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
    columns_field: FieldAttr | dict[str, FieldAttr],
    description: str | None = None,
    organism: str | None = None,
    adata: ad.AnnData | None = None,
    **kwargs,
) -> Artifact:
    """Save all metadata with an Artifact.

    Args:
        data: The DataFrame, AnnData or MuData object to save.
        fields: A dictionary mapping obs_column to registry_field
            (per modality for MuData).
        columns_field: The registry field to validate variables index against.
        description: A description of the artifact.
        organism: The organism name.
        adata: The AnnData object to save, must be provided if data is a path.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        The saved Artifact.

    Raises:
        ValueError: If ``data`` is none of the supported object types.
        NotImplementedError: If the artifact accessor is not supported.
    """
    from ._artifact import data_is_anndata

    # --- create the artifact matching the type of `data`
    artifact = None
    if data_is_anndata(data):
        assert adata is not None  # noqa: S101
        artifact = Artifact.from_anndata(data, description=description, **kwargs)
        artifact.n_observations = adata.shape[0]
        # from here on, labels are read from the in-memory object
        data = adata
    elif isinstance(data, pd.DataFrame):
        artifact = Artifact.from_df(data, description=description, **kwargs)
    else:
        try:
            from mudata import MuData

            if isinstance(data, MuData):
                artifact = Artifact.from_mudata(data, description=description, **kwargs)
                artifact.n_observations = data.n_obs
        except ImportError:
            pass
    if artifact is None:
        raise ValueError("data must be a DataFrame, AnnData or MuData object.")
    artifact.save()

    # --- link feature sets
    if isinstance(columns_field, dict):
        columns_registry = list(columns_field.values())[0].field.model
    else:
        columns_registry = columns_field.field.model
    feature_kwargs = check_registry_organism(columns_registry, organism)

    accessor = artifact._accessor
    if accessor == "DataFrame":
        artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
    elif accessor == "AnnData":
        artifact.features._add_set_from_anndata(
            var_field=columns_field, **feature_kwargs
        )
    elif accessor == "MuData":
        artifact.features._add_set_from_mudata(
            var_fields=columns_field, **feature_kwargs
        )
    else:
        raise NotImplementedError

    # --- link labels
    def _add_labels(data, artifact: Artifact, fields: dict[str, FieldAttr]):
        # resolve the labels of every categorical column and attach them
        # to the artifact under the feature named like the column
        features = Feature.lookup().dict()
        for column, column_field in fields.items():
            feature = features.get(column)
            registry = column_field.field.model
            filter_kwargs = check_registry_organism(registry, organism)
            filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
            if isinstance(data, pd.DataFrame):
                frame = data
            else:
                frame = data.obs
            labels = registry.from_values(
                frame[column],
                field=column_field,
                **filter_kwargs_current,
            )
            artifact.labels.add(labels, feature)

    if artifact._accessor == "MuData":
        for modality, modality_fields in fields.items():
            # "obs" refers to the top-level obs table of the MuData object
            labelled = data if modality == "obs" else data[modality]
            _add_labels(labelled, artifact, modality_fields)
    else:
        _add_labels(data, artifact, fields)

    slug = ln_setup.settings.instance.slug
    if ln_setup.settings.instance.is_remote:  # pragma: no cover
        logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
    return artifact
|
1330
|
-
|
1331
|
-
|
1332
|
-
def update_registry(
    values: list[str],
    field: FieldAttr,
    key: str,
    save_function: str = "add_new_from",
    using_key: str | None = None,
    validated_only: bool = True,
    df: pd.DataFrame | None = None,
    organism: str | None = None,
    dtype: str | None = None,
    source: Record | None = None,
    standardize: bool = True,
    warning: bool = True,
    exclude: str | list | None = None,
    **kwargs,
) -> None:
    """Save features or labels records in the default instance from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        key: The name of the feature to save.
        save_function: The name of the function to save the labels.
        using_key: The name of the instance from which to transfer labels (if applicable).
        validated_only: If True, only save validated labels.
        df: A DataFrame to save labels from.
        organism: The organism name.
        dtype: The type of the feature.
        source: The source record.
        standardize: Whether to standardize the values when inspecting them.
        warning: Whether non-saved values are logged as a warning (else as info).
        exclude: Value(s) that are inspected without source and organism.
        kwargs: Additional keyword arguments to pass to the registry model to create new records.

    Returns:
        NOTE(review): if nothing is left to validate after saving the public
        records, the result of ``registry.from_values`` is returned and no
        summary is logged; otherwise None. The annotation only covers the
        latter -- confirm whether any caller relies on the early return value.
    """
    from lamindb._save import save as ln_save
    from lamindb.core._settings import settings

    registry = field.field.model
    field_name = field.field.name
    filter_kwargs = check_registry_organism(registry, organism)
    if source:
        filter_kwargs["source"] = source

    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"

        # save from public
        filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
        existing_and_public_records = (
            registry.from_values(
                list(values),
                field=field,
                **filter_kwargs_current,
            )
            if values
            else []
        )

        labels_saved: dict = {"from public": [], "without reference": []}

        public_records = [r for r in existing_and_public_records if r._state.adding]
        # here we check to only save the public records if they are from the specified source
        # we check the uid because r.source and source can be from different instances
        if source:
            public_records = [r for r in public_records if r.source.uid == source.uid]
        ln_save(public_records)
        labels_saved["from public"] = [getattr(r, field_name) for r in public_records]
        saved_from_public = set(labels_saved["from public"])
        non_public_labels = [i for i in values if i not in saved_from_public]

        # inspect the default instance
        inspect_result_current = standardize_and_inspect(
            values=non_public_labels,
            field=field,
            registry=registry,
            standardize=standardize,
            exclude=exclude,
            **filter_kwargs_current,
        )
        if not inspect_result_current.non_validated:
            # everything is known by now; `finally` restores the verbosity
            return registry.from_values(
                inspect_result_current.validated,
                field=field,
                **filter_kwargs_current,
            )

        # inspect the using_key instance
        (
            labels_saved[f"from {using_key}"],
            non_validated_labels,
        ) = update_registry_from_using_instance(
            inspect_result_current.non_validated,
            field=field,
            using_key=using_key,
            exclude=exclude,
            **filter_kwargs,
        )

        saved_from_using = set(labels_saved[f"from {using_key}"])
        labels_saved["without reference"] = [
            i for i in non_validated_labels if i not in saved_from_using
        ]

        # save non-validated records
        if not validated_only:
            non_validated_records = []
            if df is not None and registry == Feature:
                non_validated_records = Feature.from_df(df)
            else:
                if "organism" in filter_kwargs:
                    # make sure organism record is saved to the current instance;
                    # use the resolved name: the `organism` argument is None
                    # when the organism was taken from the bionty settings
                    filter_kwargs["organism"] = _save_organism(
                        name=filter_kwargs["organism"]
                    )
                record_kwargs = {
                    **{k: v for k, v in filter_kwargs.items() if k != "source"},
                    **{k: v for k, v in kwargs.items() if k != "sources"},
                }
                init_kwargs = {}
                for value in labels_saved["without reference"]:
                    init_kwargs[field_name] = value
                    if registry == Feature:
                        init_kwargs["dtype"] = "cat" if dtype is None else dtype
                    non_validated_records.append(registry(**init_kwargs, **record_kwargs))
            ln_save(non_validated_records)

        # save parent labels for ulabels
        # NOTE(review): placed at the same level as the `validated_only` check;
        # confirm against the upstream file.
        if registry == ULabel and field_name == "name":
            save_ulabels_with_parent(values, field=field, key=key)
    finally:
        settings.verbosity = verbosity

    log_saved_labels(
        labels_saved,
        key=key,
        save_function=save_function,
        model_field=f"{registry.__name__}.{field_name}",
        validated_only=validated_only,
        warning=warning,
    )
|
1481
|
-
|
1482
|
-
|
1483
|
-
def log_saved_labels(
    labels_saved: dict,
    key: str,
    save_function: str,
    model_field: str,
    validated_only: bool = True,
    warning: bool = True,
) -> None:
    """Log the saved labels.

    Args:
        labels_saved: Maps an origin (e.g. "from public", "without reference")
            to the list of label values of that origin; empty lists are skipped.
        key: The key referencing the slot in the DataFrame.
        save_function: Name of the function to suggest for saving; for
            "add_new_from" a ready-to-run ``.add_new_from('<key>')`` hint is shown.
        model_field: The ``Registry.field`` name shown in the messages.
        validated_only: If True, values "without reference" are reported as
            not saved instead of as added.
        warning: Whether the not-saved message is logged as warning (else info).
    """
    from ._from_values import _print_values

    model_field = colors.italic(model_field)
    for origin, labels in labels_saved.items():
        if not labels:
            continue

        if origin == "without reference" and validated_only:
            msg = colors.yellow(
                f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
            )
            # both hint forms start with a dot, like the `.add_new_from` hint
            lookup_print = (
                f".lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
            )

            hint = f".add_new_from('{key}')"
            msg += f"\n → to lookup values, use {lookup_print}"
            msg += (
                f"\n → to save, run {colors.yellow(hint)}"
                if save_function == "add_new_from"
                else f"\n → to save, run {colors.yellow(save_function)}"
            )
            if warning:
                logger.warning(msg)
            else:
                logger.info(msg)
        else:
            origin_print = "" if origin == "without reference" else f"{colors.green(origin)} "
            # the term "transferred" stresses that this is always in the context of transferring
            # labels from a public ontology or a different instance to the present instance
            s = "s" if len(labels) > 1 else ""
            logger.success(
                f"added {len(labels)} record{s} {origin_print}with {model_field} for {colors.italic(key)}: {_print_values(labels)}"
            )
|
1526
|
-
|
1527
|
-
|
1528
|
-
def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None:
    """Group the given labels under a parent label named ``is_<key>``.

    Args:
        values: The label values to attach to the parent.
        field: The field the values belong to; its model must be ``ULabel``.
        key: The key used to build the parent label name.
    """
    registry = field.field.model
    assert registry == ULabel  # noqa: S101
    parent_name = f"is_{key}"
    child_records = registry.from_values(list(values), field=field)
    parent = registry.filter(name=parent_name).one_or_none()
    if parent is None:
        # the parent label does not exist yet: create and save it
        parent = registry(name=parent_name)
        parent.save()
    parent.children.add(*child_records)
|
1538
|
-
|
1539
|
-
|
1540
|
-
def update_registry_from_using_instance(
    values: list[str],
    field: FieldAttr,
    using_key: str | None = None,
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
) -> tuple[list[str], list[str]]:
    """Save records found in the ``using_key`` instance for the given values.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        using_key: The name of the instance from which to transfer labels (if applicable).
        standardize: Whether to also standardize the values.
        exclude: Forwarded unchanged to ``standardize_and_inspect``.
        kwargs: Additional keyword arguments forwarded to ``standardize_and_inspect``.

    Returns:
        A tuple containing the list of saved labels and the list of non-saved labels.
        When ``using_key`` is ``None`` or ``"default"``, nothing is saved and
        ``values`` is returned as the non-saved list.
    """
    transferred: list[str] = []
    # without a dedicated source instance there is nothing to transfer
    if using_key is None or using_key == "default":
        return transferred, values

    source_registry = get_registry_instance(field.field.model, using_key)
    inspection = standardize_and_inspect(
        values=values,
        field=field,
        registry=source_registry,
        standardize=standardize,
        exclude=exclude,
        **kwargs,
    )
    field_name = field.field.name
    matching_records = source_registry.filter(
        **{f"{field_name}__in": inspection.validated}
    ).all()
    for record in matching_records:
        record.save()
        transferred.append(getattr(record, field_name))
    return transferred, inspection.non_validated
|
1583
|
-
|
1584
|
-
|
1585
|
-
def _save_organism(name: str):  # pragma: no cover
    """Return the organism record called ``name``, saving it from source if needed.

    Raises:
        ValueError: If the organism is neither in the registry nor available
            via ``bt.Organism.from_source``.
    """
    import bionty as bt

    existing = bt.Organism.filter(name=name).one_or_none()
    if existing is not None:
        return existing

    # not registered yet: try to create the record from the source
    from_source = bt.Organism.from_source(name=name)
    if from_source is None:
        raise ValueError(
            f"Organism '{name}' not found\n"
            f" → please save it: bt.Organism(name='{name}').save()"
        )
    from_source.save()
    return from_source
|
1599
|
-
|
1600
|
-
|
1601
|
-
# Alias so that code referring to the name `Curate` resolves to `Curator`.
Curate = Curator # backward compat
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import copy
|
4
|
+
from typing import TYPE_CHECKING, Iterable
|
5
|
+
|
6
|
+
import anndata as ad
|
7
|
+
import lamindb_setup as ln_setup
|
8
|
+
import pandas as pd
|
9
|
+
from lamin_utils import colors, logger
|
10
|
+
from lamindb_setup.core._docs import doc_args
|
11
|
+
from lnschema_core import (
|
12
|
+
Artifact,
|
13
|
+
Feature,
|
14
|
+
Record,
|
15
|
+
Run,
|
16
|
+
ULabel,
|
17
|
+
)
|
18
|
+
|
19
|
+
from .core.exceptions import ValidationError
|
20
|
+
|
21
|
+
if TYPE_CHECKING:
|
22
|
+
from lamindb_setup.core.types import UPathStr
|
23
|
+
from lnschema_core.types import FieldAttr
|
24
|
+
from mudata import MuData
|
25
|
+
|
26
|
+
|
27
|
+
class CurateLookup:
    """Lookup categories from the reference instance.

    Field names that are valid identifiers can be accessed as attributes
    (``lookup.cell_type``); any field name can be accessed by subscription
    (``lookup["cell type"]``). Both return the ``lookup()`` object of the
    registry behind the field.

    Args:
        categoricals: Mapping of field name to registry field.
        slots: Additional field name to registry field mapping; entries
            override ``categoricals`` on key collisions.
        using_key: Instance to look up in. ``"default"`` and ``None`` mean the
            current instance, ``"public"`` means the public reference.
    """

    def __init__(
        self,
        categoricals: dict[str, FieldAttr],
        slots: dict[str, FieldAttr] | None = None,
        using_key: str | None = None,
    ) -> None:
        if slots is None:
            slots = {}
        self._fields = {**categoricals, **slots}
        self._using_key = None if using_key == "default" else using_key
        self._using_key_name = self._using_key or ln_setup.settings.instance.slug
        debug_message = (
            f"Lookup objects from the " f"{colors.italic(self._using_key_name)}"
        )
        logger.debug(debug_message)

    def _resolve_lookup(self, name):
        """Return the lookup object of the registry behind field ``name``."""
        registry = self._fields[name].field.model
        if self._using_key == "public":
            return registry.public().lookup()
        return get_registry_instance(registry, self._using_key).lookup()

    def __getattr__(self, name):
        # __getattr__ only runs after normal attribute lookup failed. Read
        # `_fields` straight from the instance dict: going through
        # `self._fields` would re-enter __getattr__ and recurse forever on an
        # instance whose __init__ has not run (copy, pickle, __new__).
        fields = self.__dict__.get("_fields", {})
        if name in fields:
            return self._resolve_lookup(name)
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __getitem__(self, name):
        if name in self._fields:
            return self._resolve_lookup(name)
        # AttributeError (not KeyError) is kept for backward compatibility
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __repr__(self) -> str:
        if len(self._fields) > 0:
            # identifier-like keys are shown as attributes, others as subscripts
            getattr_keys = "\n ".join(
                [f".{key}" for key in self._fields if key.isidentifier()]
            )
            getitem_keys = "\n ".join(
                [str([key]) for key in self._fields if not key.isidentifier()]
            )
            return (
                f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
                f"{colors.green(getattr_keys)}\n "
                f"{colors.green(getitem_keys)}\n\n"
                "Example:\n → categories = validator.lookup().cell_type\n"
                " → categories.alveolar_type_1_fibroblast_cell"
            )
        else:  # pragma: no cover
            return colors.warning("No fields are found!")
|
85
|
+
|
86
|
+
|
87
|
+
class BaseCurator:
    """Curate a dataset.

    Both methods here have empty bodies; the curator classes deriving from
    this one provide the implementations.
    """

    def validate(self) -> bool:
        """Validate dataset.

        Returns:
            Boolean indicating whether the dataset is validated.
        """

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the dataset as artifact.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.
        """
|
109
|
+
|
110
|
+
|
111
|
+
class DataFrameCurator(BaseCurator):
    """Curation flow for a DataFrame object.

    See also :class:`~lamindb.Curator`.

    Args:
        df: The DataFrame object to curate.
        columns: The field attribute for the feature column.
        categoricals: A dictionary mapping column names to registry_field.
        using_key: The reference instance containing registries to validate against.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping column names to Source records.
        exclude: A dictionary mapping column names to values to exclude.
        check_valid_keys: Whether to check the keys of ``categoricals``,
            ``sources`` and ``exclude`` against the DataFrame columns.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_df(
        ...     df,
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     }
        ... )
    """

    def __init__(
        self,
        df: pd.DataFrame,
        columns: FieldAttr = Feature.name,
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
        check_valid_keys: bool = True,
    ) -> None:
        from lamindb.core._settings import settings

        # inputs
        self._df = df
        self._fields = categoricals or {}
        self._columns_field = columns
        self._using_key = using_key
        # note: this sets the verbosity on the shared settings object
        settings.verbosity = verbosity
        # state filled in by validate() / save_artifact()
        self._artifact = None
        self._collection = None
        self._validated = False
        self._non_validated = None
        # keyword arguments forwarded to registry helpers
        self._kwargs = {"organism": organism} if organism else {}
        self._sources = {} if sources is None else sources
        self._exclude = {} if exclude is None else exclude
        if check_valid_keys:
            self._check_valid_keys()
        self._save_columns()

    @property
    def non_validated(self) -> list:
        """Return the non-validated features and labels.

        Raises:
            ValueError: If ``validate()`` has not been run yet.
        """
        result = self._non_validated
        if result is None:
            raise ValueError("Please run validate() first!")
        return result

    @property
    def fields(self) -> dict:
        """Return the columns fields to validate against."""
        return self._fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        column_slot = {"columns": self._columns_field}
        return CurateLookup(
            categoricals=self._fields,
            slots=column_slot,
            using_key=using_key or self._using_key,
        )

    def _check_valid_keys(self, extra: set = None) -> None:
        """Check that categoricals/sources/exclude are dicts with allowed keys.

        Allowed keys are the DataFrame columns, ``"columns"`` and ``extra``.
        """
        extra_keys = set() if extra is None else extra
        named_mappings = (
            ("categoricals", self._fields),
            ("sources", self._sources),
            ("exclude", self._exclude),
        )
        for label, mapping in named_mappings:
            if not isinstance(mapping, dict):
                raise TypeError(f"{label} must be a dictionary!")
            allowed = set(self._df.columns) | {"columns"} | extra_keys
            rejected = [key for key in mapping if key not in allowed]
            if len(rejected) > 0:
                raise ValueError(
                    f"the following keys passed to {label} are not allowed: {rejected}"
                )

    def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
        """Save column name records."""
        shared = {
            "field": self._columns_field,
            "key": "columns",
            "save_function": "add_new_from_columns",
            "using_key": self._using_key,
        }
        # the categorical columns themselves are always saved
        update_registry(
            values=list(self.fields.keys()),
            validated_only=False,
            source=self._sources.get("columns"),
            exclude=self._exclude.get("columns"),
            **shared,
            **kwargs,
        )

        # remaining columns follow the validated_only switch
        remaining = set(self._df.columns) - set(self.fields.keys())
        if not remaining:
            return
        update_registry(
            values=list(remaining),
            validated_only=validated_only,
            df=self._df,  # Get the Feature type from df
            source=self._sources.get("columns"),
            exclude=self._exclude.get("columns"),
            warning=False,  # Do not warn about missing columns, just an info message
            **shared,
            **kwargs,
        )

    def add_validated_from(self, key: str, organism: str | None = None):
        """Add validated categories.

        Args:
            key: The key referencing the slot in the DataFrame.
            organism: The organism name.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._update_registry(key, validated_only=True, **self._kwargs)

    def add_new_from(self, key: str, organism: str | None = None, **kwargs):
        """Add validated & new categories.

        Args:
            key: The key referencing the slot in the DataFrame from which to draw terms.
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if key == "all" and len(kwargs) > 0:
            raise ValueError("Cannot pass additional arguments to 'all' key!")
        if organism:
            self._kwargs["organism"] = organism
        self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)

    def add_new_from_columns(self, organism: str | None = None, **kwargs):
        """Add validated & new column names to its registry.

        Args:
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        if organism:
            self._kwargs["organism"] = organism
        self._save_columns(validated_only=False, **self._kwargs, **kwargs)

    def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
        """Dispatch a registry update for ``"all"``, ``"columns"`` or one field."""
        if categorical == "all":
            self._update_registry_all(validated_only=validated_only, **kwargs)
            return
        if categorical == "columns":
            self._save_columns(validated_only=validated_only, **kwargs)
            return
        if categorical not in self.fields:
            raise ValueError(f"Feature {categorical} is not part of the fields!")
        column_values = self._df[categorical].unique().tolist()
        update_registry(
            values=column_values,
            field=self.fields[categorical],
            key=categorical,
            using_key=self._using_key,
            validated_only=validated_only,
            source=self._sources.get(categorical),
            exclude=self._exclude.get(categorical),
            **kwargs,
        )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Save labels for all features."""
        for field_name in self.fields:
            logger.info(f"saving labels for '{field_name}'")
            self._update_registry(field_name, validated_only=validated_only, **kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate variables and categorical observations.

        Args:
            organism: The organism name.

        Returns:
            Whether the DataFrame is validated.
        """
        if organism:
            self._kwargs["organism"] = organism
        outcome = validate_categories_in_df(  # type: ignore
            self._df,
            fields=self.fields,
            using_key=self._using_key,
            sources=self._sources,
            exclude=self._exclude,
            **self._kwargs,
        )
        self._validated, self._non_validated = outcome
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated DataFrame and metadata.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.
        """
        from lamindb.core._settings import settings

        if not self._validated:
            self.validate()
        if not self._validated:
            raise ValidationError("Dataset does not validate. Please curate.")

        # Make sure all labels are saved in the current instance; verbosity is
        # lowered for the duration and restored even if saving fails
        previous_verbosity = settings.verbosity
        try:
            settings.verbosity = "warning"
            # save all validated records to the current instance
            self.add_validated_from("all")

            self._artifact = save_artifact(
                self._df,
                description=description,
                fields=self.fields,
                columns_field=self._columns_field,
                **kwargs,
                **self._kwargs,
            )
        finally:
            settings.verbosity = previous_verbosity

        return self._artifact

    def clean_up_failed_runs(self):
        """Clean up previous failed runs that don't save any outputs."""
        from lamindb.core._context import context

        if context.run is None:
            return
        # delete runs of the same transform without outputs, except the current one
        stale_runs = Run.filter(
            transform=context.run.transform, output_artifacts=None
        ).exclude(uid=context.run.uid)
        stale_runs.delete()
|
369
|
+
|
370
|
+
|
371
|
+
class AnnDataCurator(DataFrameCurator):
    """Curation flow for ``AnnData``.

    See also :class:`~lamindb.Curator`.

    Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.

    See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.

    Args:
        data: The AnnData object or an AnnData-like path.
        var_index: The registry field for mapping the ``.var`` index.
        categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
        obs_columns: The registry field for the ``.obs`` column names.
        using_key: A reference LaminDB instance.
        verbosity: The verbosity level.
        organism: The organism name.
        sources: A dictionary mapping ``.obs.columns`` to Source records.
        exclude: A dictionary mapping column names to values to exclude.

    Examples:
        >>> import bionty as bt
        >>> curate = ln.Curator.from_anndata(
        ...     adata,
        ...     var_index=bt.Gene.ensembl_gene_id,
        ...     categoricals={
        ...         "cell_type_ontology_id": bt.CellType.ontology_id,
        ...         "donor_id": ln.ULabel.name
        ...     },
        ...     organism="human",
        ... )
    """

    def __init__(
        self,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
        exclude: dict | None = None,
    ) -> None:
        from lamindb_setup.core import upath

        from ._artifact import data_is_anndata

        if sources is None:
            sources = {}
        if not data_is_anndata(data):
            raise ValueError(
                "data has to be an AnnData object or a path to AnnData-like"
            )
        if isinstance(data, ad.AnnData):
            self._adata = data
        else:  # pragma: no cover
            from lamindb.core.storage._backed_access import backed_access

            # a path was passed: open it through backed access
            self._adata = backed_access(upath.create_path(data))

        self._data = data
        self._var_field = var_index
        super().__init__(
            df=self._adata.obs,
            categoricals=categoricals,
            columns=obs_columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
            exclude=exclude,
            # key check is deferred until "var_index" can be allowed as a key
            check_valid_keys=False,
        )
        self._obs_fields = categoricals or {}
        self._check_valid_keys(extra={"var_index"})

    @property
    def var_index(self) -> FieldAttr:
        """Return the registry field to validate variables index against."""
        return self._var_field

    @property
    def categoricals(self) -> dict:
        """Return the obs fields to validate against."""
        return self._obs_fields

    def lookup(self, using_key: str | None = None) -> CurateLookup:
        """Lookup categories.

        Args:
            using_key: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        return CurateLookup(
            categoricals=self._obs_fields,
            slots={"columns": self._columns_field, "var_index": self._var_field},
            using_key=using_key or self._using_key,
        )

    def _save_from_var_index(
        self, validated_only: bool = True, organism: str | None = None, **kwargs
    ):
        """Save variable records.

        Args:
            validated_only: Forwarded to ``update_registry``.
            organism: The organism name.
            **kwargs: Additional keyword arguments forwarded to
                ``update_registry``. Accepting them here is what lets
                ``add_new_from_var_index(**kwargs)`` work; without it any
                extra keyword raised ``TypeError``.
        """
        update_registry(
            values=list(self._adata.var.index),
            field=self.var_index,
            key="var_index",
            save_function="add_new_from_var_index",
            using_key=self._using_key,
            validated_only=validated_only,
            organism=organism,
            source=self._sources.get("var_index"),
            exclude=self._exclude.get("var_index"),
            **kwargs,
        )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Save labels for all features."""
        for name in self.fields.keys():
            logger.info(f"saving labels for '{name}'")
            if name == "var_index":
                self._save_from_var_index(validated_only=validated_only, **kwargs)
            else:
                self._update_registry(name, validated_only=validated_only, **kwargs)

    def add_new_from_var_index(self, organism: str | None = None, **kwargs):
        """Update variable records.

        Args:
            organism: The organism name.
            **kwargs: Additional keyword arguments to pass to the registry model.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)

    def add_validated_from_var_index(self, organism: str | None = None):
        """Add validated variable records.

        Args:
            organism: The organism name.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        self._save_from_var_index(validated_only=True, **self._kwargs)

    def validate(self, organism: str | None = None) -> bool:
        """Validate categories.

        Validates the ``.var`` index and the categorical ``.obs`` columns.

        Args:
            organism: The organism name.

        Returns:
            Whether the AnnData object is validated.
        """
        self._kwargs.update({"organism": organism} if organism else {})
        if self._using_key is not None and self._using_key != "default":
            logger.important(
                f"validating metadata using registries of instance {colors.italic(self._using_key)}"
            )

        validated_var, non_validated_var = validate_categories(
            self._adata.var.index,
            field=self._var_field,
            key="var_index",
            using_key=self._using_key,
            source=self._sources.get("var_index"),
            validated_hint_print=".add_validated_from_var_index()",
            exclude=self._exclude.get("var_index"),
            **self._kwargs,  # type: ignore
        )
        validated_obs, non_validated_obs = validate_categories_in_df(
            self._adata.obs,
            fields=self.categoricals,
            using_key=self._using_key,
            sources=self._sources,
            exclude=self._exclude,
            **self._kwargs,
        )
        self._non_validated = non_validated_obs  # type: ignore
        # non-validated var entries are reported under the "var_index" key
        if len(non_validated_var) > 0:
            self._non_validated["var_index"] = non_validated_var  # type: ignore
        self._validated = validated_var and validated_obs
        return self._validated

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the validated ``AnnData`` and metadata.

        Args:
            description: Description of the ``AnnData`` object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.

        Raises:
            ValidationError: If the dataset does not validate.
        """
        if not self._validated:
            self.validate()
            if not self._validated:
                raise ValidationError("Dataset does not validate. Please curate.")

        self._artifact = save_artifact(
            self._data,
            adata=self._adata,
            description=description,
            columns_field=self.var_index,
            fields=self.categoricals,
            **self._kwargs,
            **kwargs,
        )
        return self._artifact
|
580
|
+
|
581
|
+
|
582
|
+
class MuDataCurator:
|
583
|
+
"""Curation flow for a ``MuData`` object.
|
584
|
+
|
585
|
+
See also :class:`~lamindb.Curator`.
|
586
|
+
|
587
|
+
Note that if genes or other measurements are removed from the MuData object,
|
588
|
+
the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
|
589
|
+
|
590
|
+
Args:
|
591
|
+
mdata: The MuData object to curate.
|
592
|
+
var_index: The registry field for mapping the ``.var`` index for each modality.
|
593
|
+
For example:
|
594
|
+
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
|
595
|
+
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
596
|
+
Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name"`.
|
597
|
+
using_key: A reference LaminDB instance.
|
598
|
+
verbosity: The verbosity level.
|
599
|
+
organism: The organism name.
|
600
|
+
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
601
|
+
exclude: A dictionary mapping column names to values to exclude.
|
602
|
+
|
603
|
+
Examples:
|
604
|
+
>>> import bionty as bt
|
605
|
+
>>> curate = ln.Curator.from_mudata(
|
606
|
+
... mdata,
|
607
|
+
... var_index={
|
608
|
+
... "rna": bt.Gene.ensembl_gene_id,
|
609
|
+
... "adt": ln.CellMarker.name
|
610
|
+
... },
|
611
|
+
... categoricals={
|
612
|
+
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
613
|
+
... "donor_id": ln.ULabel.name
|
614
|
+
... },
|
615
|
+
... organism="human",
|
616
|
+
... )
|
617
|
+
"""
|
618
|
+
|
619
|
+
def __init__(
    self,
    mdata: MuData,
    var_index: dict[str, dict[str, FieldAttr]],
    categoricals: dict[str, FieldAttr] | None = None,
    using_key: str = "default",
    verbosity: str = "hint",
    organism: str | None = None,
    sources: dict[str, Record] | None = None,
    exclude: dict | None = None,
) -> None:
    """Set up per-modality curators and save validated ``.var`` index records."""
    self._sources = {} if sources is None else sources
    self._exclude = {} if exclude is None else exclude
    self._mdata = mdata
    self._kwargs = {"organism": organism} if organism else {}
    self._var_fields = var_index
    self._verify_modality(self._var_fields.keys())
    self._obs_fields = self._parse_categoricals(categoricals)
    self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
    self._using_key = using_key
    self._verbosity = verbosity

    # one DataFrameCurator per modality; the "obs" pseudo-modality maps to
    # the top-level mdata.obs
    annotators = {}
    for modality in self._modalities:
        obs_frame = mdata[modality].obs if modality != "obs" else mdata.obs
        annotators[modality] = DataFrameCurator(
            df=obs_frame,
            categoricals=self._obs_fields.get(modality, {}),
            using_key=using_key,
            verbosity=verbosity,
            sources=self._sources.get(modality),
            exclude=self._exclude.get(modality),
            check_valid_keys=False,
            **self._kwargs,
        )
    self._df_annotators = annotators

    for modality in self._var_fields.keys():
        self._save_from_var_index_modality(
            modality=modality, validated_only=True, **self._kwargs
        )
|
661
|
+
|
662
|
+
@property
def var_index(self) -> dict[str, FieldAttr]:
    """Return the ``var_index`` mapping passed to the constructor.

    Keys are modality names; each value is the registry field the
    modality's ``.var`` index is saved against.
    """
    return self._var_fields
|
666
|
+
|
667
|
+
@property
def categoricals(self) -> dict:
    """Return the obs fields to validate against.

    This is the result of ``_parse_categoricals``: a mapping from modality
    name (or ``"obs"`` for columns without a modality prefix) to a
    ``{column: registry field}`` dictionary.
    """
    return self._obs_fields
|
671
|
+
|
672
|
+
def _verify_modality(self, modalities: Iterable[str]):
    """Raise ``ValueError`` for the first name that is not a modality of the MuData."""
    for name in modalities:
        if name in self._mdata.mod.keys():
            continue
        raise ValueError(f"modality '{name}' does not exist!")
|
677
|
+
|
678
|
+
def _save_from_var_index_modality(
    self, modality: str, validated_only: bool = True, **kwargs
):
    """Save the ``.var`` index records of one modality via ``update_registry``."""
    var_names = list(self._mdata[modality].var.index)
    registry_field = self._var_fields[modality]
    # sources/exclude are looked up per modality, then by the "var_index" key
    var_source = self._sources.get(modality, {}).get("var_index")
    var_exclude = self._exclude.get(modality, {}).get("var_index")
    update_registry(
        values=var_names,
        field=registry_field,
        key="var_index",
        save_function="add_new_from_var_index",
        using_key=self._using_key,
        validated_only=validated_only,
        dtype="number",
        source=var_source,
        exclude=var_exclude,
        **kwargs,
    )
|
694
|
+
|
695
|
+
def _parse_categoricals(self, categoricals: dict[str, FieldAttr] | None) -> dict:
    """Group the categorical fields by modality.

    Keys of the form ``"<modality>:<column>"`` (with ``<modality>`` a
    modality of the MuData) are stored under that modality with the column
    name stripped of the prefix; all other keys are stored under ``"obs"``.

    Args:
        categoricals: Mapping of ``mdata.obs`` column names to registry
            fields. ``None`` (the constructor default) or an empty mapping
            yields an empty result.

    Returns:
        ``{modality_or_"obs": {column: registry field}}``.

    Raises:
        ValueError: If a key is not a column of ``mdata.obs``.
    """
    obs_fields: dict[str, dict[str, FieldAttr]] = {}
    if not categoricals:
        # the constructor passes None when no categoricals are given
        return obs_fields

    prefixes = tuple(f"{k}:" for k in self._mdata.mod.keys())
    for k, v in categoricals.items():
        if k not in self._mdata.obs.columns:
            raise ValueError(f"column '{k}' does not exist in mdata.obs!")
        if k.startswith(prefixes):
            # split only on the first ":" so column names that themselves
            # contain ":" are kept intact
            modality, col = k.split(":", 1)
            obs_fields.setdefault(modality, {})[col] = v
        else:
            obs_fields.setdefault("obs", {})[k] = v
    return obs_fields
|
712
|
+
|
713
|
+
def lookup(self, using_key: str | None = None) -> CurateLookup:
    """Lookup categories.

    Args:
        using_key: The instance where the lookup is performed.
            if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
            if "public", the lookup is performed on the public reference.
    """
    # NOTE(review): `_obs_fields` maps modality -> {column: field}, while
    # CurateLookup reads `.field.model` from each value — confirm that nested
    # mappings are intended here.
    var_slots = {f"{k}_var_index": v for k, v in self._var_fields.items()}
    all_slots = {**self._obs_fields, **var_slots}
    return CurateLookup(
        categoricals=self._obs_fields,
        slots=all_slots,
        using_key=using_key or self._using_key,
    )
|
729
|
+
|
730
|
+
def add_new_from_columns(
    self,
    modality: str,
    column_names: list[str] | None = None,
    organism: str | None = None,
    **kwargs,
):
    """Update columns records.

    Args:
        modality: The modality name.
        column_names: The column names to save; when empty or None, all
            ``.obs`` columns of the modality are used.
        organism: The organism name.
        **kwargs: Additional keyword arguments to pass to the registry model.
    """
    if organism:
        self._kwargs["organism"] = organism
    if column_names:
        values = column_names
    else:
        values = self._mdata[modality].obs.columns
    update_registry(
        values=list(values),
        field=Feature.name,
        key=f"{modality} obs columns",
        using_key=self._using_key,
        validated_only=False,
        df=self._mdata[modality].obs,
        source=self._sources.get(modality, {}).get("columns"),
        exclude=self._exclude.get(modality, {}).get("columns"),
        **self._kwargs,  # type: ignore
        **kwargs,
    )
|
759
|
+
|
760
|
+
def add_new_from_var_index(
    self, modality: str, organism: str | None = None, **kwargs
):
    """Update variable records.

    Args:
        modality: The modality name.
        organism: The organism name.
        **kwargs: Additional keyword arguments to pass to the registry model.
    """
    if organism:
        # remember the organism for subsequent calls as well
        self._kwargs["organism"] = organism
    self._save_from_var_index_modality(
        modality=modality, validated_only=False, **self._kwargs, **kwargs
    )
|
774
|
+
|
775
|
+
def add_validated_from_var_index(self, modality: str, organism: str | None = None):
    """Save only the validated var index records of one modality.

    Args:
        modality: The modality name.
        organism: The organism name. When given, it is stored in
            ``self._kwargs`` and therefore stays in effect for later calls.
    """
    if organism:
        self._kwargs["organism"] = organism
    self._save_from_var_index_modality(
        modality=modality,
        validated_only=True,
        **self._kwargs,
    )
|
786
|
+
|
787
|
+
def add_validated_from(
    self,
    key: str,
    modality: str | None = None,
    organism: str | None = None,
):
    """Add validated categories through the annotator of one modality.

    Args:
        key: The key referencing the slot in the DataFrame.
        modality: The modality name; falls back to ``"obs"`` when not given.
        organism: The organism name. When given, it is stored in
            ``self._kwargs`` and therefore stays in effect for later calls.
    """
    if organism:
        self._kwargs["organism"] = organism
    target = modality or "obs"
    # a modality without a registered annotator is silently skipped
    if target not in self._df_annotators:
        return
    self._df_annotators[target].add_validated_from(key=key, **self._kwargs)
|
802
|
+
|
803
|
+
def add_new_from(
    self,
    key: str,
    modality: str | None = None,
    organism: str | None = None,
    **kwargs,
):
    """Add validated & new categories through the annotator of one modality.

    Args:
        key: The key referencing the slot in the DataFrame.
        modality: The modality name; falls back to ``"obs"`` when not given.
        organism: The organism name. When given, it is stored in
            ``self._kwargs`` and therefore stays in effect for later calls.
        **kwargs: Additional keyword arguments to pass to the registry model.

    Raises:
        ValueError: If extra keyword arguments are combined with ``key="all"``.
    """
    # reject the invalid combination before any state is touched
    if key == "all" and len(kwargs) > 0:
        raise ValueError("Cannot pass additional arguments to 'all' key!")
    if organism:
        self._kwargs["organism"] = organism
    target = modality or "obs"
    # a modality without a registered annotator is silently skipped
    if target not in self._df_annotators:
        return
    self._df_annotators[target].add_new_from(key=key, **self._kwargs, **kwargs)
|
825
|
+
|
826
|
+
def validate(self, organism: str | None = None) -> bool:
    """Validate categories.

    Validates the var index of every modality listed in ``self._var_fields``
    and the obs slots of every entry in ``self._obs_fields``.

    Args:
        organism: The organism name. When given, it is stored in
            ``self._kwargs`` and forwarded to the validation helpers.

    Returns:
        ``True`` only if every var index and every obs slot validated. The
        result is also stored on ``self._validated``.
    """
    self._kwargs.update({"organism": organism} if organism else {})
    if self._using_key is not None and self._using_key != "default":
        logger.important(
            f"validating metadata using registries of instance {colors.italic(self._using_key)}"
        )
    # --- var index of each modality ---
    validated_var = True
    non_validated_var_modality = {}
    for modality, var_field in self._var_fields.items():
        is_validated_var, non_validated_var = validate_categories(
            self._mdata[modality].var.index,
            field=var_field,
            key=f"{modality}_var_index",
            using_key=self._using_key,
            source=self._sources.get(modality, {}).get("var_index"),
            exclude=self._exclude.get(modality, {}).get("var_index"),
            **self._kwargs,  # type: ignore
        )
        validated_var &= is_validated_var
        if len(non_validated_var) > 0:
            non_validated_var_modality[modality] = non_validated_var

    # --- obs columns, either global ("obs") or per modality ---
    validated_obs = True
    non_validated_obs_modality = {}
    for modality, fields in self._obs_fields.items():
        # the key "obs" refers to the MuData-level obs, any other key to a modality
        if modality == "obs":
            obs = self._mdata.obs
        else:
            obs = self._mdata[modality].obs
        is_validated_obs, non_validated_obs = validate_categories_in_df(
            obs,
            fields=fields,
            using_key=self._using_key,
            sources=self._sources.get(modality),
            exclude=self._exclude.get(modality),
            **self._kwargs,
        )
        validated_obs &= is_validated_obs
        non_validated_obs_modality[modality] = non_validated_obs
        # merge the var-index failures of the same modality into its obs report
        if modality in non_validated_var_modality:
            non_validated_obs_modality[modality]["var_index"] = (
                non_validated_var_modality[modality]
            )
        # NOTE(review): this assignment is overwritten on every iteration, so only
        # the last modality with non-validated values is kept; var-index failures
        # of modalities that are absent from self._obs_fields never reach it.
        # Confirm whether the full per-modality dict was intended.
        if len(non_validated_obs_modality[modality]) > 0:
            self._non_validated = non_validated_obs_modality[modality]
    self._validated = validated_var and validated_obs
    return self._validated
|
874
|
+
|
875
|
+
def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
    """Save the validated ``MuData`` and metadata.

    Args:
        description: Description of the ``MuData`` object.
        **kwargs: Object level metadata.

    Returns:
        A saved artifact record.

    Raises:
        ValidationError: If the object has not been validated successfully.
    """
    if self._validated:
        saved = save_artifact(
            self._mdata,
            description=description,
            columns_field=self.var_index,
            fields=self.categoricals,
            **self._kwargs,
            **kwargs,
        )
        # keep a handle on the artifact that was just created
        self._artifact = saved
        return self._artifact

    raise ValidationError("Please run `validate()` first!")
|
897
|
+
|
898
|
+
|
899
|
+
class Curator(BaseCurator):
    """Dataset curator.

    Data curation entails accurately labeling datasets with standardized metadata
    to facilitate data integration, interpretation and analysis.

    The curation flow has several steps:

    1. Instantiate `Curator` from one of the following dataset objects:

    - :meth:`~lamindb.Curator.from_df`
    - :meth:`~lamindb.Curator.from_anndata`
    - :meth:`~lamindb.Curator.from_mudata`

    During object creation, any passed categoricals found in the object will be saved.

    2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:

    - Values that can be successfully validated and already exist in the registry.
    - Values which are new and not yet validated, or potentially problematic values.

    3. Determine how to handle validated and non-validated values:

    - Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
    - Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
    - All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
    """

    # Each factory below forwards its arguments unchanged, by keyword, to the
    # matching curator class. The "{}" docstrings are placeholders; presumably
    # `doc_args` fills them with the docstring of that class - confirm against
    # the `doc_args` helper.

    @classmethod
    @doc_args(DataFrameCurator.__doc__)
    def from_df(
        cls,
        df: pd.DataFrame,
        categoricals: dict[str, FieldAttr] | None = None,
        columns: FieldAttr = Feature.name,
        using_key: str | None = None,
        verbosity: str = "hint",
        organism: str | None = None,
    ) -> DataFrameCurator:
        """{}"""  # noqa: D415
        return DataFrameCurator(
            df=df,
            categoricals=categoricals,
            columns=columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
        )

    @classmethod
    @doc_args(AnnDataCurator.__doc__)
    def from_anndata(
        cls,
        data: ad.AnnData | UPathStr,
        var_index: FieldAttr,
        categoricals: dict[str, FieldAttr] | None = None,
        obs_columns: FieldAttr = Feature.name,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
        sources: dict[str, Record] | None = None,
    ) -> AnnDataCurator:
        """{}"""  # noqa: D415
        return AnnDataCurator(
            data=data,
            var_index=var_index,
            categoricals=categoricals,
            obs_columns=obs_columns,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
            sources=sources,
        )

    @classmethod
    @doc_args(MuDataCurator.__doc__)
    def from_mudata(
        cls,
        mdata: MuData,
        var_index: dict[str, dict[str, FieldAttr]],
        categoricals: dict[str, FieldAttr] | None = None,
        using_key: str = "default",
        verbosity: str = "hint",
        organism: str | None = None,
    ) -> MuDataCurator:
        """{}"""  # noqa: D415
        return MuDataCurator(
            mdata=mdata,
            var_index=var_index,
            categoricals=categoricals,
            using_key=using_key,
            verbosity=verbosity,
            organism=organism,
        )
|
993
|
+
|
994
|
+
|
995
|
+
def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
    """Return the registry bound to ``using_key``.

    The registry is returned unchanged when ``using_key`` is ``None`` or
    ``"default"``; otherwise ``registry.using(using_key)`` is returned.
    """
    targets_default = using_key is None or using_key == "default"
    return registry if targets_default else registry.using(using_key)
|
1000
|
+
|
1001
|
+
|
1002
|
+
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
    """Make sure the source and organism are saved in the same database as the registry.

    Args:
        registry: The registry the filter kwargs will be used with.
        kwargs: Filter kwargs; only the ``"organism"`` and ``"source"`` entries
            are inspected.

    Returns:
        A shallow copy of ``kwargs``. If the registry queries the default
        database, every ``"organism"`` / ``"source"`` value that is a ``Record``
        from a non-default database is replaced by a copy that has been saved
        via ``.save()``. The input dict is not modified.
    """
    from lamindb.core._settings import settings

    db = registry.filter().db
    registry_uses_default_db = db is None or db == "default"
    filter_kwargs = kwargs.copy()
    # Read the current level before entering the try block: if it were read
    # inside and that read failed, the finally clause would raise
    # UnboundLocalError and mask the real error.
    verbosity = settings.verbosity
    try:
        # silence log output while the records are copied over
        settings.verbosity = "error"
        # organism is handled before source, as before
        for name in ("organism", "source"):
            record = kwargs.get(name)
            if (
                registry_uses_default_db
                and isinstance(record, Record)
                and record._state.db != "default"
            ):
                record_default = copy.copy(record)
                # save the record in the default database
                record_default.save()
                filter_kwargs[name] = record_default
    finally:
        settings.verbosity = verbosity
    return filter_kwargs
|
1028
|
+
|
1029
|
+
|
1030
|
+
def standardize_and_inspect(
    values: Iterable[str],
    field: FieldAttr,
    registry: type[Record],
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
):
    """Standardize and inspect values using a registry.

    Args:
        values: The values to inspect.
        field: The field the values are matched against.
        registry: The registry providing ``inspect`` (and optionally
            ``standardize``).
        standardize: Standardize the values before inspecting them; only done
            if the registry has both ``standardize`` and ``synonyms``.
        exclude: Values that are inspected separately, without ``kwargs``.
            Those that validate are removed from the main inspection and added
            to the validated values of the result.
        **kwargs: Passed on to ``registry.standardize`` and ``registry.inspect``.

    Returns:
        The inspect result of the registry, with the validated excluded values
        appended to its validated list and removed from its non-validated list.
    """
    remaining = list(values)
    excluded_validated = []
    if exclude is not None:
        candidates = [exclude] if isinstance(exclude, str) else exclude
        present = [item for item in candidates if item in remaining]
        if present:
            # exclude values are validated without source and organism
            excluded_validated = registry.inspect(
                present, field=field, mute=True
            ).validated
            # validated exclude values are dropped from the main inspection
            remaining = [item for item in remaining if item not in excluded_validated]

    if (
        standardize
        and hasattr(registry, "standardize")
        # https://github.com/laminlabs/lamindb/issues/1685
        and hasattr(registry, "synonyms")
    ):
        remaining = registry.standardize(remaining, field=field, mute=True, **kwargs)

    result = registry.inspect(remaining, field=field, mute=True, **kwargs)
    result._validated += excluded_validated
    result._non_validated = [
        item for item in result.non_validated if item not in excluded_validated
    ]
    return result
|
1069
|
+
|
1070
|
+
|
1071
|
+
def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
    """Check if a registry needs an organism and return the organism name.

    Returns:
        An empty dict for registries without an ``organism_id`` attribute,
        otherwise ``{"organism": <name>}`` where the name is ``organism`` or,
        as a fallback, the name of ``bt.settings.organism``.

    Raises:
        ValueError: If the registry needs an organism but neither ``organism``
            nor ``bt.settings.organism`` is set.
    """
    if not hasattr(registry, "organism_id"):
        return {}

    import bionty as bt

    nothing_to_fall_back_on = organism is None and bt.settings.organism is None
    if nothing_to_fall_back_on:
        raise ValueError(
            f"{registry.__name__} registry requires an organism!\n"
            " → please pass an organism name via organism="
        )
    return {"organism": organism or bt.settings.organism.name}
|
1083
|
+
|
1084
|
+
|
1085
|
+
def validate_categories(
    values: Iterable[str],
    field: FieldAttr,
    key: str,
    using_key: str | None = None,
    organism: str | None = None,
    source: Record | None = None,
    exclude: str | list | None = None,
    standardize: bool = True,
    validated_hint_print: str | None = None,
) -> tuple[bool, list]:
    """Validate ontology terms in a pandas series using LaminDB registries.

    The values are inspected against the registry of ``field`` first. Values
    that do not validate there are inspected against the ``using_key`` instance
    (if one other than ``"default"`` is given) and, for registries that have a
    ``public`` attribute, looked up via ``registry.from_values``.

    Args:
        values: The values to validate.
        field: The field attribute.
        key: The key referencing the slot in the DataFrame.
        using_key: A reference LaminDB instance.
        organism: The organism name.
        source: The source record.
        exclude: Exclude specific values.
        standardize: Standardize the values.
        validated_hint_print: The hint to print for validated values.

    Returns:
        A tuple ``(is_validated, non_validated)``:

        - ``(True, [])`` if all values validate in the current registry.
        - ``(False, [])`` if the remaining values were only found outside the
          current registry and still need to be saved.
        - ``(False, non_validated)`` if some values could not be validated at all.
    """
    from lamindb._from_values import _print_values
    from lamindb.core._settings import settings

    model_field = f"{field.field.model.__name__}.{field.field.name}"

    def _log_mapping_info():
        # print the header without indentation, then indent the messages that follow
        logger.indent = ""
        logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}")
        logger.indent = " "

    registry = field.field.model

    # `kwargs` holds the filters as passed in; `kwargs_current` is the variant
    # returned by get_current_filter_kwargs for use with the current registry
    kwargs = check_registry_organism(registry, organism)
    kwargs.update({"source": source} if source else {})
    kwargs_current = get_current_filter_kwargs(registry, kwargs)

    # inspect the default instance
    inspect_result = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry,
        standardize=standardize,
        exclude=exclude,
        **kwargs_current,
    )
    non_validated = inspect_result.non_validated

    # inspect the using instance
    # values_validated collects values found outside the current registry
    values_validated = []
    if using_key is not None and using_key != "default" and non_validated:
        registry_using = get_registry_instance(registry, using_key)
        inspect_result = standardize_and_inspect(
            values=non_validated,
            field=field,
            registry=registry_using,
            standardize=standardize,
            exclude=exclude,
            **kwargs,
        )
        non_validated = inspect_result.non_validated
        values_validated += inspect_result.validated

    # inspect from public (bionty only)
    if hasattr(registry, "public"):
        verbosity = settings.verbosity
        try:
            # silence log output of from_values; the level is restored below
            settings.verbosity = "error"
            public_records = registry.from_values(
                non_validated,
                field=field,
                **kwargs_current,
            )
            values_validated += [getattr(r, field.field.name) for r in public_records]
        finally:
            settings.verbosity = verbosity

    validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
    n_validated = len(values_validated)
    if n_validated > 0:
        _log_mapping_info()
        logger.warning(
            f"found {colors.yellow(n_validated)} validated terms: "
            f"{colors.yellow(values_validated)}\n → save terms via "
            f"{colors.yellow(validated_hint_print)}"
        )

    non_validated_hint_print = f".add_new_from('{key}')"
    # values found via the using instance or the public lookup are not reported as non-validated
    non_validated = [i for i in non_validated if i not in values_validated]
    n_non_validated = len(non_validated)
    if n_non_validated == 0:
        if n_validated == 0:
            logger.indent = ""
            logger.success(f"{key} is validated against {colors.italic(model_field)}")
            return True, []
        else:
            # validated values still need to be saved to the current instance
            return False, []
    else:
        are = "are" if n_non_validated > 1 else "is"
        print_values = _print_values(non_validated)
        warning_message = (
            f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
            f"{colors.red(print_values)}\n → fix typos, remove non-existent values, or save terms via "
            f"{colors.red(non_validated_hint_print)}"
        )
        # an empty indent presumably means the mapping header has not been printed yet
        if logger.indent == "":
            _log_mapping_info()
        logger.warning(warning_message)
        logger.indent = ""
        return False, non_validated
|
1199
|
+
|
1200
|
+
|
1201
|
+
def validate_categories_in_df(
    df: pd.DataFrame,
    fields: dict[str, FieldAttr],
    using_key: str | None = None,
    sources: dict[str, Record] = None,
    exclude: dict | None = None,
    **kwargs,
) -> tuple[bool, dict]:
    """Validate categories in DataFrame columns using LaminDB registries.

    Args:
        df: The DataFrame holding one column per key of ``fields``.
        fields: Maps a column name to the registry field it is validated against.
        using_key: A reference LaminDB instance.
        sources: Maps a column name to the source record used for it.
        exclude: Maps a column name to the values excluded for it.
        **kwargs: Passed on to ``validate_categories``.

    Returns:
        A tuple of the overall validation status and a dict mapping each column
        that has non-validated values to those values. Empty ``fields`` yields
        ``(True, {})``.
    """
    if not fields:
        return True, {}

    source_by_column = {} if sources is None else sources
    all_validated = True
    failures = {}
    for column, registry_field in fields.items():
        column_validated, column_failures = validate_categories(
            df[column],
            field=registry_field,
            key=column,
            using_key=using_key,
            source=source_by_column.get(column),
            exclude=exclude.get(column) if exclude else None,
            **kwargs,
        )
        all_validated &= column_validated
        # only columns with non-validated values show up in the report
        if len(column_failures) > 0:
            failures[column] = column_failures
    return all_validated, failures
|
1231
|
+
|
1232
|
+
|
1233
|
+
def save_artifact(
    data: pd.DataFrame | ad.AnnData | MuData,
    fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
    columns_field: FieldAttr | dict[str, FieldAttr],
    description: str | None = None,
    organism: str | None = None,
    adata: ad.AnnData | None = None,
    **kwargs,
) -> Artifact:
    """Save all metadata with an Artifact.

    Args:
        data: The DataFrame, AnnData or MuData object to save.
        fields: A dictionary mapping obs_column to registry_field. For MuData,
            a dictionary mapping a modality (or ``"obs"``) to such a dictionary.
        columns_field: The registry field to validate variables index against.
            For MuData, a dictionary of such fields.
        description: A description of the artifact.
        organism: The organism name.
        adata: The AnnData object to save, must be provided if data is a path.
        kwargs: Additional keyword arguments passed on to the ``Artifact``
            constructor (``from_anndata`` / ``from_df`` / ``from_mudata``).

    Returns:
        The saved Artifact.

    Raises:
        ValueError: If ``data`` is not a DataFrame, AnnData or MuData object.
        NotImplementedError: If the artifact has an unexpected ``_accessor``.
    """
    from ._artifact import data_is_anndata

    artifact = None
    if data_is_anndata(data):
        # `adata` is required in this branch (asserted, not validated)
        assert adata is not None  # noqa: S101
        artifact = Artifact.from_anndata(data, description=description, **kwargs)
        artifact.n_observations = adata.shape[0]
        # from here on, the in-memory object is used for the label lookup
        data = adata

    elif isinstance(data, pd.DataFrame):
        artifact = Artifact.from_df(data, description=description, **kwargs)
    else:
        # mudata is imported here; if the import fails, `artifact` stays None
        # and the ValueError below is raised
        try:
            from mudata import MuData

            if isinstance(data, MuData):
                artifact = Artifact.from_mudata(data, description=description, **kwargs)
                artifact.n_observations = data.n_obs
        except ImportError:
            pass
    if artifact is None:
        raise ValueError("data must be a DataFrame, AnnData or MuData object.")
    artifact.save()

    # the organism requirement is derived from the model of the (first) columns field
    feature_kwargs = check_registry_organism(
        (
            list(columns_field.values())[0].field.model
            if isinstance(columns_field, dict)
            else columns_field.field.model
        ),
        organism,
    )

    # register the feature set(s) according to the artifact's accessor type
    if artifact._accessor == "DataFrame":
        artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
    elif artifact._accessor == "AnnData":
        artifact.features._add_set_from_anndata(
            var_field=columns_field, **feature_kwargs
        )
    elif artifact._accessor == "MuData":
        artifact.features._add_set_from_mudata(
            var_fields=columns_field, **feature_kwargs
        )
    else:
        raise NotImplementedError

    def _add_labels(data, artifact: Artifact, fields: dict[str, FieldAttr]):
        # For each key in `fields`, fetch the records matching that column's
        # values from its registry and add them to the artifact as labels of
        # the Feature with the same name.
        features = Feature.lookup().dict()
        for key, field in fields.items():
            feature = features.get(key)
            registry = field.field.model
            filter_kwargs = check_registry_organism(registry, organism)
            filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
            # a DataFrame is used directly, anything else through its `.obs`
            df = data if isinstance(data, pd.DataFrame) else data.obs
            labels = registry.from_values(
                df[key],
                field=field,
                **filter_kwargs_current,
            )
            artifact.labels.add(labels, feature)

    if artifact._accessor == "MuData":
        # `fields` is keyed by modality here; "obs" refers to the MuData-level obs
        for modality, modality_fields in fields.items():
            if modality == "obs":
                _add_labels(data, artifact, modality_fields)
            else:
                _add_labels(data[modality], artifact, modality_fields)
    else:
        _add_labels(data, artifact, fields)

    slug = ln_setup.settings.instance.slug
    if ln_setup.settings.instance.is_remote:  # pragma: no cover
        logger.important(f"go to https://lamin.ai/{slug}/artifact/{artifact.uid}")
    return artifact
|
1330
|
+
|
1331
|
+
|
1332
|
+
def update_registry(
    values: list[str],
    field: FieldAttr,
    key: str,
    save_function: str = "add_new_from",
    using_key: str | None = None,
    validated_only: bool = True,
    df: pd.DataFrame | None = None,
    organism: str | None = None,
    dtype: str | None = None,
    source: Record | None = None,
    standardize: bool = True,
    warning: bool = True,
    exclude: str | list | None = None,
    **kwargs,
) -> None:
    """Save features or labels records in the default instance from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        key: The name of the feature to save.
        save_function: The name of the function to save the labels.
        using_key: The name of the instance from which to transfer labels (if applicable).
        validated_only: If True, only save validated labels.
        df: A DataFrame to save labels from.
        organism: The organism name.
        dtype: The type of the feature.
        source: The source record.
        standardize: Passed on to ``standardize_and_inspect`` when inspecting
            the current instance.
        warning: Passed on to ``log_saved_labels``.
        exclude: Values to exclude, passed on to the inspection helpers.
        kwargs: Additional keyword arguments to pass to the registry model to create new records.

    Returns:
        ``None`` in general. NOTE(review): if every remaining value already
        validates in the current instance, the function returns the result of
        ``registry.from_values`` early, without calling ``log_saved_labels``,
        which does not match the ``-> None`` annotation. Kept as is for
        backward compatibility.
    """
    from lamindb._save import save as ln_save
    from lamindb.core._settings import settings

    registry = field.field.model
    filter_kwargs = check_registry_organism(registry, organism)
    filter_kwargs.update({"source": source} if source else {})

    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"

        # save from public
        filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
        existing_and_public_records = (
            registry.from_values(
                list(values),
                field=field,
                **filter_kwargs_current,
            )
            if values
            else []
        )

        labels_saved: dict = {"from public": [], "without reference": []}

        # records still in the "adding" state are not yet in the current instance
        public_records = [r for r in existing_and_public_records if r._state.adding]
        # here we check to only save the public records if they are from the specified source
        # we check the uid because r.source and source can be from different instances
        if source:
            public_records = [r for r in public_records if r.source.uid == source.uid]
        ln_save(public_records)
        labels_saved["from public"] = [
            getattr(r, field.field.name) for r in public_records
        ]
        non_public_labels = [i for i in values if i not in labels_saved["from public"]]

        # inspect the default instance
        inspect_result_current = standardize_and_inspect(
            values=non_public_labels,
            field=field,
            registry=registry,
            standardize=standardize,
            exclude=exclude,
            **filter_kwargs_current,
        )
        if not inspect_result_current.non_validated:
            # nothing left to transfer or create; the finally clause restores
            # the verbosity, so no explicit reset is needed here
            return registry.from_values(
                inspect_result_current.validated,
                field=field,
                **filter_kwargs_current,
            )

        # inspect the using_key instance
        (
            labels_saved[f"from {using_key}"],
            non_validated_labels,
        ) = update_registry_from_using_instance(
            inspect_result_current.non_validated,
            field=field,
            using_key=using_key,
            exclude=exclude,
            **filter_kwargs,
        )

        labels_saved["without reference"] = [
            i
            for i in non_validated_labels
            if i not in labels_saved[f"from {using_key}"]
        ]

        # save non-validated records
        if not validated_only:
            non_validated_records = []
            if df is not None and registry == Feature:
                non_validated_records = Feature.from_df(df)
            else:
                if "organism" in filter_kwargs:
                    # make sure organism record is saved to the current instance;
                    # use the resolved name from filter_kwargs because the
                    # `organism` argument is None when the name came from the
                    # bionty settings fallback in check_registry_organism
                    filter_kwargs["organism"] = _save_organism(
                        name=filter_kwargs["organism"]
                    )
                init_kwargs = {}
                for value in labels_saved["without reference"]:
                    init_kwargs[field.field.name] = value
                    if registry == Feature:
                        init_kwargs["dtype"] = "cat" if dtype is None else dtype
                    non_validated_records.append(
                        registry(
                            **init_kwargs,
                            **{k: v for k, v in filter_kwargs.items() if k != "source"},
                            **{k: v for k, v in kwargs.items() if k != "sources"},
                        )
                    )
            ln_save(non_validated_records)

            # save parent labels for ulabels
            if registry == ULabel and field.field.name == "name":
                save_ulabels_with_parent(values, field=field, key=key)
    finally:
        settings.verbosity = verbosity

    log_saved_labels(
        labels_saved,
        key=key,
        save_function=save_function,
        model_field=f"{registry.__name__}.{field.field.name}",
        validated_only=validated_only,
        warning=warning,
    )
|
1481
|
+
|
1482
|
+
|
1483
|
+
def log_saved_labels(
    labels_saved: dict,
    key: str,
    save_function: str,
    model_field: str,
    validated_only: bool = True,
    warning: bool = True,
) -> None:
    """Log the saved labels.

    Args:
        labels_saved: Maps an origin (e.g. ``"from public"``,
            ``"without reference"``) to the labels of that origin; origins with
            no labels are skipped.
        key: The key the labels belong to.
        save_function: The name of the function to suggest for saving.
        model_field: The ``Registry.field`` the labels were mapped on.
        validated_only: If True, labels ``"without reference"`` are reported as
            not saved instead of as added.
        warning: Report unsaved labels as a warning (True) or as info (False).
    """
    from ._from_values import _print_values

    styled_field = colors.italic(model_field)
    for origin, labels in labels_saved.items():
        if not labels:
            continue

        unsaved = origin == "without reference" and validated_only
        if not unsaved:
            prefix = "" if origin == "without reference" else f"{colors.green(origin)} "
            # "added" always refers to records transferred from a public ontology
            # or a different instance, or newly created, in the present instance
            plural = "s" if len(labels) > 1 else ""
            logger.success(
                f"added {len(labels)} record{plural} {prefix}with {styled_field} for {colors.italic(key)}: {_print_values(labels)}"
            )
            continue

        message = colors.yellow(
            f"{len(labels)} non-validated values are not saved in {styled_field}: {labels}!"
        )
        if key.isidentifier():
            lookup_print = f"lookup().{key}"
        else:
            lookup_print = f".lookup()['{key}']"
        message += f"\n → to lookup values, use {lookup_print}"

        if save_function == "add_new_from":
            suggestion = f".add_new_from('{key}')"
        else:
            suggestion = save_function
        message += f"\n → to save, run {colors.yellow(suggestion)}"

        emit = logger.warning if warning else logger.info
        emit(message)
|
1526
|
+
|
1527
|
+
|
1528
|
+
def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> None:
    """Save a parent label for the given labels.

    The parent is the ``ULabel`` named ``is_<key>``; it is created and saved if
    it does not exist yet, and the records for ``values`` are added as its
    children.
    """
    registry = field.field.model
    assert registry == ULabel  # noqa: S101
    child_records = registry.from_values(list(values), field=field)
    parent = registry.filter(name=f"is_{key}").one_or_none()
    if parent is None:
        parent = registry(name=f"is_{key}")
        parent.save()
    parent.children.add(*child_records)
|
1538
|
+
|
1539
|
+
|
1540
|
+
def update_registry_from_using_instance(
    values: list[str],
    field: FieldAttr,
    using_key: str | None = None,
    standardize: bool = False,
    exclude: str | list | None = None,
    **kwargs,
) -> tuple[list[str], list[str]]:
    """Save features or labels records from the using_key instance.

    Args:
        values: A list of values to be saved as labels.
        field: The FieldAttr object representing the field for which labels are being saved.
        using_key: The name of the instance from which to transfer labels (if applicable).
        standardize: Whether to also standardize the values.
        exclude: Values to exclude, passed on to ``standardize_and_inspect``.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        A tuple containing the list of saved labels and the list of non-saved labels.
        Without a non-default ``using_key``, nothing is saved and ``values``
        itself is returned as the non-saved labels.
    """
    if using_key is None or using_key == "default":
        return [], values

    field_name = field.field.name
    registry_using = get_registry_instance(field.field.model, using_key)
    inspected = standardize_and_inspect(
        values=values,
        field=field,
        registry=registry_using,
        standardize=standardize,
        exclude=exclude,
        **kwargs,
    )
    # fetch the validated records from the using instance and save each of them
    matches = registry_using.filter(**{f"{field_name}__in": inspected.validated}).all()
    transferred = []
    for record in matches:
        record.save()
        transferred.append(getattr(record, field_name))
    return transferred, inspected.non_validated
|
1583
|
+
|
1584
|
+
|
1585
|
+
def _save_organism(name: str):  # pragma: no cover
    """Save an organism record.

    Returns the existing ``bt.Organism`` named ``name`` if there is one.
    Otherwise the record is created via ``bt.Organism.from_source``, saved and
    returned.

    Raises:
        ValueError: If ``bt.Organism.from_source`` does not find the organism.
    """
    import bionty as bt

    existing = bt.Organism.filter(name=name).one_or_none()
    if existing is not None:
        return existing

    from_source = bt.Organism.from_source(name=name)
    if from_source is None:
        raise ValueError(
            f"Organism '{name}' not found\n"
            f" → please save it: bt.Organism(name='{name}').save()"
        )
    from_source.save()
    return from_source
|
1599
|
+
|
1600
|
+
|
1601
|
+
# Alias that keeps the older name `Curate` available alongside `Curator`.
Curate = Curator  # backward compat
|