lamindb 0.74.3__py3-none-any.whl → 0.75.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +85 -43
- lamindb/_can_validate.py +100 -35
- lamindb/_collection.py +36 -28
- lamindb/_curate.py +432 -181
- lamindb/_feature_set.py +5 -5
- lamindb/_filter.py +3 -3
- lamindb/_finish.py +29 -23
- lamindb/_from_values.py +47 -66
- lamindb/_is_versioned.py +1 -1
- lamindb/_parents.py +38 -13
- lamindb/_record.py +41 -42
- lamindb/_save.py +7 -7
- lamindb/_transform.py +27 -16
- lamindb/_view.py +13 -11
- lamindb/core/__init__.py +2 -0
- lamindb/core/_data.py +18 -20
- lamindb/core/_feature_manager.py +50 -50
- lamindb/core/_label_manager.py +17 -19
- lamindb/core/_mapped_collection.py +1 -1
- lamindb/core/_run_context.py +6 -8
- lamindb/core/datasets/_core.py +7 -7
- lamindb/core/exceptions.py +11 -0
- lamindb/core/schema.py +5 -5
- lamindb/core/storage/__init__.py +12 -2
- lamindb/core/storage/_anndata_accessor.py +735 -0
- lamindb/core/storage/_backed_access.py +77 -747
- lamindb/core/storage/_valid_suffixes.py +16 -2
- lamindb/core/storage/paths.py +9 -14
- lamindb/core/types.py +3 -0
- lamindb/core/versioning.py +1 -1
- lamindb/integrations/__init__.py +1 -0
- lamindb/integrations/_vitessce.py +68 -31
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/METADATA +5 -5
- lamindb-0.75.1.dist-info/RECORD +58 -0
- lamindb-0.74.3.dist-info/RECORD +0 -57
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/LICENSE +0 -0
- {lamindb-0.74.3.dist-info → lamindb-0.75.1.dist-info}/WHEEL +0 -0
lamindb/_curate.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
|
3
|
+
import copy
|
4
|
+
from typing import TYPE_CHECKING, Iterable, Type
|
4
5
|
|
5
6
|
import anndata as ad
|
6
7
|
import lamindb_setup as ln_setup
|
@@ -9,7 +10,6 @@ from lamin_utils import colors, logger
|
|
9
10
|
from lamindb_setup.core._docs import doc_args
|
10
11
|
from lnschema_core import (
|
11
12
|
Artifact,
|
12
|
-
Collection,
|
13
13
|
Feature,
|
14
14
|
Record,
|
15
15
|
Run,
|
@@ -31,23 +31,25 @@ class CurateLookup:
|
|
31
31
|
self,
|
32
32
|
categoricals: dict[str, FieldAttr],
|
33
33
|
slots: dict[str, FieldAttr] = None,
|
34
|
-
|
34
|
+
using_key: str | None = None,
|
35
35
|
) -> None:
|
36
36
|
if slots is None:
|
37
37
|
slots = {}
|
38
38
|
self._fields = {**categoricals, **slots}
|
39
|
-
self.
|
40
|
-
self.
|
41
|
-
debug_message =
|
39
|
+
self._using_key = None if using_key == "default" else using_key
|
40
|
+
self._using_key_name = self._using_key or ln_setup.settings.instance.slug
|
41
|
+
debug_message = (
|
42
|
+
f"Lookup objects from the " f"{colors.italic(self._using_key_name)}"
|
43
|
+
)
|
42
44
|
logger.debug(debug_message)
|
43
45
|
|
44
46
|
def __getattr__(self, name):
|
45
47
|
if name in self._fields:
|
46
48
|
registry = self._fields[name].field.model
|
47
|
-
if self.
|
49
|
+
if self._using_key == "public":
|
48
50
|
return registry.public().lookup()
|
49
51
|
else:
|
50
|
-
return get_registry_instance(registry, self.
|
52
|
+
return get_registry_instance(registry, self._using_key).lookup()
|
51
53
|
raise AttributeError(
|
52
54
|
f"'{self.__class__.__name__}' object has no attribute '{name}'"
|
53
55
|
)
|
@@ -55,10 +57,10 @@ class CurateLookup:
|
|
55
57
|
def __getitem__(self, name):
|
56
58
|
if name in self._fields:
|
57
59
|
registry = self._fields[name].field.model
|
58
|
-
if self.
|
60
|
+
if self._using_key == "public":
|
59
61
|
return registry.public().lookup()
|
60
62
|
else:
|
61
|
-
return get_registry_instance(registry, self.
|
63
|
+
return get_registry_instance(registry, self._using_key).lookup()
|
62
64
|
raise AttributeError(
|
63
65
|
f"'{self.__class__.__name__}' object has no attribute '{name}'"
|
64
66
|
)
|
@@ -72,7 +74,7 @@ class CurateLookup:
|
|
72
74
|
[str([key]) for key in self._fields if not key.isidentifier()]
|
73
75
|
)
|
74
76
|
return (
|
75
|
-
f"Lookup objects from the {colors.italic(self.
|
77
|
+
f"Lookup objects from the {colors.italic(self._using_key_name)}:\n "
|
76
78
|
f"{colors.green(getattr_keys)}\n "
|
77
79
|
f"{colors.green(getitem_keys)}\n\n"
|
78
80
|
"Example:\n → categories = validator.lookup().cell_type\n"
|
@@ -83,15 +85,19 @@ class CurateLookup:
|
|
83
85
|
|
84
86
|
|
85
87
|
class DataFrameCurator:
|
86
|
-
"""
|
88
|
+
"""Curation flow for a DataFrame object.
|
89
|
+
|
90
|
+
See also :class:`~lamindb.Curate`.
|
87
91
|
|
88
92
|
Args:
|
89
93
|
df: The DataFrame object to curate.
|
90
94
|
columns: The field attribute for the feature column.
|
91
95
|
categoricals: A dictionary mapping column names to registry_field.
|
92
|
-
|
96
|
+
using_key: The reference instance containing registries to validate against.
|
93
97
|
verbosity: The verbosity level.
|
94
98
|
organism: The organism name.
|
99
|
+
sources: A dictionary mapping column names to Source records.
|
100
|
+
exclude: A dictionary mapping column names to values to exclude.
|
95
101
|
|
96
102
|
Examples:
|
97
103
|
>>> import bionty as bt
|
@@ -106,40 +112,56 @@ class DataFrameCurator:
|
|
106
112
|
df: pd.DataFrame,
|
107
113
|
columns: FieldAttr = Feature.name,
|
108
114
|
categoricals: dict[str, FieldAttr] | None = None,
|
109
|
-
|
115
|
+
using_key: str | None = None,
|
110
116
|
verbosity: str = "hint",
|
111
117
|
organism: str | None = None,
|
118
|
+
sources: dict[str, Record] | None = None,
|
119
|
+
exclude: dict | None = None,
|
112
120
|
) -> None:
|
113
121
|
from lamindb.core._settings import settings
|
114
122
|
|
115
123
|
self._df = df
|
116
124
|
self._fields = categoricals or {}
|
117
125
|
self._columns_field = columns
|
118
|
-
self.
|
126
|
+
self._using_key = using_key
|
119
127
|
settings.verbosity = verbosity
|
120
128
|
self._artifact = None
|
121
129
|
self._collection = None
|
122
130
|
self._validated = False
|
123
131
|
self._kwargs = {"organism": organism} if organism else {}
|
132
|
+
if sources is None:
|
133
|
+
sources = {}
|
134
|
+
self._sources = sources
|
135
|
+
if exclude is None:
|
136
|
+
exclude = {}
|
137
|
+
self._exclude = exclude
|
138
|
+
self._non_validated = None
|
124
139
|
self._save_columns()
|
125
140
|
|
141
|
+
@property
|
142
|
+
def non_validated(self) -> list:
|
143
|
+
"""Return the non-validated features and labels."""
|
144
|
+
if self._non_validated is None:
|
145
|
+
raise ValueError("Please run validate() first!")
|
146
|
+
return self._non_validated
|
147
|
+
|
126
148
|
@property
|
127
149
|
def fields(self) -> dict:
|
128
150
|
"""Return the columns fields to validate against."""
|
129
151
|
return self._fields
|
130
152
|
|
131
|
-
def lookup(self,
|
153
|
+
def lookup(self, using_key: str | None = None) -> CurateLookup:
|
132
154
|
"""Lookup categories.
|
133
155
|
|
134
156
|
Args:
|
135
|
-
|
136
|
-
if None (default), the lookup is performed on the instance specified in "
|
157
|
+
using_key: The instance where the lookup is performed.
|
158
|
+
if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
|
137
159
|
if "public", the lookup is performed on the public reference.
|
138
160
|
"""
|
139
161
|
return CurateLookup(
|
140
162
|
categoricals=self._fields,
|
141
163
|
slots={"columns": self._columns_field},
|
142
|
-
|
164
|
+
using_key=using_key or self._using_key,
|
143
165
|
)
|
144
166
|
|
145
167
|
def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
|
@@ -156,8 +178,9 @@ class DataFrameCurator:
|
|
156
178
|
field=self._columns_field,
|
157
179
|
key="columns",
|
158
180
|
save_function="add_new_from_columns",
|
159
|
-
|
181
|
+
using_key=self._using_key,
|
160
182
|
validated_only=False,
|
183
|
+
source=self._sources.get("columns"),
|
161
184
|
**kwargs,
|
162
185
|
)
|
163
186
|
|
@@ -169,9 +192,11 @@ class DataFrameCurator:
|
|
169
192
|
field=self._columns_field,
|
170
193
|
key="columns",
|
171
194
|
save_function="add_new_from_columns",
|
172
|
-
|
195
|
+
using_key=self._using_key,
|
173
196
|
validated_only=validated_only,
|
174
197
|
df=self._df, # Get the Feature type from df
|
198
|
+
source=self._sources.get("columns"),
|
199
|
+
warning=False, # Do not warn about missing columns, just an info message
|
175
200
|
**kwargs,
|
176
201
|
)
|
177
202
|
|
@@ -220,8 +245,9 @@ class DataFrameCurator:
|
|
220
245
|
values=self._df[categorical].unique().tolist(),
|
221
246
|
field=self.fields[categorical],
|
222
247
|
key=categorical,
|
223
|
-
|
248
|
+
using_key=self._using_key,
|
224
249
|
validated_only=validated_only,
|
250
|
+
sources=self._sources.get(categorical),
|
225
251
|
**kwargs,
|
226
252
|
)
|
227
253
|
|
@@ -238,10 +264,12 @@ class DataFrameCurator:
|
|
238
264
|
Whether the DataFrame is validated.
|
239
265
|
"""
|
240
266
|
self._kwargs.update({"organism": organism} if organism else {})
|
241
|
-
self._validated = validate_categories_in_df(
|
267
|
+
self._validated, self._non_validated = validate_categories_in_df( # type: ignore
|
242
268
|
self._df,
|
243
269
|
fields=self.fields,
|
244
|
-
|
270
|
+
using_key=self._using_key,
|
271
|
+
sources=self._sources,
|
272
|
+
exclude=self._exclude,
|
245
273
|
**self._kwargs,
|
246
274
|
)
|
247
275
|
return self._validated
|
@@ -283,41 +311,6 @@ class DataFrameCurator:
|
|
283
311
|
|
284
312
|
return self._artifact
|
285
313
|
|
286
|
-
def save_collection(
|
287
|
-
self,
|
288
|
-
artifact: Artifact | Iterable[Artifact],
|
289
|
-
name: str,
|
290
|
-
description: str | None = None,
|
291
|
-
reference: str | None = None,
|
292
|
-
reference_type: str | None = None,
|
293
|
-
) -> Collection:
|
294
|
-
"""Save a collection from artifact/artifacts.
|
295
|
-
|
296
|
-
Args:
|
297
|
-
artifact: One or several saved Artifacts.
|
298
|
-
name: Title of the publication.
|
299
|
-
description: Description of the publication.
|
300
|
-
reference: Accession number (e.g. GSE#, E-MTAB#, etc.).
|
301
|
-
reference_type: Source type (e.g. GEO, ArrayExpress, SRA, etc.).
|
302
|
-
"""
|
303
|
-
collection = Collection(
|
304
|
-
artifact,
|
305
|
-
name=name,
|
306
|
-
description=description,
|
307
|
-
reference=reference,
|
308
|
-
reference_type=reference_type,
|
309
|
-
)
|
310
|
-
slug = ln_setup.settings.instance.slug
|
311
|
-
if collection._state.adding:
|
312
|
-
collection.save()
|
313
|
-
else: # pragma: no cover
|
314
|
-
collection.save()
|
315
|
-
logger.warning(f"collection already exists in {colors.italic(slug)}!")
|
316
|
-
if ln_setup.settings.instance.is_remote: # pragma: no cover
|
317
|
-
logger.print(f"go to https://lamin.ai/{slug}/collection/{collection.uid}")
|
318
|
-
self._collection = collection
|
319
|
-
return collection
|
320
|
-
|
321
314
|
def clean_up_failed_runs(self):
|
322
315
|
"""Clean up previous failed runs that don't save any outputs."""
|
323
316
|
from lamindb.core._run_context import run_context
|
@@ -329,15 +322,21 @@ class DataFrameCurator:
|
|
329
322
|
|
330
323
|
|
331
324
|
class AnnDataCurator(DataFrameCurator):
|
332
|
-
"""
|
325
|
+
"""Curation flow for ``AnnData``.
|
326
|
+
|
327
|
+
See also :class:`~lamindb.Curate`.
|
328
|
+
|
329
|
+
Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curate.from_anndata`.
|
333
330
|
|
334
331
|
Args:
|
335
332
|
data: The AnnData object or an AnnData-like path.
|
336
333
|
var_index: The registry field for mapping the ``.var`` index.
|
337
334
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
338
|
-
|
335
|
+
using_key: A reference LaminDB instance.
|
339
336
|
verbosity: The verbosity level.
|
340
337
|
organism: The organism name.
|
338
|
+
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
339
|
+
exclude: A dictionary mapping column names to values to exclude.
|
341
340
|
|
342
341
|
Examples:
|
343
342
|
>>> import bionty as bt
|
@@ -354,14 +353,19 @@ class AnnDataCurator(DataFrameCurator):
|
|
354
353
|
data: ad.AnnData | UPathStr,
|
355
354
|
var_index: FieldAttr,
|
356
355
|
categoricals: dict[str, FieldAttr] | None = None,
|
357
|
-
|
356
|
+
obs_columns: FieldAttr = Feature.name,
|
357
|
+
using_key: str = "default",
|
358
358
|
verbosity: str = "hint",
|
359
359
|
organism: str | None = None,
|
360
|
+
sources: dict[str, Record] | None = None,
|
361
|
+
exclude: dict | None = None,
|
360
362
|
) -> None:
|
361
363
|
from lamindb_setup.core import upath
|
362
364
|
|
363
365
|
from ._artifact import data_is_anndata
|
364
366
|
|
367
|
+
if sources is None:
|
368
|
+
sources = {}
|
365
369
|
if not data_is_anndata(data):
|
366
370
|
raise ValueError(
|
367
371
|
"data has to be an AnnData object or a path to AnnData-like"
|
@@ -378,12 +382,14 @@ class AnnDataCurator(DataFrameCurator):
|
|
378
382
|
super().__init__(
|
379
383
|
df=self._adata.obs,
|
380
384
|
categoricals=categoricals,
|
381
|
-
|
385
|
+
columns=obs_columns,
|
386
|
+
using_key=using_key,
|
382
387
|
verbosity=verbosity,
|
383
388
|
organism=organism,
|
389
|
+
sources=sources,
|
390
|
+
exclude=exclude,
|
384
391
|
)
|
385
392
|
self._obs_fields = categoricals
|
386
|
-
self._save_from_var_index(validated_only=True, **self._kwargs)
|
387
393
|
|
388
394
|
@property
|
389
395
|
def var_index(self) -> FieldAttr:
|
@@ -395,18 +401,18 @@ class AnnDataCurator(DataFrameCurator):
|
|
395
401
|
"""Return the obs fields to validate against."""
|
396
402
|
return self._obs_fields
|
397
403
|
|
398
|
-
def lookup(self,
|
404
|
+
def lookup(self, using_key: str | None = None) -> CurateLookup:
|
399
405
|
"""Lookup categories.
|
400
406
|
|
401
407
|
Args:
|
402
|
-
|
408
|
+
using_key: The instance where the lookup is performed.
|
403
409
|
if None (default), the lookup is performed on the instance specified in the "using_key" parameter of the validator.
|
404
410
|
if "public", the lookup is performed on the public reference.
|
405
411
|
"""
|
406
412
|
return CurateLookup(
|
407
413
|
categoricals=self._obs_fields,
|
408
414
|
slots={"columns": self._columns_field, "var_index": self._var_field},
|
409
|
-
|
415
|
+
using_key=using_key or self._using_key,
|
410
416
|
)
|
411
417
|
|
412
418
|
def _save_from_var_index(
|
@@ -414,15 +420,25 @@ class AnnDataCurator(DataFrameCurator):
|
|
414
420
|
):
|
415
421
|
"""Save variable records."""
|
416
422
|
update_registry(
|
417
|
-
values=self._adata.var.index,
|
423
|
+
values=list(self._adata.var.index),
|
418
424
|
field=self.var_index,
|
419
425
|
key="var_index",
|
420
426
|
save_function="add_new_from_var_index",
|
421
|
-
|
427
|
+
using_key=self._using_key,
|
422
428
|
validated_only=validated_only,
|
423
429
|
organism=organism,
|
430
|
+
source=self._sources.get("var_index"),
|
424
431
|
)
|
425
432
|
|
433
|
+
def _update_registry_all(self, validated_only: bool = True, **kwargs):
|
434
|
+
"""Save labels for all features."""
|
435
|
+
for name in self.fields.keys():
|
436
|
+
logger.info(f"saving labels for '{name}'")
|
437
|
+
if name == "var_index":
|
438
|
+
self._save_from_var_index(validated_only=validated_only, **kwargs)
|
439
|
+
else:
|
440
|
+
self._update_registry(name, validated_only=validated_only, **kwargs)
|
441
|
+
|
426
442
|
def add_new_from_var_index(self, organism: str | None = None, **kwargs):
|
427
443
|
"""Update variable records.
|
428
444
|
|
@@ -433,6 +449,15 @@ class AnnDataCurator(DataFrameCurator):
|
|
433
449
|
self._kwargs.update({"organism": organism} if organism else {})
|
434
450
|
self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
|
435
451
|
|
452
|
+
def add_validated_from_var_index(self, organism: str | None = None):
|
453
|
+
"""Add validated variable records.
|
454
|
+
|
455
|
+
Args:
|
456
|
+
organism: The organism name.
|
457
|
+
"""
|
458
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
459
|
+
self._save_from_var_index(validated_only=True, **self._kwargs)
|
460
|
+
|
436
461
|
def validate(self, organism: str | None = None) -> bool:
|
437
462
|
"""Validate categories.
|
438
463
|
|
@@ -443,20 +468,32 @@ class AnnDataCurator(DataFrameCurator):
|
|
443
468
|
Whether the AnnData object is validated.
|
444
469
|
"""
|
445
470
|
self._kwargs.update({"organism": organism} if organism else {})
|
446
|
-
if self.
|
471
|
+
if self._using_key is not None and self._using_key != "default":
|
447
472
|
logger.important(
|
448
|
-
f"validating metadata using registries of instance {colors.italic(self.
|
473
|
+
f"validating metadata using registries of instance {colors.italic(self._using_key)}"
|
449
474
|
)
|
450
|
-
|
475
|
+
|
476
|
+
validated_var, non_validated_var = validate_categories(
|
451
477
|
self._adata.var.index,
|
452
478
|
field=self._var_field,
|
453
479
|
key="var_index",
|
454
|
-
|
455
|
-
|
480
|
+
using_key=self._using_key,
|
481
|
+
source=self._sources.get("var_index"),
|
482
|
+
validated_hint_print=".add_validated_from_var_index()",
|
483
|
+
exclude=self._exclude.get("var_index"),
|
484
|
+
**self._kwargs, # type: ignore
|
456
485
|
)
|
457
|
-
validated_obs = validate_categories_in_df(
|
458
|
-
self._adata.obs,
|
486
|
+
validated_obs, non_validated_obs = validate_categories_in_df(
|
487
|
+
self._adata.obs,
|
488
|
+
fields=self.categoricals,
|
489
|
+
using_key=self._using_key,
|
490
|
+
sources=self._sources,
|
491
|
+
exclude=self._exclude,
|
492
|
+
**self._kwargs,
|
459
493
|
)
|
494
|
+
self._non_validated = non_validated_obs # type: ignore
|
495
|
+
if len(non_validated_var) > 0:
|
496
|
+
self._non_validated["var_index"] = non_validated_var # type: ignore
|
460
497
|
self._validated = validated_var and validated_obs
|
461
498
|
return self._validated
|
462
499
|
|
@@ -488,7 +525,12 @@ class AnnDataCurator(DataFrameCurator):
|
|
488
525
|
|
489
526
|
|
490
527
|
class MuDataCurator:
|
491
|
-
"""
|
528
|
+
"""Curation flow for a ``MuData`` object.
|
529
|
+
|
530
|
+
See also :class:`~lamindb.Curate`.
|
531
|
+
|
532
|
+
Note that if genes or other measurements are removed from the MuData object,
|
533
|
+
the object should be recreated using :meth:`~lamindb.Curate.from_mudata`.
|
492
534
|
|
493
535
|
Args:
|
494
536
|
mdata: The MuData object to curate.
|
@@ -497,9 +539,11 @@ class MuDataCurator:
|
|
497
539
|
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
|
498
540
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
499
541
|
Use modality keys to specify categoricals for MuData slots such as `"rna:cell_type": bt.CellType.name`.
|
500
|
-
|
542
|
+
using_key: A reference LaminDB instance.
|
501
543
|
verbosity: The verbosity level.
|
502
544
|
organism: The organism name.
|
545
|
+
sources: A dictionary mapping ``.obs.columns`` to Source records.
|
546
|
+
exclude: A dictionary mapping column names to values to exclude.
|
503
547
|
|
504
548
|
Examples:
|
505
549
|
>>> import bionty as bt
|
@@ -516,24 +560,34 @@ class MuDataCurator:
|
|
516
560
|
mdata: MuData,
|
517
561
|
var_index: dict[str, dict[str, FieldAttr]],
|
518
562
|
categoricals: dict[str, FieldAttr] | None = None,
|
519
|
-
|
563
|
+
using_key: str = "default",
|
520
564
|
verbosity: str = "hint",
|
521
565
|
organism: str | None = None,
|
566
|
+
sources: dict[str, Record] | None = None,
|
567
|
+
exclude: dict | None = None,
|
522
568
|
) -> None:
|
569
|
+
if sources is None:
|
570
|
+
sources = {}
|
571
|
+
self._sources = sources
|
572
|
+
if exclude is None:
|
573
|
+
exclude = {}
|
574
|
+
self._exclude = exclude
|
523
575
|
self._mdata = mdata
|
524
576
|
self._kwargs = {"organism": organism} if organism else {}
|
525
577
|
self._var_fields = var_index
|
526
578
|
self._verify_modality(self._var_fields.keys())
|
527
579
|
self._obs_fields = self._parse_categoricals(categoricals)
|
528
580
|
self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
|
529
|
-
self.
|
581
|
+
self._using_key = using_key
|
530
582
|
self._verbosity = verbosity
|
531
583
|
self._df_annotators = {
|
532
584
|
modality: DataFrameCurator(
|
533
585
|
df=mdata[modality].obs if modality != "obs" else mdata.obs,
|
534
586
|
categoricals=self._obs_fields.get(modality, {}),
|
535
|
-
|
587
|
+
using_key=using_key,
|
536
588
|
verbosity=verbosity,
|
589
|
+
sources=self._sources.get(modality),
|
590
|
+
exclude=self._exclude.get(modality),
|
537
591
|
**self._kwargs,
|
538
592
|
)
|
539
593
|
for modality in self._modalities
|
@@ -564,11 +618,11 @@ class MuDataCurator:
|
|
564
618
|
):
|
565
619
|
"""Save variable records."""
|
566
620
|
update_registry(
|
567
|
-
values=self._mdata[modality].var.index,
|
621
|
+
values=list(self._mdata[modality].var.index),
|
568
622
|
field=self._var_fields[modality],
|
569
623
|
key="var_index",
|
570
624
|
save_function="add_new_from_var_index",
|
571
|
-
|
625
|
+
using_key=self._using_key,
|
572
626
|
validated_only=validated_only,
|
573
627
|
dtype="number",
|
574
628
|
**kwargs,
|
@@ -592,12 +646,12 @@ class MuDataCurator:
|
|
592
646
|
obs_fields["obs"][k] = v
|
593
647
|
return obs_fields
|
594
648
|
|
595
|
-
def lookup(self,
|
649
|
+
def lookup(self, using_key: str | None = None) -> CurateLookup:
|
596
650
|
"""Lookup categories.
|
597
651
|
|
598
652
|
Args:
|
599
|
-
|
600
|
-
if None (default), the lookup is performed on the instance specified in "
|
653
|
+
using_key: The instance where the lookup is performed.
|
654
|
+
if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
|
601
655
|
if "public", the lookup is performed on the public reference.
|
602
656
|
"""
|
603
657
|
return CurateLookup(
|
@@ -606,7 +660,7 @@ class MuDataCurator:
|
|
606
660
|
**self._obs_fields,
|
607
661
|
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
608
662
|
},
|
609
|
-
|
663
|
+
using_key=using_key or self._using_key,
|
610
664
|
)
|
611
665
|
|
612
666
|
def add_new_from_columns(
|
@@ -625,14 +679,15 @@ class MuDataCurator:
|
|
625
679
|
**kwargs: Additional keyword arguments to pass to the registry model.
|
626
680
|
"""
|
627
681
|
self._kwargs.update({"organism": organism} if organism else {})
|
682
|
+
values = column_names or self._mdata[modality].obs.columns
|
628
683
|
update_registry(
|
629
|
-
values=
|
684
|
+
values=list(values),
|
630
685
|
field=Feature.name,
|
631
686
|
key=f"{modality} obs columns",
|
632
|
-
|
687
|
+
using_key=self._using_key,
|
633
688
|
validated_only=False,
|
634
689
|
df=self._mdata[modality].obs,
|
635
|
-
**self._kwargs,
|
690
|
+
**self._kwargs, # type: ignore
|
636
691
|
**kwargs,
|
637
692
|
)
|
638
693
|
|
@@ -651,6 +706,18 @@ class MuDataCurator:
|
|
651
706
|
modality=modality, validated_only=False, **self._kwargs, **kwargs
|
652
707
|
)
|
653
708
|
|
709
|
+
def add_validated_from_var_index(self, modality: str, organism: str | None = None):
|
710
|
+
"""Add validated variable records.
|
711
|
+
|
712
|
+
Args:
|
713
|
+
modality: The modality name.
|
714
|
+
organism: The organism name.
|
715
|
+
"""
|
716
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
717
|
+
self._save_from_var_index_modality(
|
718
|
+
modality=modality, validated_only=True, **self._kwargs
|
719
|
+
)
|
720
|
+
|
654
721
|
def add_validated_from(
|
655
722
|
self, key: str, modality: str | None = None, organism: str | None = None
|
656
723
|
):
|
@@ -693,28 +760,48 @@ class MuDataCurator:
|
|
693
760
|
def validate(self, organism: str | None = None) -> bool:
|
694
761
|
"""Validate categories."""
|
695
762
|
self._kwargs.update({"organism": organism} if organism else {})
|
696
|
-
if self.
|
763
|
+
if self._using_key is not None and self._using_key != "default":
|
697
764
|
logger.important(
|
698
|
-
f"validating metadata using registries of instance {colors.italic(self.
|
765
|
+
f"validating metadata using registries of instance {colors.italic(self._using_key)}"
|
699
766
|
)
|
700
767
|
validated_var = True
|
768
|
+
non_validated_var_modality = {}
|
701
769
|
for modality, var_field in self._var_fields.items():
|
702
|
-
|
770
|
+
is_validated_var, non_validated_var = validate_categories(
|
703
771
|
self._mdata[modality].var.index,
|
704
772
|
field=var_field,
|
705
773
|
key=f"{modality}_var_index",
|
706
|
-
|
707
|
-
|
774
|
+
using_key=self._using_key,
|
775
|
+
exclude=self._exclude.get(f"{modality}_var_index"),
|
776
|
+
**self._kwargs, # type: ignore
|
708
777
|
)
|
778
|
+
validated_var &= is_validated_var
|
779
|
+
if len(non_validated_var) > 0:
|
780
|
+
non_validated_var_modality[modality] = non_validated_var
|
781
|
+
|
709
782
|
validated_obs = True
|
783
|
+
non_validated_obs_modality = {}
|
710
784
|
for modality, fields in self._obs_fields.items():
|
711
785
|
if modality == "obs":
|
712
786
|
obs = self._mdata.obs
|
713
787
|
else:
|
714
788
|
obs = self._mdata[modality].obs
|
715
|
-
|
716
|
-
obs,
|
789
|
+
is_validated_obs, non_validated_obs = validate_categories_in_df(
|
790
|
+
obs,
|
791
|
+
fields=fields,
|
792
|
+
using_key=self._using_key,
|
793
|
+
sources=self._sources.get(modality),
|
794
|
+
exclude=self._exclude.get(modality),
|
795
|
+
**self._kwargs,
|
717
796
|
)
|
797
|
+
validated_obs &= is_validated_obs
|
798
|
+
non_validated_obs_modality[modality] = non_validated_obs
|
799
|
+
if modality in non_validated_var_modality:
|
800
|
+
non_validated_obs_modality[modality]["var_index"] = (
|
801
|
+
non_validated_var_modality[modality]
|
802
|
+
)
|
803
|
+
if len(non_validated_obs_modality[modality]) > 0:
|
804
|
+
self._non_validated = non_validated_obs_modality[modality]
|
718
805
|
self._validated = validated_var and validated_obs
|
719
806
|
return self._validated
|
720
807
|
|
@@ -743,7 +830,32 @@ class MuDataCurator:
|
|
743
830
|
|
744
831
|
|
745
832
|
class Curate:
|
746
|
-
"""
|
833
|
+
"""Curation flow.
|
834
|
+
|
835
|
+
Data curation entails accurately labeling datasets with standardized metadata
|
836
|
+
to facilitate data integration, interpretation and analysis.
|
837
|
+
|
838
|
+
The curation flow has several steps:
|
839
|
+
|
840
|
+
1. Create a :class:`Curate` object corresponding to the object type that you want to curate:
|
841
|
+
|
842
|
+
- :meth:`~lamindb.Curate.from_df`
|
843
|
+
- :meth:`~lamindb.Curate.from_anndata`
|
844
|
+
- :meth:`~lamindb.Curate.from_mudata`
|
845
|
+
|
846
|
+
During object creation, any passed categoricals found in the object will be saved.
|
847
|
+
|
848
|
+
2. Run :meth:`~lamindb.core.DataFrameCurator.validate` to check the data against the defined criteria. This method identifies:
|
849
|
+
|
850
|
+
- Values that can be successfully validated and already exist in the registry.
|
851
|
+
- Values which are new and not yet validated or potentially problematic values.
|
852
|
+
|
853
|
+
3. Determine how to handle validated and unvalidated values:
|
854
|
+
|
855
|
+
- Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
|
856
|
+
- Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
|
857
|
+
- All unvalidated values can be accessed using :meth:`~lamindb.core.DataFrameCurator.non_validated` and subsequently removed from the object at hand.
|
858
|
+
"""
|
747
859
|
|
748
860
|
@classmethod
|
749
861
|
@doc_args(DataFrameCurator.__doc__)
|
@@ -752,7 +864,7 @@ class Curate:
|
|
752
864
|
df: pd.DataFrame,
|
753
865
|
categoricals: dict[str, FieldAttr] | None = None,
|
754
866
|
columns: FieldAttr = Feature.name,
|
755
|
-
|
867
|
+
using_key: str | None = None,
|
756
868
|
verbosity: str = "hint",
|
757
869
|
organism: str | None = None,
|
758
870
|
) -> DataFrameCurator:
|
@@ -761,7 +873,7 @@ class Curate:
|
|
761
873
|
df=df,
|
762
874
|
categoricals=categoricals,
|
763
875
|
columns=columns,
|
764
|
-
|
876
|
+
using_key=using_key,
|
765
877
|
verbosity=verbosity,
|
766
878
|
organism=organism,
|
767
879
|
)
|
@@ -773,18 +885,22 @@ class Curate:
|
|
773
885
|
data: ad.AnnData | UPathStr,
|
774
886
|
var_index: FieldAttr,
|
775
887
|
categoricals: dict[str, FieldAttr] | None = None,
|
776
|
-
|
888
|
+
obs_columns: FieldAttr = Feature.name,
|
889
|
+
using_key: str = "default",
|
777
890
|
verbosity: str = "hint",
|
778
891
|
organism: str | None = None,
|
892
|
+
sources: dict[str, Record] | None = None,
|
779
893
|
) -> AnnDataCurator:
|
780
894
|
"""{}""" # noqa: D415
|
781
895
|
return AnnDataCurator(
|
782
896
|
data=data,
|
783
897
|
var_index=var_index,
|
784
898
|
categoricals=categoricals,
|
785
|
-
|
899
|
+
obs_columns=obs_columns,
|
900
|
+
using_key=using_key,
|
786
901
|
verbosity=verbosity,
|
787
902
|
organism=organism,
|
903
|
+
sources=sources,
|
788
904
|
)
|
789
905
|
|
790
906
|
@classmethod
|
@@ -794,7 +910,7 @@ class Curate:
|
|
794
910
|
mdata: MuData,
|
795
911
|
var_index: dict[str, dict[str, FieldAttr]],
|
796
912
|
categoricals: dict[str, FieldAttr] | None = None,
|
797
|
-
|
913
|
+
using_key: str = "default",
|
798
914
|
verbosity: str = "hint",
|
799
915
|
organism: str | None = None,
|
800
916
|
) -> MuDataCurator:
|
@@ -803,29 +919,68 @@ class Curate:
|
|
803
919
|
mdata=mdata,
|
804
920
|
var_index=var_index,
|
805
921
|
categoricals=categoricals,
|
806
|
-
|
922
|
+
using_key=using_key,
|
807
923
|
verbosity=verbosity,
|
808
924
|
organism=organism,
|
809
925
|
)
|
810
926
|
|
811
927
|
|
812
|
-
def get_registry_instance(registry: Record,
|
928
|
+
def get_registry_instance(registry: Record, using_key: str | None = None) -> Record:
|
813
929
|
"""Get a registry instance using a specific instance."""
|
814
|
-
if
|
815
|
-
return registry.using(
|
930
|
+
if using_key is not None and using_key != "default":
|
931
|
+
return registry.using(using_key)
|
816
932
|
return registry
|
817
933
|
|
818
934
|
|
935
|
+
def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
|
936
|
+
"""Make sure the source and organism are saved in the same database as the registry."""
|
937
|
+
from lamindb.core._settings import settings
|
938
|
+
|
939
|
+
db = registry.filter().db
|
940
|
+
source = kwargs.get("source")
|
941
|
+
organism = kwargs.get("organism")
|
942
|
+
filter_kwargs = kwargs.copy()
|
943
|
+
try:
|
944
|
+
verbosity = settings.verbosity
|
945
|
+
settings.verbosity = "error"
|
946
|
+
if isinstance(organism, Record) and organism._state.db != "default":
|
947
|
+
if db is None or db == "default":
|
948
|
+
organism_default = copy.copy(organism)
|
949
|
+
# save the organism record in the default database
|
950
|
+
organism_default.save()
|
951
|
+
filter_kwargs["organism"] = organism_default
|
952
|
+
if isinstance(source, Record) and source._state.db != "default":
|
953
|
+
if db is None or db == "default":
|
954
|
+
source_default = copy.copy(source)
|
955
|
+
# save the source record in the default database
|
956
|
+
source_default.save()
|
957
|
+
filter_kwargs["source"] = source_default
|
958
|
+
finally:
|
959
|
+
settings.verbosity = verbosity
|
960
|
+
return filter_kwargs
|
961
|
+
|
962
|
+
|
819
963
|
def standardize_and_inspect(
|
820
|
-
values: Iterable[str],
|
964
|
+
values: Iterable[str],
|
965
|
+
field: FieldAttr,
|
966
|
+
registry: type[Record],
|
967
|
+
standardize: bool = False,
|
968
|
+
**kwargs,
|
821
969
|
):
|
822
970
|
"""Standardize and inspect values using a registry."""
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
971
|
+
filter_kwargs = get_current_filter_kwargs(registry, kwargs)
|
972
|
+
|
973
|
+
if standardize:
|
974
|
+
if hasattr(registry, "standardize") and hasattr(
|
975
|
+
registry,
|
976
|
+
"synonyms", # https://github.com/laminlabs/lamindb/issues/1685
|
977
|
+
):
|
978
|
+
standardized_values = registry.standardize(
|
979
|
+
values, field=field, mute=True, **filter_kwargs
|
980
|
+
)
|
981
|
+
values = standardized_values
|
982
|
+
|
983
|
+
return registry.inspect(values, field=field, mute=True, **filter_kwargs)
|
829
984
|
|
830
985
|
|
831
986
|
def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
|
@@ -846,10 +1001,26 @@ def validate_categories(
|
|
846
1001
|
values: Iterable[str],
|
847
1002
|
field: FieldAttr,
|
848
1003
|
key: str,
|
849
|
-
|
1004
|
+
using_key: str | None = None,
|
850
1005
|
organism: str | None = None,
|
851
|
-
|
852
|
-
|
1006
|
+
source: Record | None = None,
|
1007
|
+
exclude: str | list | None = None,
|
1008
|
+
standardize: bool = True,
|
1009
|
+
validated_hint_print: str | None = None,
|
1010
|
+
) -> tuple[bool, list]:
|
1011
|
+
"""Validate ontology terms in a pandas series using LaminDB registries.
|
1012
|
+
|
1013
|
+
Args:
|
1014
|
+
values: The values to validate.
|
1015
|
+
field: The field attribute.
|
1016
|
+
key: The key referencing the slot in the DataFrame.
|
1017
|
+
using_key: A reference LaminDB instance.
|
1018
|
+
organism: The organism name.
|
1019
|
+
source: The source record.
|
1020
|
+
exclude: Exclude specific values.
|
1021
|
+
standardize: Standardize the values.
|
1022
|
+
validated_hint_print: The hint to print for validated values.
|
1023
|
+
"""
|
853
1024
|
from lamindb._from_values import _print_values
|
854
1025
|
from lamindb.core._settings import settings
|
855
1026
|
|
@@ -861,42 +1032,60 @@ def validate_categories(
|
|
861
1032
|
logger.indent = " "
|
862
1033
|
|
863
1034
|
registry = field.field.model
|
864
|
-
|
1035
|
+
kwargs = check_registry_organism(registry, organism)
|
1036
|
+
kwargs.update({"source": source} if source else {})
|
1037
|
+
|
1038
|
+
# inspect the default instance
|
1039
|
+
if exclude is not None:
|
1040
|
+
exclude = [exclude] if isinstance(exclude, str) else exclude
|
1041
|
+
# exclude values are validated without source and organism
|
1042
|
+
inspect_result = registry.inspect(exclude, field=field, mute=True)
|
1043
|
+
# if exclude values are validated, remove them from the values
|
1044
|
+
values = [i for i in values if i not in inspect_result.validated]
|
865
1045
|
|
866
|
-
# Inspect the default instance
|
867
1046
|
inspect_result = standardize_and_inspect(
|
868
|
-
values=values,
|
1047
|
+
values=values,
|
1048
|
+
field=field,
|
1049
|
+
registry=registry,
|
1050
|
+
standardize=standardize,
|
1051
|
+
**kwargs,
|
869
1052
|
)
|
870
1053
|
non_validated = inspect_result.non_validated
|
871
1054
|
|
872
1055
|
values_validated = []
|
873
|
-
if
|
874
|
-
|
875
|
-
#
|
1056
|
+
if using_key is not None and using_key != "default" and non_validated:
|
1057
|
+
registry_using = get_registry_instance(registry, using_key)
|
1058
|
+
# inspect the using instance
|
876
1059
|
inspect_result = standardize_and_inspect(
|
877
|
-
values=non_validated,
|
1060
|
+
values=non_validated,
|
1061
|
+
field=field,
|
1062
|
+
registry=registry_using,
|
1063
|
+
standardize=standardize,
|
1064
|
+
**kwargs,
|
878
1065
|
)
|
879
1066
|
non_validated = inspect_result.non_validated
|
880
1067
|
values_validated += inspect_result.validated
|
881
1068
|
|
882
|
-
#
|
1069
|
+
# inspect from public (bionty only)
|
883
1070
|
if hasattr(registry, "public"):
|
884
1071
|
verbosity = settings.verbosity
|
885
1072
|
try:
|
886
1073
|
settings.verbosity = "error"
|
887
1074
|
public_records = registry.from_values(
|
888
|
-
non_validated,
|
1075
|
+
non_validated,
|
1076
|
+
field=field,
|
1077
|
+
**get_current_filter_kwargs(registry, kwargs),
|
889
1078
|
)
|
890
1079
|
values_validated += [getattr(r, field.field.name) for r in public_records]
|
891
1080
|
finally:
|
892
1081
|
settings.verbosity = verbosity
|
893
1082
|
|
894
|
-
validated_hint_print = f".add_validated_from('{key}')"
|
1083
|
+
validated_hint_print = validated_hint_print or f".add_validated_from('{key}')"
|
895
1084
|
n_validated = len(values_validated)
|
896
1085
|
if n_validated > 0:
|
897
1086
|
_log_mapping_info()
|
898
1087
|
logger.warning(
|
899
|
-
f"found {colors.yellow(
|
1088
|
+
f"found {colors.yellow(n_validated)} validated terms: "
|
900
1089
|
f"{colors.yellow(values_validated)}\n → save terms via "
|
901
1090
|
f"{colors.yellow(validated_hint_print)}"
|
902
1091
|
)
|
@@ -907,39 +1096,49 @@ def validate_categories(
|
|
907
1096
|
if n_non_validated == 0:
|
908
1097
|
logger.indent = ""
|
909
1098
|
logger.success(f"{key} is validated against {colors.italic(model_field)}")
|
910
|
-
return True
|
1099
|
+
return True, []
|
911
1100
|
else:
|
912
1101
|
are = "are" if n_non_validated > 1 else "is"
|
913
1102
|
print_values = _print_values(non_validated)
|
914
1103
|
warning_message = (
|
915
|
-
f"{colors.
|
916
|
-
f"{colors.
|
917
|
-
f"{colors.
|
1104
|
+
f"{colors.red(f'{n_non_validated} terms')} {are} not validated: "
|
1105
|
+
f"{colors.red(print_values)}\n → save terms via "
|
1106
|
+
f"{colors.red(non_validated_hint_print)}"
|
918
1107
|
)
|
919
1108
|
if logger.indent == "":
|
920
1109
|
_log_mapping_info()
|
921
1110
|
logger.warning(warning_message)
|
922
1111
|
logger.indent = ""
|
923
|
-
return False
|
1112
|
+
return False, non_validated
|
924
1113
|
|
925
1114
|
|
926
1115
|
def validate_categories_in_df(
|
927
1116
|
df: pd.DataFrame,
|
928
1117
|
fields: dict[str, FieldAttr],
|
929
|
-
|
1118
|
+
using_key: str | None = None,
|
1119
|
+
sources: dict[str, Record] = None,
|
1120
|
+
exclude: dict | None = None,
|
930
1121
|
**kwargs,
|
931
|
-
) -> bool:
|
1122
|
+
) -> tuple[bool, dict]:
|
932
1123
|
"""Validate categories in DataFrame columns using LaminDB registries."""
|
1124
|
+
if sources is None:
|
1125
|
+
sources = {}
|
933
1126
|
validated = True
|
1127
|
+
non_validated = {}
|
934
1128
|
for key, field in fields.items():
|
935
|
-
|
1129
|
+
is_val, non_val = validate_categories(
|
936
1130
|
df[key],
|
937
1131
|
field=field,
|
938
1132
|
key=key,
|
939
|
-
|
1133
|
+
using_key=using_key,
|
1134
|
+
source=sources.get(key),
|
1135
|
+
exclude=exclude.get(key) if exclude else None,
|
940
1136
|
**kwargs,
|
941
1137
|
)
|
942
|
-
|
1138
|
+
validated &= is_val
|
1139
|
+
if len(non_val) > 0:
|
1140
|
+
non_validated[key] = non_val
|
1141
|
+
return validated, non_validated
|
943
1142
|
|
944
1143
|
|
945
1144
|
def save_artifact(
|
@@ -998,13 +1197,13 @@ def save_artifact(
|
|
998
1197
|
organism,
|
999
1198
|
)
|
1000
1199
|
|
1001
|
-
if artifact.
|
1200
|
+
if artifact._accessor == "DataFrame":
|
1002
1201
|
artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
|
1003
|
-
elif artifact.
|
1202
|
+
elif artifact._accessor == "AnnData":
|
1004
1203
|
artifact.features._add_set_from_anndata(
|
1005
1204
|
var_field=columns_field, **feature_kwargs
|
1006
1205
|
)
|
1007
|
-
elif artifact.
|
1206
|
+
elif artifact._accessor == "MuData":
|
1008
1207
|
artifact.features._add_set_from_mudata(
|
1009
1208
|
var_fields=columns_field, **feature_kwargs
|
1010
1209
|
)
|
@@ -1017,11 +1216,16 @@ def save_artifact(
|
|
1017
1216
|
feature = features.get(key)
|
1018
1217
|
registry = field.field.model
|
1019
1218
|
filter_kwargs = check_registry_organism(registry, organism)
|
1219
|
+
filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
|
1020
1220
|
df = data if isinstance(data, pd.DataFrame) else data.obs
|
1021
|
-
labels = registry.from_values(
|
1221
|
+
labels = registry.from_values(
|
1222
|
+
df[key],
|
1223
|
+
field=field,
|
1224
|
+
**filter_kwargs_current,
|
1225
|
+
)
|
1022
1226
|
artifact.labels.add(labels, feature)
|
1023
1227
|
|
1024
|
-
if artifact.
|
1228
|
+
if artifact._accessor == "MuData":
|
1025
1229
|
for modality, modality_fields in fields.items():
|
1026
1230
|
if modality == "obs":
|
1027
1231
|
_add_labels(data, artifact, modality_fields)
|
@@ -1041,25 +1245,29 @@ def update_registry(
|
|
1041
1245
|
field: FieldAttr,
|
1042
1246
|
key: str,
|
1043
1247
|
save_function: str = "add_new_from",
|
1044
|
-
|
1248
|
+
using_key: str | None = None,
|
1045
1249
|
validated_only: bool = True,
|
1046
1250
|
df: pd.DataFrame | None = None,
|
1047
1251
|
organism: str | None = None,
|
1048
1252
|
dtype: str | None = None,
|
1253
|
+
source: Record | None = None,
|
1254
|
+
standardize: bool = True,
|
1255
|
+
warning: bool = True,
|
1049
1256
|
**kwargs,
|
1050
|
-
) ->
|
1051
|
-
"""Save features or labels records in the default instance from the
|
1257
|
+
) -> None:
|
1258
|
+
"""Save features or labels records in the default instance from the using_key instance.
|
1052
1259
|
|
1053
1260
|
Args:
|
1054
1261
|
values: A list of values to be saved as labels.
|
1055
1262
|
field: The FieldAttr object representing the field for which labels are being saved.
|
1056
1263
|
key: The name of the feature to save.
|
1057
1264
|
save_function: The name of the function to save the labels.
|
1058
|
-
|
1265
|
+
using_key: The name of the instance from which to transfer labels (if applicable).
|
1059
1266
|
validated_only: If True, only save validated labels.
|
1060
1267
|
df: A DataFrame to save labels from.
|
1061
1268
|
organism: The organism name.
|
1062
1269
|
dtype: The type of the feature.
|
1270
|
+
source: The source record.
|
1063
1271
|
kwargs: Additional keyword arguments to pass to the registry model to create new records.
|
1064
1272
|
"""
|
1065
1273
|
from lamindb._save import save as ln_save
|
@@ -1067,51 +1275,79 @@ def update_registry(
|
|
1067
1275
|
|
1068
1276
|
registry = field.field.model
|
1069
1277
|
filter_kwargs = check_registry_organism(registry, organism)
|
1278
|
+
filter_kwargs.update({"source": source} if source else {})
|
1070
1279
|
|
1071
1280
|
verbosity = settings.verbosity
|
1072
1281
|
try:
|
1073
1282
|
settings.verbosity = "error"
|
1283
|
+
|
1284
|
+
# save from public
|
1285
|
+
filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
|
1286
|
+
existing_and_public_records = (
|
1287
|
+
registry.from_values(
|
1288
|
+
list(values),
|
1289
|
+
field=field,
|
1290
|
+
**filter_kwargs_current,
|
1291
|
+
)
|
1292
|
+
if values
|
1293
|
+
else []
|
1294
|
+
)
|
1295
|
+
|
1296
|
+
labels_saved: dict = {"from public": [], "without reference": []}
|
1297
|
+
|
1298
|
+
public_records = [r for r in existing_and_public_records if r._state.adding]
|
1299
|
+
# here we check to only save the public records if they are from the specified source
|
1300
|
+
# we compare uids because r.source and source can come from different instances
|
1301
|
+
if source:
|
1302
|
+
public_records = [r for r in public_records if r.source.uid == source.uid]
|
1303
|
+
ln_save(public_records)
|
1304
|
+
labels_saved["from public"] = [
|
1305
|
+
getattr(r, field.field.name) for r in public_records
|
1306
|
+
]
|
1307
|
+
non_public_labels = [i for i in values if i not in labels_saved["from public"]]
|
1308
|
+
|
1309
|
+
# inspect the default instance
|
1074
1310
|
inspect_result_current = standardize_and_inspect(
|
1075
|
-
values=
|
1311
|
+
values=non_public_labels,
|
1312
|
+
field=field,
|
1313
|
+
registry=registry,
|
1314
|
+
standardize=standardize,
|
1315
|
+
**filter_kwargs,
|
1076
1316
|
)
|
1077
1317
|
if not inspect_result_current.non_validated:
|
1078
1318
|
all_labels = registry.from_values(
|
1079
|
-
inspect_result_current.validated,
|
1319
|
+
inspect_result_current.validated,
|
1320
|
+
field=field,
|
1321
|
+
**filter_kwargs_current,
|
1080
1322
|
)
|
1081
1323
|
settings.verbosity = verbosity
|
1082
1324
|
return all_labels
|
1083
1325
|
|
1084
|
-
|
1085
|
-
|
1326
|
+
# inspect the using_key instance
|
1086
1327
|
(
|
1087
|
-
labels_saved[f"from {
|
1328
|
+
labels_saved[f"from {using_key}"],
|
1088
1329
|
non_validated_labels,
|
1089
1330
|
) = update_registry_from_using_instance(
|
1090
1331
|
inspect_result_current.non_validated,
|
1091
1332
|
field=field,
|
1092
|
-
|
1333
|
+
using_key=using_key,
|
1093
1334
|
**filter_kwargs,
|
1094
1335
|
)
|
1095
1336
|
|
1096
|
-
public_records = (
|
1097
|
-
registry.from_values(non_validated_labels, field=field, **filter_kwargs)
|
1098
|
-
if non_validated_labels
|
1099
|
-
else []
|
1100
|
-
)
|
1101
|
-
ln_save(public_records)
|
1102
|
-
labels_saved["from public"] = [
|
1103
|
-
getattr(r, field.field.name) for r in public_records
|
1104
|
-
]
|
1105
1337
|
labels_saved["without reference"] = [
|
1106
|
-
i
|
1338
|
+
i
|
1339
|
+
for i in non_validated_labels
|
1340
|
+
if i not in labels_saved[f"from {using_key}"]
|
1107
1341
|
]
|
1108
1342
|
|
1343
|
+
# save non-validated records
|
1109
1344
|
if not validated_only:
|
1110
1345
|
non_validated_records = []
|
1111
1346
|
if df is not None and registry == Feature:
|
1112
1347
|
non_validated_records = Feature.from_df(df)
|
1113
1348
|
else:
|
1114
1349
|
if "organism" in filter_kwargs:
|
1350
|
+
# make sure organism record is saved to the current instance
|
1115
1351
|
filter_kwargs["organism"] = _save_organism(name=organism)
|
1116
1352
|
init_kwargs = {}
|
1117
1353
|
for value in labels_saved["without reference"]:
|
@@ -1119,19 +1355,24 @@ def update_registry(
|
|
1119
1355
|
if registry == Feature:
|
1120
1356
|
init_kwargs["dtype"] = "cat" if dtype is None else dtype
|
1121
1357
|
non_validated_records.append(
|
1122
|
-
registry(
|
1358
|
+
registry(
|
1359
|
+
**init_kwargs,
|
1360
|
+
**{k: v for k, v in filter_kwargs.items() if k != "source"},
|
1361
|
+
**{k: v for k, v in kwargs.items() if k != "sources"},
|
1362
|
+
)
|
1123
1363
|
)
|
1124
1364
|
ln_save(non_validated_records)
|
1125
1365
|
|
1366
|
+
# save parent labels for ulabels
|
1126
1367
|
if registry == ULabel and field.field.name == "name":
|
1127
1368
|
save_ulabels_with_parent(values, field=field, key=key)
|
1128
1369
|
|
1129
|
-
# get all records
|
1130
|
-
all_labels = registry.from_values(
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
)
|
1370
|
+
# # get all records that are now validated in the current instance
|
1371
|
+
# all_labels = registry.from_values(
|
1372
|
+
# inspect_result_current.validated + inspect_result_current.non_validated,
|
1373
|
+
# field=field,
|
1374
|
+
# **get_current_filter_kwargs(registry, filter_kwargs),
|
1375
|
+
# )
|
1135
1376
|
finally:
|
1136
1377
|
settings.verbosity = verbosity
|
1137
1378
|
|
@@ -1141,9 +1382,10 @@ def update_registry(
|
|
1141
1382
|
save_function=save_function,
|
1142
1383
|
model_field=f"{registry.__name__}.{field.field.name}",
|
1143
1384
|
validated_only=validated_only,
|
1385
|
+
warning=warning,
|
1144
1386
|
)
|
1145
1387
|
|
1146
|
-
return all_labels
|
1388
|
+
# return all_labels
|
1147
1389
|
|
1148
1390
|
|
1149
1391
|
def log_saved_labels(
|
@@ -1152,6 +1394,7 @@ def log_saved_labels(
|
|
1152
1394
|
save_function: str,
|
1153
1395
|
model_field: str,
|
1154
1396
|
validated_only: bool = True,
|
1397
|
+
warning: bool = True,
|
1155
1398
|
) -> None:
|
1156
1399
|
"""Log the saved labels."""
|
1157
1400
|
from ._from_values import _print_values
|
@@ -1176,7 +1419,10 @@ def log_saved_labels(
|
|
1176
1419
|
if save_function == "add_new_from"
|
1177
1420
|
else f"\n → to save, run {colors.yellow(save_function)}"
|
1178
1421
|
)
|
1179
|
-
|
1422
|
+
if warning:
|
1423
|
+
logger.warning(msg)
|
1424
|
+
else:
|
1425
|
+
logger.info(msg)
|
1180
1426
|
else:
|
1181
1427
|
k = "" if k == "without reference" else f"{colors.green(k)} "
|
1182
1428
|
# the term "transferred" stresses that this is always in the context of transferring
|
@@ -1191,7 +1437,7 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
|
|
1191
1437
|
"""Save a parent label for the given labels."""
|
1192
1438
|
registry = field.field.model
|
1193
1439
|
assert registry == ULabel # noqa: S101
|
1194
|
-
all_records = registry.from_values(values, field=field)
|
1440
|
+
all_records = registry.from_values(list(values), field=field)
|
1195
1441
|
is_feature = registry.filter(name=f"is_{key}").one_or_none()
|
1196
1442
|
if is_feature is None:
|
1197
1443
|
is_feature = registry(name=f"is_{key}")
|
@@ -1202,15 +1448,16 @@ def save_ulabels_with_parent(values: list[str], field: FieldAttr, key: str) -> N
|
|
1202
1448
|
def update_registry_from_using_instance(
|
1203
1449
|
values: list[str],
|
1204
1450
|
field: FieldAttr,
|
1205
|
-
|
1451
|
+
using_key: str | None = None,
|
1452
|
+
standardize: bool = False,
|
1206
1453
|
**kwargs,
|
1207
1454
|
) -> tuple[list[str], list[str]]:
|
1208
|
-
"""Save features or labels records from the
|
1455
|
+
"""Save features or labels records from the using_key instance.
|
1209
1456
|
|
1210
1457
|
Args:
|
1211
1458
|
values: A list of values to be saved as labels.
|
1212
1459
|
field: The FieldAttr object representing the field for which labels are being saved.
|
1213
|
-
|
1460
|
+
using_key: The name of the instance from which to transfer labels (if applicable).
|
1214
1461
|
kwargs: Additional keyword arguments to pass to the registry model.
|
1215
1462
|
|
1216
1463
|
Returns:
|
@@ -1219,11 +1466,15 @@ def update_registry_from_using_instance(
|
|
1219
1466
|
labels_saved = []
|
1220
1467
|
not_saved = values
|
1221
1468
|
|
1222
|
-
if
|
1223
|
-
|
1224
|
-
|
1469
|
+
if using_key is not None and using_key != "default":
|
1470
|
+
registry_using = get_registry_instance(field.field.model, using_key)
|
1471
|
+
|
1225
1472
|
inspect_result_using = standardize_and_inspect(
|
1226
|
-
values=values,
|
1473
|
+
values=values,
|
1474
|
+
field=field,
|
1475
|
+
registry=registry_using,
|
1476
|
+
standardize=standardize,
|
1477
|
+
**kwargs,
|
1227
1478
|
)
|
1228
1479
|
labels_using = registry_using.filter(
|
1229
1480
|
**{f"{field.field.name}__in": inspect_result_using.validated}
|
@@ -1242,7 +1493,7 @@ def _save_organism(name: str): # pragma: no cover
|
|
1242
1493
|
|
1243
1494
|
organism = bt.Organism.filter(name=name).one_or_none()
|
1244
1495
|
if organism is None:
|
1245
|
-
organism = bt.Organism.
|
1496
|
+
organism = bt.Organism.from_source(name=name)
|
1246
1497
|
if organism is None:
|
1247
1498
|
raise ValueError(
|
1248
1499
|
f"Organism '{name}' not found\n"
|