lamindb 0.76.2__py3-none-any.whl → 0.76.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +7 -9
- lamindb/_artifact.py +43 -24
- lamindb/_can_validate.py +20 -4
- lamindb/_curate.py +120 -40
- lamindb/_filter.py +7 -21
- lamindb/_finish.py +97 -81
- lamindb/_query_set.py +67 -34
- lamindb/_record.py +3 -2
- lamindb/_transform.py +1 -2
- lamindb/core/__init__.py +2 -2
- lamindb/core/_context.py +24 -14
- lamindb/core/_label_manager.py +1 -3
- lamindb/core/_mapped_collection.py +31 -1
- lamindb/core/exceptions.py +1 -1
- lamindb/core/storage/__init__.py +1 -1
- lamindb/core/storage/_anndata_accessor.py +6 -1
- lamindb/core/storage/_tiledbsoma.py +99 -132
- lamindb/core/versioning.py +4 -0
- lamindb/integrations/__init__.py +3 -0
- lamindb/integrations/_vitessce.py +1 -11
- {lamindb-0.76.2.dist-info → lamindb-0.76.4.dist-info}/METADATA +7 -7
- {lamindb-0.76.2.dist-info → lamindb-0.76.4.dist-info}/RECORD +24 -24
- {lamindb-0.76.2.dist-info → lamindb-0.76.4.dist-info}/LICENSE +0 -0
- {lamindb-0.76.2.dist-info → lamindb-0.76.4.dist-info}/WHEEL +0 -0
lamindb/__init__.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
"""A data framework for biology.
|
2
2
|
|
3
|
-
|
4
|
-
=======
|
3
|
+
Core registries.
|
5
4
|
|
6
5
|
.. autosummary::
|
7
6
|
:toctree: .
|
@@ -17,20 +16,18 @@ Records
|
|
17
16
|
FeatureSet
|
18
17
|
Param
|
19
18
|
|
20
|
-
Key functionality
|
21
|
-
=================
|
19
|
+
Key functionality.
|
22
20
|
|
23
21
|
.. autosummary::
|
24
22
|
:toctree: .
|
25
23
|
|
26
24
|
context
|
27
25
|
connect
|
28
|
-
|
26
|
+
Curator
|
29
27
|
view
|
30
28
|
save
|
31
29
|
|
32
|
-
Modules
|
33
|
-
==================
|
30
|
+
Modules and settings.
|
34
31
|
|
35
32
|
.. autosummary::
|
36
33
|
:toctree: .
|
@@ -44,7 +41,7 @@ Modules & settings
|
|
44
41
|
"""
|
45
42
|
|
46
43
|
# denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
|
47
|
-
__version__ = "0.76.
|
44
|
+
__version__ = "0.76.4"
|
48
45
|
|
49
46
|
import os as _os
|
50
47
|
|
@@ -94,7 +91,7 @@ if _check_instance_setup(from_lamindb=True):
|
|
94
91
|
_ulabel,
|
95
92
|
integrations,
|
96
93
|
)
|
97
|
-
from ._curate import
|
94
|
+
from ._curate import Curator
|
98
95
|
from ._save import save
|
99
96
|
from ._view import view
|
100
97
|
from .core._context import context
|
@@ -110,6 +107,7 @@ if _check_instance_setup(from_lamindb=True):
|
|
110
107
|
|
111
108
|
track = context.track # backward compat
|
112
109
|
finish = context.finish # backward compat
|
110
|
+
Curate = Curator # backward compat
|
113
111
|
settings.__doc__ = """Global :class:`~lamindb.core.Settings`."""
|
114
112
|
context.__doc__ = """Global :class:`~lamindb.core.Context`."""
|
115
113
|
from django.db.models import Q
|
lamindb/_artifact.py
CHANGED
@@ -366,11 +366,6 @@ def get_artifact_kwargs_from_data(
|
|
366
366
|
else:
|
367
367
|
storage = default_storage
|
368
368
|
|
369
|
-
# for now comment out this error to allow creating new versions of stores
|
370
|
-
# in the default folder (.lamindb)
|
371
|
-
# if key is not None and key.startswith(AUTO_KEY_PREFIX):
|
372
|
-
# raise ValueError(f"Key cannot start with {AUTO_KEY_PREFIX}")
|
373
|
-
|
374
369
|
log_storage_hint(
|
375
370
|
check_path_in_storage=check_path_in_storage,
|
376
371
|
storage=storage,
|
@@ -542,6 +537,7 @@ def __init__(artifact: Artifact, *args, **kwargs):
|
|
542
537
|
else VisibilityChoice.default.value
|
543
538
|
)
|
544
539
|
format = kwargs.pop("format") if "format" in kwargs else None
|
540
|
+
_is_internal_call = kwargs.pop("_is_internal_call", False)
|
545
541
|
skip_check_exists = (
|
546
542
|
kwargs.pop("skip_check_exists") if "skip_check_exists" in kwargs else False
|
547
543
|
)
|
@@ -575,13 +571,29 @@ def __init__(artifact: Artifact, *args, **kwargs):
|
|
575
571
|
raise ValueError(
|
576
572
|
f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
|
577
573
|
)
|
578
|
-
|
579
|
-
provisional_uid, revises = create_uid(revises=revises, version=version)
|
580
574
|
if revises is not None:
|
581
575
|
if not isinstance(revises, Artifact):
|
582
576
|
raise TypeError("`revises` has to be of type `Artifact`")
|
583
577
|
if description is None:
|
584
578
|
description = revises.description
|
579
|
+
if key is not None and AUTO_KEY_PREFIX in key:
|
580
|
+
raise ValueError(
|
581
|
+
f"Do not pass key that contains a managed storage path in `{AUTO_KEY_PREFIX}`"
|
582
|
+
)
|
583
|
+
# below is for internal calls that require defining the storage location
|
584
|
+
# ahead of constructing the Artifact
|
585
|
+
if isinstance(data, (str, Path)) and AUTO_KEY_PREFIX in str(data):
|
586
|
+
if _is_internal_call:
|
587
|
+
is_automanaged_path = True
|
588
|
+
user_provided_key = key
|
589
|
+
key = None
|
590
|
+
else:
|
591
|
+
raise ValueError(
|
592
|
+
f"Do not pass path inside the `{AUTO_KEY_PREFIX}` directory."
|
593
|
+
)
|
594
|
+
else:
|
595
|
+
is_automanaged_path = False
|
596
|
+
provisional_uid, revises = create_uid(revises=revises, version=version)
|
585
597
|
kwargs_or_artifact, privates = get_artifact_kwargs_from_data(
|
586
598
|
data=data,
|
587
599
|
key=key,
|
@@ -609,16 +621,29 @@ def __init__(artifact: Artifact, *args, **kwargs):
|
|
609
621
|
else:
|
610
622
|
kwargs = kwargs_or_artifact
|
611
623
|
|
624
|
+
if data is not None:
|
625
|
+
artifact._local_filepath = privates["local_filepath"]
|
626
|
+
artifact._cloud_filepath = privates["cloud_filepath"]
|
627
|
+
artifact._memory_rep = privates["memory_rep"]
|
628
|
+
artifact._to_store = not privates["check_path_in_storage"]
|
629
|
+
|
630
|
+
if is_automanaged_path and _is_internal_call:
|
631
|
+
kwargs["_key_is_virtual"] = True
|
632
|
+
assert AUTO_KEY_PREFIX in kwargs["key"] # noqa: S101
|
633
|
+
uid = kwargs["key"].replace(AUTO_KEY_PREFIX, "").replace(kwargs["suffix"], "")
|
634
|
+
kwargs["key"] = user_provided_key
|
635
|
+
if revises is not None:
|
636
|
+
assert uid.startswith(revises.stem_uid) # noqa: S101
|
637
|
+
if len(uid) == 16:
|
638
|
+
if revises is None:
|
639
|
+
uid += "0000"
|
640
|
+
else:
|
641
|
+
uid, revises = create_uid(revises=revises, version=version)
|
642
|
+
kwargs["uid"] = uid
|
643
|
+
|
612
644
|
# only set key now so that we don't do a look-up on it in case revises is passed
|
613
645
|
if revises is not None:
|
614
646
|
kwargs["key"] = revises.key
|
615
|
-
# in case we have a new version of a folder with a different hash, print a
|
616
|
-
# warning that the old version can't be recovered
|
617
|
-
if revises is not None and revises.n_objects is not None and revises.n_objects > 1:
|
618
|
-
logger.warning(
|
619
|
-
f"artifact version {version} will _update_ the state of folder {revises.path} - "
|
620
|
-
"to _retain_ the old state by duplicating the entire folder, do _not_ pass `revises`"
|
621
|
-
)
|
622
647
|
|
623
648
|
kwargs["type"] = type
|
624
649
|
kwargs["version"] = version
|
@@ -637,12 +662,6 @@ def __init__(artifact: Artifact, *args, **kwargs):
|
|
637
662
|
|
638
663
|
add_transform_to_kwargs(kwargs, kwargs["run"])
|
639
664
|
|
640
|
-
if data is not None:
|
641
|
-
artifact._local_filepath = privates["local_filepath"]
|
642
|
-
artifact._cloud_filepath = privates["cloud_filepath"]
|
643
|
-
artifact._memory_rep = privates["memory_rep"]
|
644
|
-
artifact._to_store = not privates["check_path_in_storage"]
|
645
|
-
|
646
665
|
super(Artifact, artifact).__init__(**kwargs)
|
647
666
|
|
648
667
|
|
@@ -937,10 +956,9 @@ def open(
|
|
937
956
|
if self.hash != hash:
|
938
957
|
from ._record import init_self_from_db
|
939
958
|
|
940
|
-
|
941
|
-
|
942
|
-
)
|
943
|
-
new_version = Artifact(filepath, revises=self).save()
|
959
|
+
new_version = Artifact(
|
960
|
+
filepath, revises=self, _is_internal_call=True
|
961
|
+
).save()
|
944
962
|
init_self_from_db(self, new_version)
|
945
963
|
|
946
964
|
if localpath != filepath and localpath.exists():
|
@@ -1168,3 +1186,4 @@ Artifact._delete_skip_storage = _delete_skip_storage
|
|
1168
1186
|
Artifact._save_skip_storage = _save_skip_storage
|
1169
1187
|
Artifact.path = path
|
1170
1188
|
Artifact.backed = backed
|
1189
|
+
Artifact.view_lineage = HasFeatures.view_lineage
|
lamindb/_can_validate.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING,
|
3
|
+
from typing import TYPE_CHECKING, Literal
|
4
4
|
|
5
5
|
import lamindb_setup as ln_setup
|
6
6
|
import numpy as np
|
@@ -79,6 +79,19 @@ def _check_organism_db(organism: Record, using_key: str | None):
|
|
79
79
|
)
|
80
80
|
|
81
81
|
|
82
|
+
def _concat_lists(values: ListLike) -> list[str]:
    """Flatten a list (or Series) of lists of strings into a single flat list.

    Only ``list`` and ``pd.Series`` inputs whose *first* element is a list are
    flattened; every other input is returned unchanged. When flattening,
    elements that are not lists themselves are dropped.

    Args:
        values: Flat values, or a list / Series whose elements are lists.

    Returns:
        A new flat list if the input was nested, otherwise ``values`` itself.
    """
    if len(values) == 0 or not isinstance(values, (list, pd.Series)):
        return values
    # Use positional access for a Series: `values[0]` would be a *label*
    # lookup and raise KeyError for any index that does not contain 0
    # (e.g. a filtered DataFrame column), which previously caused nested
    # lists to be silently left unflattened.
    first = values.iloc[0] if isinstance(values, pd.Series) else values[0]
    if not isinstance(first, list):
        return values
    # Single linear pass instead of the quadratic `sum(list_of_lists, [])`.
    return [item for sub in values if isinstance(sub, list) for item in sub]
|
93
|
+
|
94
|
+
|
82
95
|
def _inspect(
|
83
96
|
cls,
|
84
97
|
values: ListLike,
|
@@ -94,6 +107,7 @@ def _inspect(
|
|
94
107
|
|
95
108
|
if isinstance(values, str):
|
96
109
|
values = [values]
|
110
|
+
values = _concat_lists(values)
|
97
111
|
|
98
112
|
field = get_name_field(cls, field=field)
|
99
113
|
queryset = _queryset(cls, using_key)
|
@@ -184,6 +198,7 @@ def _validate(
|
|
184
198
|
return_str = True if isinstance(values, str) else False
|
185
199
|
if isinstance(values, str):
|
186
200
|
values = [values]
|
201
|
+
values = _concat_lists(values)
|
187
202
|
|
188
203
|
field = get_name_field(cls, field=field)
|
189
204
|
|
@@ -229,7 +244,7 @@ def _validate(
|
|
229
244
|
@doc_args(CanValidate.standardize.__doc__)
|
230
245
|
def standardize(
|
231
246
|
cls,
|
232
|
-
values:
|
247
|
+
values: ListLike,
|
233
248
|
field: str | StrField | None = None,
|
234
249
|
*,
|
235
250
|
return_field: str = None,
|
@@ -295,7 +310,7 @@ def remove_synonym(self, synonym: str | ListLike):
|
|
295
310
|
|
296
311
|
def _standardize(
|
297
312
|
cls,
|
298
|
-
values:
|
313
|
+
values: ListLike,
|
299
314
|
field: str | StrField | None = None,
|
300
315
|
*,
|
301
316
|
return_field: str = None,
|
@@ -315,6 +330,7 @@ def _standardize(
|
|
315
330
|
return_str = True if isinstance(values, str) else False
|
316
331
|
if isinstance(values, str):
|
317
332
|
values = [values]
|
333
|
+
values = _concat_lists(values)
|
318
334
|
|
319
335
|
field = get_name_field(cls, field=field)
|
320
336
|
return_field = get_name_field(
|
@@ -416,7 +432,7 @@ def _standardize(
|
|
416
432
|
|
417
433
|
|
418
434
|
def _add_or_remove_synonyms(
|
419
|
-
synonym: str |
|
435
|
+
synonym: str | ListLike,
|
420
436
|
record: Record,
|
421
437
|
action: Literal["add", "remove"],
|
422
438
|
force: bool = False,
|
lamindb/_curate.py
CHANGED
@@ -84,10 +84,34 @@ class CurateLookup:
|
|
84
84
|
return colors.warning("No fields are found!")
|
85
85
|
|
86
86
|
|
87
|
-
class
|
87
|
+
class BaseCurator:
    """Common interface shared by the dataset curators."""

    def validate(self) -> bool:
        """Validate the dataset.

        This base implementation is a placeholder: it performs no validation
        and returns ``None``.

        Returns:
            Boolean indicating whether the dataset is validated.
        """
        return None

    def save_artifact(self, description: str | None = None, **kwargs) -> Artifact:
        """Save the dataset as an artifact.

        This base implementation is a placeholder: it saves nothing and
        returns ``None``.

        Args:
            description: Description of the dataset object.
            **kwargs: Object level metadata.

        Returns:
            A saved artifact record.
        """
        return None
|
109
|
+
|
110
|
+
|
111
|
+
class DataFrameCurator(BaseCurator):
|
88
112
|
"""Curation flow for a DataFrame object.
|
89
113
|
|
90
|
-
See also :class:`~lamindb.
|
114
|
+
See also :class:`~lamindb.Curator`.
|
91
115
|
|
92
116
|
Args:
|
93
117
|
df: The DataFrame object to curate.
|
@@ -101,7 +125,7 @@ class DataFrameCurator:
|
|
101
125
|
|
102
126
|
Examples:
|
103
127
|
>>> import bionty as bt
|
104
|
-
>>> curate = ln.
|
128
|
+
>>> curate = ln.Curator.from_df(
|
105
129
|
... df,
|
106
130
|
... categoricals={
|
107
131
|
... "cell_type_ontology_id": bt.CellType.ontology_id,
|
@@ -120,6 +144,7 @@ class DataFrameCurator:
|
|
120
144
|
organism: str | None = None,
|
121
145
|
sources: dict[str, Record] | None = None,
|
122
146
|
exclude: dict | None = None,
|
147
|
+
check_valid_keys: bool = True,
|
123
148
|
) -> None:
|
124
149
|
from lamindb.core._settings import settings
|
125
150
|
|
@@ -139,6 +164,8 @@ class DataFrameCurator:
|
|
139
164
|
exclude = {}
|
140
165
|
self._exclude = exclude
|
141
166
|
self._non_validated = None
|
167
|
+
if check_valid_keys:
|
168
|
+
self._check_valid_keys()
|
142
169
|
self._save_columns()
|
143
170
|
|
144
171
|
@property
|
@@ -167,14 +194,25 @@ class DataFrameCurator:
|
|
167
194
|
using_key=using_key or self._using_key,
|
168
195
|
)
|
169
196
|
|
197
|
+
def _check_valid_keys(self, extra: set | None = None) -> None:
    """Ensure `categoricals`, `sources` and `exclude` only use allowed keys.

    A key is allowed if it is a column of the curated DataFrame, the special
    key ``"columns"``, or a member of ``extra``.

    Args:
        extra: Additional keys to accept on top of the DataFrame columns
            and ``"columns"``.

    Raises:
        TypeError: If one of the three mappings is not a dictionary.
        ValueError: If a mapping contains a key that is not allowed.
    """
    if extra is None:
        extra = set()
    mappings = {
        "categoricals": self._fields,
        "sources": self._sources,
        "exclude": self._exclude,
    }
    # The set of allowed keys is the same for every mapping, so it is built
    # lazily once instead of on each loop iteration (lazily so that a
    # non-dict first mapping still raises TypeError before the DataFrame is
    # touched, as before).
    valid_keys = None
    for name, d in mappings.items():
        if not isinstance(d, dict):
            raise TypeError(f"{name} must be a dictionary!")
        if valid_keys is None:
            valid_keys = set(self._df.columns) | {"columns"} | extra
        nonval_keys = [key for key in d if key not in valid_keys]
        if nonval_keys:
            raise ValueError(
                f"the following keys passed to {name} are not allowed: {nonval_keys}"
            )
|
213
|
+
|
170
214
|
def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
|
171
215
|
"""Save column name records."""
|
172
|
-
missing_columns = set(self.fields.keys()) - set(self._df.columns)
|
173
|
-
if missing_columns:
|
174
|
-
raise ValueError(
|
175
|
-
f"Columns {missing_columns} are not found in the data object!"
|
176
|
-
)
|
177
|
-
|
178
216
|
# Always save features specified as the fields keys
|
179
217
|
update_registry(
|
180
218
|
values=list(self.fields.keys()),
|
@@ -184,6 +222,7 @@ class DataFrameCurator:
|
|
184
222
|
using_key=self._using_key,
|
185
223
|
validated_only=False,
|
186
224
|
source=self._sources.get("columns"),
|
225
|
+
exclude=self._exclude.get("columns"),
|
187
226
|
**kwargs,
|
188
227
|
)
|
189
228
|
|
@@ -199,6 +238,7 @@ class DataFrameCurator:
|
|
199
238
|
validated_only=validated_only,
|
200
239
|
df=self._df, # Get the Feature type from df
|
201
240
|
source=self._sources.get("columns"),
|
241
|
+
exclude=self._exclude.get("columns"),
|
202
242
|
warning=False, # Do not warn about missing columns, just an info message
|
203
243
|
**kwargs,
|
204
244
|
)
|
@@ -251,6 +291,7 @@ class DataFrameCurator:
|
|
251
291
|
using_key=self._using_key,
|
252
292
|
validated_only=validated_only,
|
253
293
|
source=self._sources.get(categorical),
|
294
|
+
exclude=self._exclude.get(categorical),
|
254
295
|
**kwargs,
|
255
296
|
)
|
256
297
|
|
@@ -330,9 +371,11 @@ class DataFrameCurator:
|
|
330
371
|
class AnnDataCurator(DataFrameCurator):
|
331
372
|
"""Curation flow for ``AnnData``.
|
332
373
|
|
333
|
-
See also :class:`~lamindb.
|
374
|
+
See also :class:`~lamindb.Curator`.
|
375
|
+
|
376
|
+
Note that if genes are removed from the AnnData object, the object should be recreated using :meth:`~lamindb.Curator.from_anndata`.
|
334
377
|
|
335
|
-
|
378
|
+
See :doc:`docs:cellxgene-curate` for instructions on how to curate against a specific cellxgene schema version.
|
336
379
|
|
337
380
|
Args:
|
338
381
|
data: The AnnData object or an AnnData-like path.
|
@@ -346,7 +389,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
346
389
|
|
347
390
|
Examples:
|
348
391
|
>>> import bionty as bt
|
349
|
-
>>> curate = ln.
|
392
|
+
>>> curate = ln.Curator.from_anndata(
|
350
393
|
... adata,
|
351
394
|
... var_index=bt.Gene.ensembl_gene_id,
|
352
395
|
... categoricals={
|
@@ -397,8 +440,10 @@ class AnnDataCurator(DataFrameCurator):
|
|
397
440
|
organism=organism,
|
398
441
|
sources=sources,
|
399
442
|
exclude=exclude,
|
443
|
+
check_valid_keys=False,
|
400
444
|
)
|
401
445
|
self._obs_fields = categoricals
|
446
|
+
self._check_valid_keys(extra={"var_index"})
|
402
447
|
|
403
448
|
@property
|
404
449
|
def var_index(self) -> FieldAttr:
|
@@ -437,6 +482,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
437
482
|
validated_only=validated_only,
|
438
483
|
organism=organism,
|
439
484
|
source=self._sources.get("var_index"),
|
485
|
+
exclude=self._exclude.get("var_index"),
|
440
486
|
)
|
441
487
|
|
442
488
|
def _update_registry_all(self, validated_only: bool = True, **kwargs):
|
@@ -536,10 +582,10 @@ class AnnDataCurator(DataFrameCurator):
|
|
536
582
|
class MuDataCurator:
|
537
583
|
"""Curation flow for a ``MuData`` object.
|
538
584
|
|
539
|
-
See also :class:`~lamindb.
|
585
|
+
See also :class:`~lamindb.Curator`.
|
540
586
|
|
541
587
|
Note that if genes or other measurements are removed from the MuData object,
|
542
|
-
the object should be recreated using :meth:`~lamindb.
|
588
|
+
the object should be recreated using :meth:`~lamindb.Curator.from_mudata`.
|
543
589
|
|
544
590
|
Args:
|
545
591
|
mdata: The MuData object to curate.
|
@@ -556,7 +602,7 @@ class MuDataCurator:
|
|
556
602
|
|
557
603
|
Examples:
|
558
604
|
>>> import bionty as bt
|
559
|
-
>>> curate = ln.
|
605
|
+
>>> curate = ln.Curator.from_mudata(
|
560
606
|
... mdata,
|
561
607
|
... var_index={
|
562
608
|
... "rna": bt.Gene.ensembl_gene_id,
|
@@ -603,6 +649,7 @@ class MuDataCurator:
|
|
603
649
|
verbosity=verbosity,
|
604
650
|
sources=self._sources.get(modality),
|
605
651
|
exclude=self._exclude.get(modality),
|
652
|
+
check_valid_keys=False,
|
606
653
|
**self._kwargs,
|
607
654
|
)
|
608
655
|
for modality in self._modalities
|
@@ -641,6 +688,7 @@ class MuDataCurator:
|
|
641
688
|
validated_only=validated_only,
|
642
689
|
dtype="number",
|
643
690
|
source=self._sources.get(modality, {}).get("var_index"),
|
691
|
+
exclude=self._exclude.get(modality, {}).get("var_index"),
|
644
692
|
**kwargs,
|
645
693
|
)
|
646
694
|
|
@@ -704,6 +752,7 @@ class MuDataCurator:
|
|
704
752
|
validated_only=False,
|
705
753
|
df=self._mdata[modality].obs,
|
706
754
|
source=self._sources.get(modality, {}).get("columns"),
|
755
|
+
exclude=self._exclude.get(modality, {}).get("columns"),
|
707
756
|
**self._kwargs, # type: ignore
|
708
757
|
**kwargs,
|
709
758
|
)
|
@@ -789,7 +838,8 @@ class MuDataCurator:
|
|
789
838
|
field=var_field,
|
790
839
|
key=f"{modality}_var_index",
|
791
840
|
using_key=self._using_key,
|
792
|
-
|
841
|
+
source=self._sources.get(modality, {}).get("var_index"),
|
842
|
+
exclude=self._exclude.get(modality, {}).get("var_index"),
|
793
843
|
**self._kwargs, # type: ignore
|
794
844
|
)
|
795
845
|
validated_var &= is_validated_var
|
@@ -846,19 +896,19 @@ class MuDataCurator:
|
|
846
896
|
return self._artifact
|
847
897
|
|
848
898
|
|
849
|
-
class
|
850
|
-
"""
|
899
|
+
class Curator(BaseCurator):
|
900
|
+
"""Dataset curator.
|
851
901
|
|
852
902
|
Data curation entails accurately labeling datasets with standardized metadata
|
853
903
|
to facilitate data integration, interpretation and analysis.
|
854
904
|
|
855
905
|
The curation flow has several steps:
|
856
906
|
|
857
|
-
1.
|
907
|
+
1. Instantiate `Curator` from one of the following dataset objects:
|
858
908
|
|
859
|
-
- :meth:`~lamindb.
|
860
|
-
- :meth:`~lamindb.
|
861
|
-
- :meth:`~lamindb.
|
909
|
+
- :meth:`~lamindb.Curator.from_df`
|
910
|
+
- :meth:`~lamindb.Curator.from_anndata`
|
911
|
+
- :meth:`~lamindb.Curator.from_mudata`
|
862
912
|
|
863
913
|
During object creation, any passed categoricals found in the object will be saved.
|
864
914
|
|
@@ -867,7 +917,7 @@ class Curate:
|
|
867
917
|
- Values that can be successfully validated and already exist in the registry.
|
868
918
|
- Values which are new and not yet validated or potentially problematic values.
|
869
919
|
|
870
|
-
3. Determine how to handle validated and
|
920
|
+
3. Determine how to handle validated and non-validated values:
|
871
921
|
|
872
922
|
- Validated values not yet in the registry can be automatically registered using :meth:`~lamindb.core.DataFrameCurator.add_validated_from`.
|
873
923
|
- Valid and new values can be registered using :meth:`~lamindb.core.DataFrameCurator.add_new_from`.
|
@@ -982,10 +1032,22 @@ def standardize_and_inspect(
|
|
982
1032
|
field: FieldAttr,
|
983
1033
|
registry: type[Record],
|
984
1034
|
standardize: bool = False,
|
1035
|
+
exclude: str | list | None = None,
|
985
1036
|
**kwargs,
|
986
1037
|
):
|
987
1038
|
"""Standardize and inspect values using a registry."""
|
988
|
-
|
1039
|
+
# inspect exclude values in the default instance
|
1040
|
+
values = list(values)
|
1041
|
+
include_validated = []
|
1042
|
+
if exclude is not None:
|
1043
|
+
exclude = [exclude] if isinstance(exclude, str) else exclude
|
1044
|
+
exclude = [i for i in exclude if i in values]
|
1045
|
+
if len(exclude) > 0:
|
1046
|
+
# exclude values are validated without source and organism
|
1047
|
+
inspect_result_exclude = registry.inspect(exclude, field=field, mute=True)
|
1048
|
+
# if exclude values are validated, remove them from the values
|
1049
|
+
values = [i for i in values if i not in inspect_result_exclude.validated]
|
1050
|
+
include_validated = inspect_result_exclude.validated
|
989
1051
|
|
990
1052
|
if standardize:
|
991
1053
|
if hasattr(registry, "standardize") and hasattr(
|
@@ -993,11 +1055,17 @@ def standardize_and_inspect(
|
|
993
1055
|
"synonyms", # https://github.com/laminlabs/lamindb/issues/1685
|
994
1056
|
):
|
995
1057
|
standardized_values = registry.standardize(
|
996
|
-
values, field=field, mute=True, **
|
1058
|
+
values, field=field, mute=True, **kwargs
|
997
1059
|
)
|
998
1060
|
values = standardized_values
|
999
1061
|
|
1000
|
-
|
1062
|
+
inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
|
1063
|
+
inspect_result._validated += include_validated
|
1064
|
+
inspect_result._non_validated = [
|
1065
|
+
i for i in inspect_result.non_validated if i not in include_validated
|
1066
|
+
]
|
1067
|
+
|
1068
|
+
return inspect_result
|
1001
1069
|
|
1002
1070
|
|
1003
1071
|
def check_registry_organism(registry: Record, organism: str | None = None) -> dict:
|
@@ -1049,35 +1117,32 @@ def validate_categories(
|
|
1049
1117
|
logger.indent = " "
|
1050
1118
|
|
1051
1119
|
registry = field.field.model
|
1120
|
+
|
1052
1121
|
kwargs = check_registry_organism(registry, organism)
|
1053
1122
|
kwargs.update({"source": source} if source else {})
|
1123
|
+
kwargs_current = get_current_filter_kwargs(registry, kwargs)
|
1054
1124
|
|
1055
1125
|
# inspect the default instance
|
1056
|
-
if exclude is not None:
|
1057
|
-
exclude = [exclude] if isinstance(exclude, str) else exclude
|
1058
|
-
# exclude values are validated without source and organism
|
1059
|
-
inspect_result = registry.inspect(exclude, field=field, mute=True)
|
1060
|
-
# if exclude values are validated, remove them from the values
|
1061
|
-
values = [i for i in values if i not in inspect_result.validated]
|
1062
|
-
|
1063
1126
|
inspect_result = standardize_and_inspect(
|
1064
1127
|
values=values,
|
1065
1128
|
field=field,
|
1066
1129
|
registry=registry,
|
1067
1130
|
standardize=standardize,
|
1068
|
-
|
1131
|
+
exclude=exclude,
|
1132
|
+
**kwargs_current,
|
1069
1133
|
)
|
1070
1134
|
non_validated = inspect_result.non_validated
|
1071
1135
|
|
1136
|
+
# inspect the using instance
|
1072
1137
|
values_validated = []
|
1073
1138
|
if using_key is not None and using_key != "default" and non_validated:
|
1074
1139
|
registry_using = get_registry_instance(registry, using_key)
|
1075
|
-
# inspect the using instance
|
1076
1140
|
inspect_result = standardize_and_inspect(
|
1077
1141
|
values=non_validated,
|
1078
1142
|
field=field,
|
1079
1143
|
registry=registry_using,
|
1080
1144
|
standardize=standardize,
|
1145
|
+
exclude=exclude,
|
1081
1146
|
**kwargs,
|
1082
1147
|
)
|
1083
1148
|
non_validated = inspect_result.non_validated
|
@@ -1091,7 +1156,7 @@ def validate_categories(
|
|
1091
1156
|
public_records = registry.from_values(
|
1092
1157
|
non_validated,
|
1093
1158
|
field=field,
|
1094
|
-
**
|
1159
|
+
**kwargs_current,
|
1095
1160
|
)
|
1096
1161
|
values_validated += [getattr(r, field.field.name) for r in public_records]
|
1097
1162
|
finally:
|
@@ -1111,9 +1176,13 @@ def validate_categories(
|
|
1111
1176
|
non_validated = [i for i in non_validated if i not in values_validated]
|
1112
1177
|
n_non_validated = len(non_validated)
|
1113
1178
|
if n_non_validated == 0:
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1179
|
+
if n_validated == 0:
|
1180
|
+
logger.indent = ""
|
1181
|
+
logger.success(f"{key} is validated against {colors.italic(model_field)}")
|
1182
|
+
return True, []
|
1183
|
+
else:
|
1184
|
+
# validated values still need to be saved to the current instance
|
1185
|
+
return False, []
|
1117
1186
|
else:
|
1118
1187
|
are = "are" if n_non_validated > 1 else "is"
|
1119
1188
|
print_values = _print_values(non_validated)
|
@@ -1138,6 +1207,9 @@ def validate_categories_in_df(
|
|
1138
1207
|
**kwargs,
|
1139
1208
|
) -> tuple[bool, dict]:
|
1140
1209
|
"""Validate categories in DataFrame columns using LaminDB registries."""
|
1210
|
+
if not fields:
|
1211
|
+
return True, {}
|
1212
|
+
|
1141
1213
|
if sources is None:
|
1142
1214
|
sources = {}
|
1143
1215
|
validated = True
|
@@ -1270,6 +1342,7 @@ def update_registry(
|
|
1270
1342
|
source: Record | None = None,
|
1271
1343
|
standardize: bool = True,
|
1272
1344
|
warning: bool = True,
|
1345
|
+
exclude: str | list | None = None,
|
1273
1346
|
**kwargs,
|
1274
1347
|
) -> None:
|
1275
1348
|
"""Save features or labels records in the default instance from the using_key instance.
|
@@ -1329,7 +1402,8 @@ def update_registry(
|
|
1329
1402
|
field=field,
|
1330
1403
|
registry=registry,
|
1331
1404
|
standardize=standardize,
|
1332
|
-
|
1405
|
+
exclude=exclude,
|
1406
|
+
**filter_kwargs_current,
|
1333
1407
|
)
|
1334
1408
|
if not inspect_result_current.non_validated:
|
1335
1409
|
all_labels = registry.from_values(
|
@@ -1348,6 +1422,7 @@ def update_registry(
|
|
1348
1422
|
inspect_result_current.non_validated,
|
1349
1423
|
field=field,
|
1350
1424
|
using_key=using_key,
|
1425
|
+
exclude=exclude,
|
1351
1426
|
**filter_kwargs,
|
1352
1427
|
)
|
1353
1428
|
|
@@ -1467,6 +1542,7 @@ def update_registry_from_using_instance(
|
|
1467
1542
|
field: FieldAttr,
|
1468
1543
|
using_key: str | None = None,
|
1469
1544
|
standardize: bool = False,
|
1545
|
+
exclude: str | list | None = None,
|
1470
1546
|
**kwargs,
|
1471
1547
|
) -> tuple[list[str], list[str]]:
|
1472
1548
|
"""Save features or labels records from the using_key instance.
|
@@ -1492,6 +1568,7 @@ def update_registry_from_using_instance(
|
|
1492
1568
|
field=field,
|
1493
1569
|
registry=registry_using,
|
1494
1570
|
standardize=standardize,
|
1571
|
+
exclude=exclude,
|
1495
1572
|
**kwargs,
|
1496
1573
|
)
|
1497
1574
|
labels_using = registry_using.filter(
|
@@ -1519,3 +1596,6 @@ def _save_organism(name: str): # pragma: no cover
|
|
1519
1596
|
)
|
1520
1597
|
organism.save()
|
1521
1598
|
return organism
|
1599
|
+
|
1600
|
+
|
1601
|
+
Curate = Curator # backward compat
|