lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_artifact.py +6 -3
- lamindb/_can_curate.py +3 -1
- lamindb/_collection.py +1 -1
- lamindb/_curate.py +387 -318
- lamindb/_feature.py +84 -58
- lamindb/_feature_set.py +6 -4
- lamindb/_finish.py +68 -13
- lamindb/_from_values.py +10 -6
- lamindb/_query_set.py +321 -102
- lamindb/_record.py +5 -3
- lamindb/_save.py +1 -0
- lamindb/_view.py +105 -9
- lamindb/core/__init__.py +2 -2
- lamindb/core/_context.py +9 -13
- lamindb/core/_data.py +58 -88
- lamindb/core/_describe.py +139 -0
- lamindb/core/_django.py +5 -6
- lamindb/core/_feature_manager.py +408 -198
- lamindb/core/_label_manager.py +147 -109
- lamindb/core/datasets/__init__.py +31 -2
- lamindb/core/datasets/_core.py +0 -27
- lamindb/core/datasets/_small.py +100 -0
- lamindb/core/exceptions.py +1 -1
- lamindb/core/storage/paths.py +9 -4
- lamindb/core/types.py +12 -2
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/METADATA +7 -8
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/RECORD +30 -28
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/LICENSE +0 -0
- {lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/WHEEL +0 -0
lamindb/_curate.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import copy
|
4
|
+
import warnings
|
4
5
|
from typing import TYPE_CHECKING
|
5
6
|
|
6
7
|
import anndata as ad
|
@@ -16,6 +17,7 @@ from lnschema_core import (
|
|
16
17
|
ULabel,
|
17
18
|
)
|
18
19
|
|
20
|
+
from ._from_values import _print_values
|
19
21
|
from .core.exceptions import ValidationError
|
20
22
|
|
21
23
|
if TYPE_CHECKING:
|
@@ -28,7 +30,21 @@ if TYPE_CHECKING:
|
|
28
30
|
|
29
31
|
|
30
32
|
class CurateLookup:
|
31
|
-
"""Lookup categories from the reference instance.
|
33
|
+
"""Lookup categories from the reference instance.
|
34
|
+
|
35
|
+
Args:
|
36
|
+
categoricals: A dictionary of categorical fields to lookup.
|
37
|
+
slots: A dictionary of slot fields to lookup.
|
38
|
+
using_key: The key of the instance to lookup from. Defaults to the
|
39
|
+
current instance if not specified.
|
40
|
+
public: Whether to lookup from the public instance. Defaults to False.
|
41
|
+
|
42
|
+
Example:
|
43
|
+
>>> validator = ln.Validator()
|
44
|
+
>>> validator.lookup()["cell_type"].alveolar_type_1_fibroblast_cell
|
45
|
+
<Category: alveolar_type_1_fibroblast_cell>
|
46
|
+
|
47
|
+
"""
|
32
48
|
|
33
49
|
def __init__(
|
34
50
|
self,
|
@@ -37,8 +53,7 @@ class CurateLookup:
|
|
37
53
|
using_key: str | None = None,
|
38
54
|
public: bool = False,
|
39
55
|
) -> None:
|
40
|
-
|
41
|
-
slots = {}
|
56
|
+
slots = slots or {}
|
42
57
|
self._fields = {**categoricals, **slots}
|
43
58
|
self._using_key = None if using_key == "default" else using_key
|
44
59
|
self._using_key_name = self._using_key or ln_setup.settings.instance.slug
|
@@ -54,7 +69,7 @@ class CurateLookup:
|
|
54
69
|
else:
|
55
70
|
return get_registry_instance(registry, self._using_key).lookup()
|
56
71
|
raise AttributeError(
|
57
|
-
f"
|
72
|
+
f'"{self.__class__.__name__}" object has no attribute "{name}"'
|
58
73
|
)
|
59
74
|
|
60
75
|
def __getitem__(self, name):
|
@@ -65,7 +80,7 @@ class CurateLookup:
|
|
65
80
|
else:
|
66
81
|
return get_registry_instance(registry, self._using_key).lookup()
|
67
82
|
raise AttributeError(
|
68
|
-
f"
|
83
|
+
f'"{self.__class__.__name__}" object has no attribute "{name}"'
|
69
84
|
)
|
70
85
|
|
71
86
|
def __repr__(self) -> str:
|
@@ -81,7 +96,7 @@ class CurateLookup:
|
|
81
96
|
f"Lookup objects from the {colors.italic(ref)}:\n "
|
82
97
|
f"{colors.green(getattr_keys)}\n "
|
83
98
|
f"{colors.green(getitem_keys)}\n"
|
84
|
-
|
99
|
+
'Example:\n → categories = validator.lookup()["cell_type"]\n'
|
85
100
|
" → categories.alveolar_type_1_fibroblast_cell\n\n"
|
86
101
|
"To look up public ontologies, use .lookup(public=True)"
|
87
102
|
)
|
@@ -95,10 +110,25 @@ class BaseCurator:
|
|
95
110
|
def validate(self) -> bool:
|
96
111
|
"""Validate dataset.
|
97
112
|
|
113
|
+
This method also registers the validated records in the current instance.
|
114
|
+
|
98
115
|
Returns:
|
99
116
|
Boolean indicating whether the dataset is validated.
|
100
117
|
"""
|
101
|
-
pass
|
118
|
+
pass # pragma: no cover
|
119
|
+
|
120
|
+
def standardize(self, key: str) -> None:
|
121
|
+
"""Replace synonyms with standardized values.
|
122
|
+
|
123
|
+
Modifies the dataset in place.
|
124
|
+
|
125
|
+
Args:
|
126
|
+
key: The name of the column to standardize.
|
127
|
+
|
128
|
+
Returns:
|
129
|
+
None
|
130
|
+
"""
|
131
|
+
pass # pragma: no cover
|
102
132
|
|
103
133
|
def save_artifact(
|
104
134
|
self,
|
@@ -110,15 +140,15 @@ class BaseCurator:
|
|
110
140
|
"""Save the dataset as artifact.
|
111
141
|
|
112
142
|
Args:
|
113
|
-
description:
|
114
|
-
key:
|
115
|
-
revises:
|
116
|
-
run:
|
143
|
+
description: A description of the DataFrame object.
|
144
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
145
|
+
revises: Previous version of the artifact. Triggers a revision.
|
146
|
+
run: The run that creates the artifact.
|
117
147
|
|
118
148
|
Returns:
|
119
149
|
A saved artifact record.
|
120
150
|
"""
|
121
|
-
pass
|
151
|
+
pass # pragma: no cover
|
122
152
|
|
123
153
|
|
124
154
|
class DataFrameCurator(BaseCurator):
|
@@ -136,6 +166,9 @@ class DataFrameCurator(BaseCurator):
|
|
136
166
|
sources: A dictionary mapping column names to Source records.
|
137
167
|
exclude: A dictionary mapping column names to values to exclude.
|
138
168
|
|
169
|
+
Returns:
|
170
|
+
A curator object.
|
171
|
+
|
139
172
|
Examples:
|
140
173
|
>>> import bionty as bt
|
141
174
|
>>> curate = ln.Curator.from_df(
|
@@ -165,24 +198,21 @@ class DataFrameCurator(BaseCurator):
|
|
165
198
|
self._fields = categoricals or {}
|
166
199
|
self._columns_field = columns
|
167
200
|
self._using_key = using_key
|
201
|
+
# TODO: change verbosity back
|
168
202
|
settings.verbosity = verbosity
|
169
203
|
self._artifact = None
|
170
204
|
self._collection = None
|
171
205
|
self._validated = False
|
172
206
|
self._kwargs = {"organism": organism} if organism else {}
|
173
|
-
|
174
|
-
|
175
|
-
self._sources = sources
|
176
|
-
if exclude is None:
|
177
|
-
exclude = {}
|
178
|
-
self._exclude = exclude
|
207
|
+
self._sources = sources or {}
|
208
|
+
self._exclude = exclude or {}
|
179
209
|
self._non_validated = None
|
180
210
|
if check_valid_keys:
|
181
211
|
self._check_valid_keys()
|
182
212
|
self._save_columns()
|
183
213
|
|
184
214
|
@property
|
185
|
-
def non_validated(self) -> list:
|
215
|
+
def non_validated(self) -> dict[str, list[str]]:
|
186
216
|
"""Return the non-validated features and labels."""
|
187
217
|
if self._non_validated is None:
|
188
218
|
raise ValidationError("Please run validate() first!")
|
@@ -200,7 +230,6 @@ class DataFrameCurator(BaseCurator):
|
|
200
230
|
|
201
231
|
Args:
|
202
232
|
using_key: The instance where the lookup is performed.
|
203
|
-
if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
|
204
233
|
if "public", the lookup is performed on the public reference.
|
205
234
|
"""
|
206
235
|
return CurateLookup(
|
@@ -210,9 +239,8 @@ class DataFrameCurator(BaseCurator):
|
|
210
239
|
public=public,
|
211
240
|
)
|
212
241
|
|
213
|
-
def _check_valid_keys(self, extra: set = None) -> None:
|
214
|
-
|
215
|
-
extra = set()
|
242
|
+
def _check_valid_keys(self, extra: set | None = None) -> None:
|
243
|
+
extra = extra or set()
|
216
244
|
for name, d in {
|
217
245
|
"categoricals": self._fields,
|
218
246
|
"sources": self._sources,
|
@@ -222,9 +250,12 @@ class DataFrameCurator(BaseCurator):
|
|
222
250
|
raise TypeError(f"{name} must be a dictionary!")
|
223
251
|
valid_keys = set(self._df.columns) | {"columns"} | extra
|
224
252
|
nonval_keys = [key for key in d.keys() if key not in valid_keys]
|
253
|
+
n = len(nonval_keys)
|
254
|
+
s = "s" if n > 1 else ""
|
255
|
+
are = "are" if n > 1 else "is"
|
225
256
|
if len(nonval_keys) > 0:
|
226
257
|
raise ValidationError(
|
227
|
-
f"the following
|
258
|
+
f"the following {n} key{s} passed to {name} {are} not allowed: {colors.yellow(_print_values(nonval_keys))}"
|
228
259
|
)
|
229
260
|
|
230
261
|
def _save_columns(self, validated_only: bool = True) -> None:
|
@@ -234,7 +265,6 @@ class DataFrameCurator(BaseCurator):
|
|
234
265
|
values=list(self.fields.keys()),
|
235
266
|
field=self._columns_field,
|
236
267
|
key="columns",
|
237
|
-
save_function="add_new_from_columns",
|
238
268
|
using_key=self._using_key,
|
239
269
|
validated_only=False,
|
240
270
|
source=self._sources.get("columns"),
|
@@ -249,13 +279,11 @@ class DataFrameCurator(BaseCurator):
|
|
249
279
|
values=list(additional_columns),
|
250
280
|
field=self._columns_field,
|
251
281
|
key="columns",
|
252
|
-
save_function="add_new_from_columns",
|
253
282
|
using_key=self._using_key,
|
254
283
|
validated_only=validated_only,
|
255
284
|
df=self._df, # Get the Feature type from df
|
256
285
|
source=self._sources.get("columns"),
|
257
286
|
exclude=self._exclude.get("columns"),
|
258
|
-
warning=False, # Do not warn about missing columns, just an info message
|
259
287
|
**self._kwargs, # type: ignore
|
260
288
|
)
|
261
289
|
|
@@ -265,7 +293,7 @@ class DataFrameCurator(BaseCurator):
|
|
265
293
|
Args:
|
266
294
|
key: The key referencing the slot in the DataFrame from which to draw terms.
|
267
295
|
organism: The organism name.
|
268
|
-
**kwargs: Additional keyword arguments to pass to
|
296
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
269
297
|
"""
|
270
298
|
if len(kwargs) > 0 and key == "all":
|
271
299
|
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
@@ -273,20 +301,83 @@ class DataFrameCurator(BaseCurator):
|
|
273
301
|
self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
|
274
302
|
|
275
303
|
def add_new_from_columns(self, organism: str | None = None, **kwargs):
|
276
|
-
"""
|
304
|
+
"""Deprecated: this is now run by default during initialization."""
|
305
|
+
warnings.warn(
|
306
|
+
"`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
|
307
|
+
DeprecationWarning,
|
308
|
+
stacklevel=2,
|
309
|
+
)
|
310
|
+
pass
|
311
|
+
|
312
|
+
def _replace_synonyms(
|
313
|
+
self, key: str, syn_mapper: dict, values: pd.Series | pd.Index
|
314
|
+
):
|
315
|
+
# replace the values in df
|
316
|
+
std_values = values.map(lambda unstd_val: syn_mapper.get(unstd_val, unstd_val))
|
317
|
+
# remove the standardized values from self.non_validated
|
318
|
+
non_validated = [i for i in self.non_validated[key] if i not in syn_mapper]
|
319
|
+
if len(non_validated) == 0:
|
320
|
+
self._non_validated.pop(key, None) # type: ignore
|
321
|
+
else:
|
322
|
+
self._non_validated[key] = non_validated # type: ignore
|
323
|
+
# logging
|
324
|
+
n = len(syn_mapper)
|
325
|
+
if n > 0:
|
326
|
+
syn_mapper_print = _print_values(
|
327
|
+
[f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
|
328
|
+
)
|
329
|
+
s = "s" if n > 1 else ""
|
330
|
+
logger.success(
|
331
|
+
f'standardized {n} synonym{s} in "{key}": {colors.green(syn_mapper_print)}'
|
332
|
+
)
|
333
|
+
return std_values
|
334
|
+
|
335
|
+
def standardize(self, key: str):
|
336
|
+
"""Replace synonyms with standardized values.
|
277
337
|
|
278
338
|
Args:
|
279
|
-
|
280
|
-
|
339
|
+
key: The key referencing the slot in the DataFrame from which to draw terms.
|
340
|
+
|
341
|
+
Modifies the input dataset in place.
|
281
342
|
"""
|
282
|
-
|
283
|
-
|
343
|
+
# list is needed to avoid RuntimeError: dictionary changed size during iteration
|
344
|
+
avail_keys = list(self.non_validated.keys())
|
345
|
+
if len(avail_keys) == 0:
|
346
|
+
logger.warning("values are already standardized")
|
347
|
+
return
|
348
|
+
|
349
|
+
if key == "all":
|
350
|
+
for k in avail_keys:
|
351
|
+
if k in self._fields: # needed to exclude var_index
|
352
|
+
syn_mapper = standardize_categories(
|
353
|
+
self.non_validated[k],
|
354
|
+
field=self._fields[k],
|
355
|
+
using_key=self._using_key,
|
356
|
+
source=self._sources.get(k),
|
357
|
+
**self._kwargs,
|
358
|
+
)
|
359
|
+
self._df[k] = self._replace_synonyms(k, syn_mapper, self._df[k])
|
360
|
+
else:
|
361
|
+
if key not in avail_keys:
|
362
|
+
raise KeyError(
|
363
|
+
f'"{key}" is not a valid key, available keys are: {_print_values(avail_keys)}!'
|
364
|
+
)
|
365
|
+
else:
|
366
|
+
if key in self._fields: # needed to exclude var_index
|
367
|
+
syn_mapper = standardize_categories(
|
368
|
+
self.non_validated[key],
|
369
|
+
field=self._fields[key],
|
370
|
+
using_key=self._using_key,
|
371
|
+
source=self._sources.get(key),
|
372
|
+
**self._kwargs,
|
373
|
+
)
|
374
|
+
self._df[key] = self._replace_synonyms(
|
375
|
+
key, syn_mapper, self._df[key]
|
376
|
+
)
|
284
377
|
|
285
378
|
def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
|
286
379
|
if categorical == "all":
|
287
380
|
self._update_registry_all(validated_only=validated_only, **kwargs)
|
288
|
-
elif categorical == "columns":
|
289
|
-
self._save_columns(validated_only=validated_only, **kwargs)
|
290
381
|
else:
|
291
382
|
if categorical not in self.fields:
|
292
383
|
raise ValidationError(
|
@@ -302,6 +393,9 @@ class DataFrameCurator(BaseCurator):
|
|
302
393
|
exclude=self._exclude.get(categorical),
|
303
394
|
**kwargs,
|
304
395
|
)
|
396
|
+
# adding new records removes them from non_validated
|
397
|
+
if not validated_only and self._non_validated:
|
398
|
+
self._non_validated.pop(categorical, None) # type: ignore
|
305
399
|
|
306
400
|
def _update_registry_all(self, validated_only: bool = True, **kwargs):
|
307
401
|
"""Save labels for all features."""
|
@@ -311,6 +405,10 @@ class DataFrameCurator(BaseCurator):
|
|
311
405
|
def validate(self, organism: str | None = None) -> bool:
|
312
406
|
"""Validate variables and categorical observations.
|
313
407
|
|
408
|
+
This method also registers the validated records in the current instance:
|
409
|
+
- from public sources
|
410
|
+
- from the using_key instance
|
411
|
+
|
314
412
|
Args:
|
315
413
|
organism: The organism name.
|
316
414
|
|
@@ -342,10 +440,10 @@ class DataFrameCurator(BaseCurator):
|
|
342
440
|
"""Save the validated DataFrame and metadata.
|
343
441
|
|
344
442
|
Args:
|
345
|
-
description:
|
346
|
-
key:
|
347
|
-
revises:
|
348
|
-
run:
|
443
|
+
description: Description of the DataFrame object.
|
444
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
445
|
+
revises: Previous version of the artifact. Triggers a revision.
|
446
|
+
run: The run that creates the artifact.
|
349
447
|
|
350
448
|
Returns:
|
351
449
|
A saved artifact record.
|
@@ -361,10 +459,6 @@ class DataFrameCurator(BaseCurator):
|
|
361
459
|
verbosity = settings.verbosity
|
362
460
|
try:
|
363
461
|
settings.verbosity = "warning"
|
364
|
-
if not self._validated:
|
365
|
-
# save all validated records to the current instance
|
366
|
-
self._update_registry_all()
|
367
|
-
|
368
462
|
self._artifact = save_artifact(
|
369
463
|
self._df,
|
370
464
|
description=description,
|
@@ -403,6 +497,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
403
497
|
data: The AnnData object or an AnnData-like path.
|
404
498
|
var_index: The registry field for mapping the ``.var`` index.
|
405
499
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
500
|
+
obs_columns: The registry field for mapping the ``.obs.columns``.
|
406
501
|
using_key: A reference LaminDB instance.
|
407
502
|
verbosity: The verbosity level.
|
408
503
|
organism: The organism name.
|
@@ -428,7 +523,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
428
523
|
var_index: FieldAttr,
|
429
524
|
categoricals: dict[str, FieldAttr] | None = None,
|
430
525
|
obs_columns: FieldAttr = Feature.name,
|
431
|
-
using_key: str =
|
526
|
+
using_key: str | None = None,
|
432
527
|
verbosity: str = "hint",
|
433
528
|
organism: str | None = None,
|
434
529
|
sources: dict[str, Record] | None = None,
|
@@ -456,7 +551,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
456
551
|
|
457
552
|
if "symbol" in str(var_index):
|
458
553
|
logger.warning(
|
459
|
-
"
|
554
|
+
"indexing datasets with gene symbols can be problematic: https://docs.lamin.ai/faq/symbol-mapping"
|
460
555
|
)
|
461
556
|
|
462
557
|
self._data = data
|
@@ -492,7 +587,6 @@ class AnnDataCurator(DataFrameCurator):
|
|
492
587
|
|
493
588
|
Args:
|
494
589
|
using_key: The instance where the lookup is performed.
|
495
|
-
if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
|
496
590
|
if "public", the lookup is performed on the public reference.
|
497
591
|
"""
|
498
592
|
return CurateLookup(
|
@@ -510,7 +604,6 @@ class AnnDataCurator(DataFrameCurator):
|
|
510
604
|
values=list(self._adata.var.index),
|
511
605
|
field=self.var_index,
|
512
606
|
key="var_index",
|
513
|
-
save_function=".add_new_from_var_index()",
|
514
607
|
using_key=self._using_key,
|
515
608
|
validated_only=validated_only,
|
516
609
|
organism=organism,
|
@@ -529,7 +622,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
529
622
|
|
530
623
|
Args:
|
531
624
|
organism: The organism name.
|
532
|
-
**kwargs: Additional keyword arguments to pass to
|
625
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
533
626
|
"""
|
534
627
|
self._kwargs.update({"organism": organism} if organism else {})
|
535
628
|
self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
|
@@ -537,6 +630,8 @@ class AnnDataCurator(DataFrameCurator):
|
|
537
630
|
def validate(self, organism: str | None = None) -> bool:
|
538
631
|
"""Validate categories.
|
539
632
|
|
633
|
+
This method also registers the validated records in the current instance.
|
634
|
+
|
540
635
|
Args:
|
541
636
|
organism: The organism name.
|
542
637
|
|
@@ -558,7 +653,7 @@ class AnnDataCurator(DataFrameCurator):
|
|
558
653
|
key="var_index",
|
559
654
|
using_key=self._using_key,
|
560
655
|
source=self._sources.get("var_index"),
|
561
|
-
|
656
|
+
hint_print=".add_new_from_var_index()",
|
562
657
|
exclude=self._exclude.get("var_index"),
|
563
658
|
**self._kwargs, # type: ignore
|
564
659
|
)
|
@@ -576,6 +671,34 @@ class AnnDataCurator(DataFrameCurator):
|
|
576
671
|
self._validated = validated_var and validated_obs
|
577
672
|
return self._validated
|
578
673
|
|
674
|
+
def standardize(self, key: str):
|
675
|
+
"""Replace synonyms with standardized values.
|
676
|
+
|
677
|
+
Args:
|
678
|
+
key: The key referencing the slot in `adata.obs` from which to draw terms. Same as the key in `categoricals`.
|
679
|
+
|
680
|
+
- If "var_index", standardize the var.index.
|
681
|
+
- If "all", standardize all obs columns and var.index.
|
682
|
+
|
683
|
+
Modifies the dataset in place.
|
684
|
+
"""
|
685
|
+
if key in self._adata.obs.columns or key == "all":
|
686
|
+
# standardize obs columns
|
687
|
+
super().standardize(key)
|
688
|
+
# in addition to the obs columns, standardize the var.index
|
689
|
+
if key == "var_index" or key == "all":
|
690
|
+
syn_mapper = standardize_categories(
|
691
|
+
self._adata.var.index,
|
692
|
+
field=self.var_index,
|
693
|
+
using_key=self._using_key,
|
694
|
+
source=self._sources.get("var_index"),
|
695
|
+
**self._kwargs,
|
696
|
+
)
|
697
|
+
if "var_index" in self._non_validated: # type: ignore
|
698
|
+
self._adata.var.index = self._replace_synonyms(
|
699
|
+
"var_index", syn_mapper, self._adata.var.index
|
700
|
+
)
|
701
|
+
|
579
702
|
def save_artifact(
|
580
703
|
self,
|
581
704
|
description: str | None = None,
|
@@ -586,10 +709,10 @@ class AnnDataCurator(DataFrameCurator):
|
|
586
709
|
"""Save the validated ``AnnData`` and metadata.
|
587
710
|
|
588
711
|
Args:
|
589
|
-
description:
|
590
|
-
key:
|
591
|
-
revises:
|
592
|
-
run:
|
712
|
+
description: A description of the ``AnnData`` object.
|
713
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
714
|
+
revises: Previous version of the artifact. Triggers a revision.
|
715
|
+
run: The run that creates the artifact.
|
593
716
|
|
594
717
|
Returns:
|
595
718
|
A saved artifact record.
|
@@ -603,9 +726,6 @@ class AnnDataCurator(DataFrameCurator):
|
|
603
726
|
verbosity = settings.verbosity
|
604
727
|
try:
|
605
728
|
settings.verbosity = "warning"
|
606
|
-
if not self._validated:
|
607
|
-
# save all validated records to the current instance
|
608
|
-
self._update_registry_all()
|
609
729
|
self._artifact = save_artifact(
|
610
730
|
self._data,
|
611
731
|
adata=self._adata,
|
@@ -662,13 +782,13 @@ class MuDataCurator:
|
|
662
782
|
def __init__(
|
663
783
|
self,
|
664
784
|
mdata: MuData,
|
665
|
-
var_index: dict[str,
|
785
|
+
var_index: dict[str, FieldAttr],
|
666
786
|
categoricals: dict[str, FieldAttr] | None = None,
|
667
|
-
using_key: str =
|
787
|
+
using_key: str | None = None,
|
668
788
|
verbosity: str = "hint",
|
669
789
|
organism: str | None = None,
|
670
790
|
sources: dict[str, Record] | None = None,
|
671
|
-
exclude: dict | None = None,
|
791
|
+
exclude: dict | None = None, # {modality: {field: [values]}}
|
672
792
|
) -> None:
|
673
793
|
if sources is None:
|
674
794
|
sources = {}
|
@@ -684,19 +804,34 @@ class MuDataCurator:
|
|
684
804
|
self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
|
685
805
|
self._using_key = using_key
|
686
806
|
self._verbosity = verbosity
|
687
|
-
self.
|
688
|
-
|
689
|
-
|
690
|
-
|
807
|
+
self._obs_df_curator = None
|
808
|
+
if "obs" in self._modalities:
|
809
|
+
self._obs_df_curator = DataFrameCurator(
|
810
|
+
df=mdata.obs,
|
811
|
+
columns=Feature.name,
|
812
|
+
categoricals=self._obs_fields.get("obs", {}),
|
813
|
+
using_key=using_key,
|
814
|
+
verbosity=verbosity,
|
815
|
+
sources=self._sources.get("obs"),
|
816
|
+
exclude=self._exclude.get("obs"),
|
817
|
+
check_valid_keys=False,
|
818
|
+
**self._kwargs,
|
819
|
+
)
|
820
|
+
self._mod_adata_curators = {
|
821
|
+
modality: AnnDataCurator(
|
822
|
+
data=mdata[modality],
|
823
|
+
var_index=var_index.get(modality),
|
824
|
+
categoricals=self._obs_fields.get(modality),
|
691
825
|
using_key=using_key,
|
692
826
|
verbosity=verbosity,
|
693
827
|
sources=self._sources.get(modality),
|
694
828
|
exclude=self._exclude.get(modality),
|
695
|
-
check_valid_keys=False,
|
696
829
|
**self._kwargs,
|
697
830
|
)
|
698
831
|
for modality in self._modalities
|
832
|
+
if modality != "obs"
|
699
833
|
}
|
834
|
+
self._non_validated = None
|
700
835
|
|
701
836
|
@property
|
702
837
|
def var_index(self) -> FieldAttr:
|
@@ -708,29 +843,19 @@ class MuDataCurator:
|
|
708
843
|
"""Return the obs fields to validate against."""
|
709
844
|
return self._obs_fields
|
710
845
|
|
846
|
+
@property
|
847
|
+
def non_validated(self) -> dict[str, dict[str, list[str]]]:
|
848
|
+
"""Return the non-validated features and labels."""
|
849
|
+
if self._non_validated is None:
|
850
|
+
raise ValidationError("Please run validate() first!")
|
851
|
+
return self._non_validated
|
852
|
+
|
711
853
|
def _verify_modality(self, modalities: Iterable[str]):
|
712
854
|
"""Verify the modality exists."""
|
713
855
|
for modality in modalities:
|
714
856
|
if modality not in self._mdata.mod.keys():
|
715
857
|
raise ValidationError(f"modality '{modality}' does not exist!")
|
716
858
|
|
717
|
-
def _save_from_var_index_modality(
|
718
|
-
self, modality: str, validated_only: bool = True, **kwargs
|
719
|
-
):
|
720
|
-
"""Save variable records."""
|
721
|
-
update_registry(
|
722
|
-
values=list(self._mdata[modality].var.index),
|
723
|
-
field=self._var_fields[modality],
|
724
|
-
key="var_index",
|
725
|
-
save_function=f'.add_new_from_var_index("{modality}")',
|
726
|
-
using_key=self._using_key,
|
727
|
-
validated_only=validated_only,
|
728
|
-
dtype="number",
|
729
|
-
source=self._sources.get(modality, {}).get("var_index"),
|
730
|
-
exclude=self._exclude.get(modality, {}).get("var_index"),
|
731
|
-
**kwargs,
|
732
|
-
)
|
733
|
-
|
734
859
|
def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
|
735
860
|
"""Parse the categorical fields."""
|
736
861
|
prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
|
@@ -756,13 +881,18 @@ class MuDataCurator:
|
|
756
881
|
|
757
882
|
Args:
|
758
883
|
using_key: The instance where the lookup is performed.
|
759
|
-
if None (default), the lookup is performed on the instance specified in "using_key" parameter of the validator.
|
760
884
|
if "public", the lookup is performed on the public reference.
|
761
885
|
"""
|
886
|
+
obs_fields = {}
|
887
|
+
for mod, fields in self._obs_fields.items():
|
888
|
+
for k, v in fields.items():
|
889
|
+
if k == "obs":
|
890
|
+
obs_fields[k] = v
|
891
|
+
else:
|
892
|
+
obs_fields[f"{mod}:{k}"] = v
|
762
893
|
return CurateLookup(
|
763
|
-
categoricals=
|
894
|
+
categoricals=obs_fields,
|
764
895
|
slots={
|
765
|
-
**self._obs_fields,
|
766
896
|
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
767
897
|
},
|
768
898
|
using_key=using_key or self._using_key,
|
@@ -776,27 +906,11 @@ class MuDataCurator:
|
|
776
906
|
organism: str | None = None,
|
777
907
|
**kwargs,
|
778
908
|
):
|
779
|
-
"""Update columns records.
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
organism: The organism name.
|
785
|
-
**kwargs: Additional keyword arguments to pass to the registry model.
|
786
|
-
"""
|
787
|
-
self._kwargs.update({"organism": organism} if organism else {})
|
788
|
-
values = column_names or self._mdata[modality].obs.columns
|
789
|
-
update_registry(
|
790
|
-
values=list(values),
|
791
|
-
field=Feature.name,
|
792
|
-
key=f"{modality} obs columns",
|
793
|
-
using_key=self._using_key,
|
794
|
-
validated_only=False,
|
795
|
-
df=self._mdata[modality].obs,
|
796
|
-
source=self._sources.get(modality, {}).get("columns"),
|
797
|
-
exclude=self._exclude.get(modality, {}).get("columns"),
|
798
|
-
**self._kwargs, # type: ignore
|
799
|
-
**kwargs,
|
909
|
+
"""Deprecated: only emits a DeprecationWarning; this is now run by default during initialization."""
|
910
|
+
warnings.warn(
|
911
|
+
"`.add_new_from_columns()` is deprecated and will be removed in a future version. It's run by default during initialization.",
|
912
|
+
DeprecationWarning,
|
913
|
+
stacklevel=2,
|
800
914
|
)
|
801
915
|
|
802
916
|
def add_new_from_var_index(
|
@@ -807,21 +921,21 @@ class MuDataCurator:
|
|
807
921
|
Args:
|
808
922
|
modality: The modality name.
|
809
923
|
organism: The organism name.
|
810
|
-
**kwargs: Additional keyword arguments to pass to
|
924
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
811
925
|
"""
|
812
926
|
self._kwargs.update({"organism": organism} if organism else {})
|
813
|
-
self.
|
814
|
-
|
927
|
+
self._mod_adata_curators[modality].add_new_from_var_index(
|
928
|
+
**self._kwargs, **kwargs
|
815
929
|
)
|
816
930
|
|
817
931
|
def _update_registry_all(self):
|
818
932
|
"""Update all registries."""
|
819
|
-
|
820
|
-
self.
|
821
|
-
|
933
|
+
if self._obs_df_curator is not None:
|
934
|
+
self._obs_df_curator._update_registry_all(
|
935
|
+
validated_only=True, **self._kwargs
|
822
936
|
)
|
823
|
-
for _,
|
824
|
-
|
937
|
+
for _, adata_curator in self._mod_adata_curators.items():
|
938
|
+
adata_curator._update_registry_all(validated_only=True, **self._kwargs)
|
825
939
|
|
826
940
|
def add_new_from(
|
827
941
|
self,
|
@@ -836,15 +950,17 @@ class MuDataCurator:
|
|
836
950
|
key: The key referencing the slot in the DataFrame.
|
837
951
|
modality: The modality name.
|
838
952
|
organism: The organism name.
|
839
|
-
**kwargs: Additional keyword arguments to pass to
|
953
|
+
**kwargs: Additional keyword arguments to pass to create new records.
|
840
954
|
"""
|
841
955
|
if len(kwargs) > 0 and key == "all":
|
842
956
|
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
843
957
|
self._kwargs.update({"organism": organism} if organism else {})
|
844
958
|
modality = modality or "obs"
|
845
|
-
if modality in self.
|
846
|
-
|
847
|
-
|
959
|
+
if modality in self._mod_adata_curators:
|
960
|
+
adata_curator = self._mod_adata_curators[modality]
|
961
|
+
adata_curator.add_new_from(key=key, **self._kwargs, **kwargs)
|
962
|
+
if modality == "obs":
|
963
|
+
self._obs_df_curator.add_new_from(key=key, **self._kwargs, **kwargs)
|
848
964
|
|
849
965
|
def validate(self, organism: str | None = None) -> bool:
|
850
966
|
"""Validate categories."""
|
@@ -853,7 +969,7 @@ class MuDataCurator:
|
|
853
969
|
self._kwargs.update({"organism": organism} if organism else {})
|
854
970
|
if self._using_key is not None and self._using_key != "default":
|
855
971
|
logger.important(
|
856
|
-
f"validating
|
972
|
+
f"validating using registries of instance {colors.italic(self._using_key)}"
|
857
973
|
)
|
858
974
|
|
859
975
|
# add all validated records to the current instance
|
@@ -864,49 +980,42 @@ class MuDataCurator:
|
|
864
980
|
finally:
|
865
981
|
settings.verbosity = verbosity
|
866
982
|
|
867
|
-
|
868
|
-
non_validated_var_modality = {}
|
869
|
-
for modality, var_field in self._var_fields.items():
|
870
|
-
is_validated_var, non_validated_var = validate_categories(
|
871
|
-
self._mdata[modality].var.index,
|
872
|
-
field=var_field,
|
873
|
-
key=f"{modality}_var_index",
|
874
|
-
using_key=self._using_key,
|
875
|
-
source=self._sources.get(modality, {}).get("var_index"),
|
876
|
-
exclude=self._exclude.get(modality, {}).get("var_index"),
|
877
|
-
validated_hint_print=f'.add_validated_from_var_index("{modality}")',
|
878
|
-
**self._kwargs, # type: ignore
|
879
|
-
)
|
880
|
-
validated_var &= is_validated_var
|
881
|
-
if len(non_validated_var) > 0:
|
882
|
-
non_validated_var_modality[modality] = non_validated_var
|
983
|
+
self._non_validated = {} # type: ignore
|
883
984
|
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
non_validated_obs_modality[modality] = non_validated_obs
|
901
|
-
if modality in non_validated_var_modality:
|
902
|
-
non_validated_obs_modality[modality]["var_index"] = (
|
903
|
-
non_validated_var_modality[modality]
|
904
|
-
)
|
905
|
-
if len(non_validated_obs_modality[modality]) > 0:
|
906
|
-
self._non_validated = non_validated_obs_modality[modality]
|
907
|
-
self._validated = validated_var and validated_obs
|
985
|
+
obs_validated = True
|
986
|
+
if "obs" in self._modalities:
|
987
|
+
logger.info('validating categoricals in "obs"...')
|
988
|
+
obs_validated &= self._obs_df_curator.validate(**self._kwargs)
|
989
|
+
self._non_validated["obs"] = self._obs_df_curator.non_validated # type: ignore
|
990
|
+
logger.print("")
|
991
|
+
|
992
|
+
mods_validated = True
|
993
|
+
for modality, adata_curator in self._mod_adata_curators.items():
|
994
|
+
logger.info(f'validating categoricals in modality "{modality}"...')
|
995
|
+
mods_validated &= adata_curator.validate(**self._kwargs)
|
996
|
+
if len(adata_curator.non_validated) > 0:
|
997
|
+
self._non_validated[modality] = adata_curator.non_validated # type: ignore
|
998
|
+
logger.print("")
|
999
|
+
|
1000
|
+
self._validated = obs_validated & mods_validated
|
908
1001
|
return self._validated
|
909
1002
|
|
1003
|
+
def standardize(self, key: str, modality: str | None = None):
|
1004
|
+
"""Replace synonyms with standardized values.
|
1005
|
+
|
1006
|
+
Args:
|
1007
|
+
key: The key referencing the slot in the `MuData`.
|
1008
|
+
modality: The modality name.
|
1009
|
+
|
1010
|
+
Inplace modification of the dataset.
|
1011
|
+
"""
|
1012
|
+
modality = modality or "obs"
|
1013
|
+
if modality in self._mod_adata_curators:
|
1014
|
+
adata_curator = self._mod_adata_curators[modality]
|
1015
|
+
adata_curator.standardize(key=key)
|
1016
|
+
if modality == "obs":
|
1017
|
+
self._obs_df_curator.standardize(key=key)
|
1018
|
+
|
910
1019
|
def save_artifact(
|
911
1020
|
self,
|
912
1021
|
description: str | None = None,
|
@@ -917,10 +1026,10 @@ class MuDataCurator:
|
|
917
1026
|
"""Save the validated ``MuData`` and metadata.
|
918
1027
|
|
919
1028
|
Args:
|
920
|
-
description:
|
921
|
-
key:
|
922
|
-
revises:
|
923
|
-
run:
|
1029
|
+
description: A description of the ``MuData`` object.
|
1030
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
1031
|
+
revises: Previous version of the artifact. Triggers a revision.
|
1032
|
+
run: The run that creates the artifact.
|
924
1033
|
|
925
1034
|
Returns:
|
926
1035
|
A saved artifact record.
|
@@ -934,10 +1043,6 @@ class MuDataCurator:
|
|
934
1043
|
verbosity = settings.verbosity
|
935
1044
|
try:
|
936
1045
|
settings.verbosity = "warning"
|
937
|
-
if not self._validated:
|
938
|
-
# save all validated records to the current instance
|
939
|
-
self._update_registry_all()
|
940
|
-
|
941
1046
|
self._artifact = save_artifact(
|
942
1047
|
self._mdata,
|
943
1048
|
description=description,
|
@@ -1007,7 +1112,7 @@ class Curator(BaseCurator):
|
|
1007
1112
|
var_index: FieldAttr,
|
1008
1113
|
categoricals: dict[str, FieldAttr] | None = None,
|
1009
1114
|
obs_columns: FieldAttr = Feature.name,
|
1010
|
-
using_key: str =
|
1115
|
+
using_key: str | None = None,
|
1011
1116
|
verbosity: str = "hint",
|
1012
1117
|
organism: str | None = None,
|
1013
1118
|
sources: dict[str, Record] | None = None,
|
@@ -1031,7 +1136,7 @@ class Curator(BaseCurator):
|
|
1031
1136
|
mdata: MuData,
|
1032
1137
|
var_index: dict[str, dict[str, FieldAttr]],
|
1033
1138
|
categoricals: dict[str, FieldAttr] | None = None,
|
1034
|
-
using_key: str =
|
1139
|
+
using_key: str | None = None,
|
1035
1140
|
verbosity: str = "hint",
|
1036
1141
|
organism: str | None = None,
|
1037
1142
|
) -> MuDataCurator:
|
@@ -1081,15 +1186,14 @@ def get_current_filter_kwargs(registry: type[Record], kwargs: dict) -> dict:
|
|
1081
1186
|
return filter_kwargs
|
1082
1187
|
|
1083
1188
|
|
1084
|
-
def
|
1189
|
+
def inspect_instance(
|
1085
1190
|
values: Iterable[str],
|
1086
1191
|
field: FieldAttr,
|
1087
1192
|
registry: type[Record],
|
1088
|
-
standardize: bool = False,
|
1089
1193
|
exclude: str | list | None = None,
|
1090
1194
|
**kwargs,
|
1091
1195
|
):
|
1092
|
-
"""
|
1196
|
+
"""Inspect values using a registry."""
|
1093
1197
|
# inspect exclude values in the default instance
|
1094
1198
|
values = list(values)
|
1095
1199
|
include_validated = []
|
@@ -1103,16 +1207,6 @@ def standardize_and_inspect(
|
|
1103
1207
|
values = [i for i in values if i not in inspect_result_exclude.validated]
|
1104
1208
|
include_validated = inspect_result_exclude.validated
|
1105
1209
|
|
1106
|
-
if standardize:
|
1107
|
-
if hasattr(registry, "standardize") and hasattr(
|
1108
|
-
registry,
|
1109
|
-
"synonyms", # https://github.com/laminlabs/lamindb/issues/1685
|
1110
|
-
):
|
1111
|
-
standardized_values = registry.standardize(
|
1112
|
-
values, field=field, mute=True, **kwargs
|
1113
|
-
)
|
1114
|
-
values = standardized_values
|
1115
|
-
|
1116
1210
|
inspect_result = registry.inspect(values, field=field, mute=True, **kwargs)
|
1117
1211
|
inspect_result._validated += include_validated
|
1118
1212
|
inspect_result._non_validated = [
|
@@ -1144,8 +1238,7 @@ def validate_categories(
|
|
1144
1238
|
organism: str | None = None,
|
1145
1239
|
source: Record | None = None,
|
1146
1240
|
exclude: str | list | None = None,
|
1147
|
-
|
1148
|
-
validated_hint_print: str | None = None,
|
1241
|
+
hint_print: str | None = None,
|
1149
1242
|
) -> tuple[bool, list]:
|
1150
1243
|
"""Validate ontology terms in a pandas series using LaminDB registries.
|
1151
1244
|
|
@@ -1158,7 +1251,7 @@ def validate_categories(
|
|
1158
1251
|
source: The source record.
|
1159
1252
|
exclude: Exclude specific values from validation.
|
1160
1253
|
standardize: Whether to standardize the values.
|
1161
|
-
|
1254
|
+
hint_print: The hint to print that suggests fixing non-validated values.
|
1162
1255
|
"""
|
1163
1256
|
from lamindb._from_values import _print_values
|
1164
1257
|
from lamindb.core._settings import settings
|
@@ -1167,42 +1260,43 @@ def validate_categories(
|
|
1167
1260
|
|
1168
1261
|
def _log_mapping_info():
|
1169
1262
|
logger.indent = ""
|
1170
|
-
logger.info(f
|
1171
|
-
logger.indent = "
|
1263
|
+
logger.info(f'mapping "{key}" on {colors.italic(model_field)}')
|
1264
|
+
logger.indent = " "
|
1172
1265
|
|
1173
1266
|
registry = field.field.model
|
1174
1267
|
|
1268
|
+
# {"organism": organism_name/organism_record}
|
1175
1269
|
kwargs = check_registry_organism(registry, organism)
|
1176
1270
|
kwargs.update({"source": source} if source else {})
|
1177
1271
|
kwargs_current = get_current_filter_kwargs(registry, kwargs)
|
1178
1272
|
|
1179
|
-
# inspect the default instance
|
1180
|
-
inspect_result =
|
1273
|
+
# inspect values from the default instance
|
1274
|
+
inspect_result = inspect_instance(
|
1181
1275
|
values=values,
|
1182
1276
|
field=field,
|
1183
1277
|
registry=registry,
|
1184
|
-
standardize=standardize,
|
1185
1278
|
exclude=exclude,
|
1186
1279
|
**kwargs_current,
|
1187
1280
|
)
|
1188
1281
|
non_validated = inspect_result.non_validated
|
1282
|
+
syn_mapper = inspect_result.synonyms_mapper
|
1189
1283
|
|
1190
|
-
# inspect the
|
1284
|
+
# inspect the non-validated values from the using_key instance
|
1191
1285
|
values_validated = []
|
1192
1286
|
if using_key is not None and using_key != "default" and non_validated:
|
1193
1287
|
registry_using = get_registry_instance(registry, using_key)
|
1194
|
-
inspect_result =
|
1288
|
+
inspect_result = inspect_instance(
|
1195
1289
|
values=non_validated,
|
1196
1290
|
field=field,
|
1197
1291
|
registry=registry_using,
|
1198
|
-
standardize=standardize,
|
1199
1292
|
exclude=exclude,
|
1200
1293
|
**kwargs,
|
1201
1294
|
)
|
1202
1295
|
non_validated = inspect_result.non_validated
|
1203
1296
|
values_validated += inspect_result.validated
|
1297
|
+
syn_mapper.update(inspect_result.synonyms_mapper)
|
1204
1298
|
|
1205
|
-
# inspect from public (bionty only)
|
1299
|
+
# inspect the non-validated values from public (bionty only)
|
1206
1300
|
if hasattr(registry, "public"):
|
1207
1301
|
verbosity = settings.verbosity
|
1208
1302
|
try:
|
@@ -1216,39 +1310,35 @@ def validate_categories(
|
|
1216
1310
|
finally:
|
1217
1311
|
settings.verbosity = verbosity
|
1218
1312
|
|
1219
|
-
|
1220
|
-
|
1221
|
-
|
1222
|
-
if n_validated > 0:
|
1223
|
-
_log_mapping_info()
|
1224
|
-
terms_str = f"{', '.join([f'{chr(39)}{v}{chr(39)}' for v in values_validated[:10]])}{', ...' if len(values_validated) > 10 else ''}"
|
1225
|
-
val_numerous = "" if n_validated == 1 else "s"
|
1226
|
-
logger.warning(
|
1227
|
-
f"found {colors.yellow(n_validated)} validated term{val_numerous}: "
|
1228
|
-
f"{colors.yellow(terms_str)}\n"
|
1229
|
-
f"→ save term{val_numerous} via {colors.yellow(validated_hint_print)}"
|
1230
|
-
)
|
1231
|
-
|
1232
|
-
non_validated_hint_print = validated_hint_print.replace("_validated_", "_new_")
|
1313
|
+
# logging messages
|
1314
|
+
non_validated_hint_print = hint_print or f'.add_new_from("{key}")'
|
1233
1315
|
non_validated = [i for i in non_validated if i not in values_validated]
|
1234
1316
|
n_non_validated = len(non_validated)
|
1235
1317
|
if n_non_validated == 0:
|
1236
|
-
if
|
1318
|
+
if len(values_validated) == 0:
|
1319
|
+
# nothing to validate
|
1237
1320
|
logger.indent = ""
|
1238
|
-
logger.success(f"
|
1321
|
+
logger.success(f'"{key}" is validated against {colors.italic(model_field)}')
|
1239
1322
|
return True, []
|
1240
1323
|
else:
|
1241
1324
|
# validated values still need to be saved to the current instance
|
1242
1325
|
return False, []
|
1243
1326
|
else:
|
1244
|
-
|
1327
|
+
are = "is" if n_non_validated == 1 else "are"
|
1328
|
+
s = "" if n_non_validated == 1 else "s"
|
1245
1329
|
print_values = _print_values(non_validated)
|
1246
|
-
warning_message = (
|
1247
|
-
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1330
|
+
warning_message = f"{colors.red(f'{n_non_validated} term{s}')} {are} not validated: {colors.red(print_values)}\n"
|
1331
|
+
if syn_mapper:
|
1332
|
+
s = "" if len(syn_mapper) == 1 else "s"
|
1333
|
+
syn_mapper_print = _print_values(
|
1334
|
+
[f'"{k}" → "{v}"' for k, v in syn_mapper.items()], sep=""
|
1335
|
+
)
|
1336
|
+
hint_msg = f'.standardize("{key}")'
|
1337
|
+
warning_message += f" {colors.yellow(f'{len(syn_mapper)} synonym{s}')} found: {colors.yellow(syn_mapper_print)}\n → curate synonyms via {colors.cyan(hint_msg)}"
|
1338
|
+
if n_non_validated > len(syn_mapper):
|
1339
|
+
if syn_mapper:
|
1340
|
+
warning_message += " for remaining terms:\n"
|
1341
|
+
warning_message += f" → fix typos, remove non-existent values, or save terms via {colors.cyan(non_validated_hint_print)}"
|
1252
1342
|
|
1253
1343
|
if logger.indent == "":
|
1254
1344
|
_log_mapping_info()
|
@@ -1257,6 +1347,44 @@ def validate_categories(
|
|
1257
1347
|
return False, non_validated
|
1258
1348
|
|
1259
1349
|
|
1350
|
+
def standardize_categories(
|
1351
|
+
values: Iterable[str],
|
1352
|
+
field: FieldAttr,
|
1353
|
+
using_key: str | None = None,
|
1354
|
+
organism: str | None = None,
|
1355
|
+
source: Record | None = None,
|
1356
|
+
) -> dict:
|
1357
|
+
"""Get a synonym mapper."""
|
1358
|
+
registry = field.field.model
|
1359
|
+
if not hasattr(registry, "standardize"):
|
1360
|
+
return {}
|
1361
|
+
# standardize values using the default instance
|
1362
|
+
syn_mapper = registry.standardize(
|
1363
|
+
values,
|
1364
|
+
field=field.field.name,
|
1365
|
+
organism=organism,
|
1366
|
+
source=source,
|
1367
|
+
mute=True,
|
1368
|
+
return_mapper=True,
|
1369
|
+
)
|
1370
|
+
|
1371
|
+
if len(values) > len(syn_mapper): # type: ignore
|
1372
|
+
# standardize values using the using_key instance
|
1373
|
+
if using_key is not None and using_key != "default":
|
1374
|
+
registry_using = get_registry_instance(registry, using_key)
|
1375
|
+
syn_mapper.update(
|
1376
|
+
registry_using.standardize(
|
1377
|
+
[v for v in values if v not in syn_mapper],
|
1378
|
+
field=field.field.name,
|
1379
|
+
organism=organism,
|
1380
|
+
source=source,
|
1381
|
+
mute=True,
|
1382
|
+
return_mapper=True,
|
1383
|
+
)
|
1384
|
+
)
|
1385
|
+
return syn_mapper
|
1386
|
+
|
1387
|
+
|
1260
1388
|
def validate_categories_in_df(
|
1261
1389
|
df: pd.DataFrame,
|
1262
1390
|
fields: dict[str, FieldAttr],
|
@@ -1304,15 +1432,15 @@ def save_artifact(
|
|
1304
1432
|
|
1305
1433
|
Args:
|
1306
1434
|
data: The DataFrame or AnnData object to save.
|
1307
|
-
description: A description of the artifact.
|
1308
1435
|
fields: A dictionary mapping obs_column to registry_field.
|
1309
1436
|
columns_field: The registry field to validate variables index against.
|
1437
|
+
description: A description of the artifact.
|
1310
1438
|
organism: The organism name.
|
1311
1439
|
adata: The AnnData object to save and get n_observations, must be provided if data is a path.
|
1312
|
-
type:
|
1313
|
-
key:
|
1314
|
-
revises:
|
1315
|
-
run:
|
1440
|
+
type: The artifact type.
|
1441
|
+
key: A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
|
1442
|
+
revises: Previous version of the artifact. Triggers a revision.
|
1443
|
+
run: The run that creates the artifact.
|
1316
1444
|
|
1317
1445
|
Returns:
|
1318
1446
|
The saved Artifact.
|
@@ -1402,6 +1530,7 @@ def save_artifact(
|
|
1402
1530
|
feature=feature,
|
1403
1531
|
feature_ref_is_name=feature_ref_is_name,
|
1404
1532
|
label_ref_is_name=label_ref_is_name,
|
1533
|
+
from_curator=True,
|
1405
1534
|
)
|
1406
1535
|
|
1407
1536
|
if artifact._accessor == "MuData":
|
@@ -1457,15 +1586,12 @@ def update_registry(
|
|
1457
1586
|
values: list[str],
|
1458
1587
|
field: FieldAttr,
|
1459
1588
|
key: str,
|
1460
|
-
save_function: str = "add_new_from",
|
1461
1589
|
using_key: str | None = None,
|
1462
1590
|
validated_only: bool = True,
|
1463
1591
|
df: pd.DataFrame | None = None,
|
1464
1592
|
organism: str | None = None,
|
1465
1593
|
dtype: str | None = None,
|
1466
1594
|
source: Record | None = None,
|
1467
|
-
standardize: bool = True,
|
1468
|
-
warning: bool = True,
|
1469
1595
|
exclude: str | list | None = None,
|
1470
1596
|
**kwargs,
|
1471
1597
|
) -> None:
|
@@ -1475,13 +1601,13 @@ def update_registry(
|
|
1475
1601
|
values: A list of values to be saved as labels.
|
1476
1602
|
field: The FieldAttr object representing the field for which labels are being saved.
|
1477
1603
|
key: The name of the feature to save.
|
1478
|
-
save_function: The name of the function to save the labels.
|
1479
1604
|
using_key: The name of the instance from which to transfer labels (if applicable).
|
1480
1605
|
validated_only: If True, only save validated labels.
|
1481
1606
|
df: A DataFrame to save labels from.
|
1482
1607
|
organism: The organism name.
|
1483
1608
|
dtype: The type of the feature.
|
1484
1609
|
source: The source record.
|
1610
|
+
exclude: Values to exclude from inspect.
|
1485
1611
|
kwargs: Additional keyword arguments to pass to the registry model to create new records.
|
1486
1612
|
"""
|
1487
1613
|
from lamindb._save import save as ln_save
|
@@ -1490,78 +1616,55 @@ def update_registry(
|
|
1490
1616
|
registry = field.field.model
|
1491
1617
|
filter_kwargs = check_registry_organism(registry, organism)
|
1492
1618
|
filter_kwargs.update({"source": source} if source else {})
|
1619
|
+
if not values:
|
1620
|
+
return
|
1493
1621
|
|
1494
1622
|
verbosity = settings.verbosity
|
1495
1623
|
try:
|
1496
1624
|
settings.verbosity = "error"
|
1625
|
+
labels_saved: dict = {"from public": [], "new": []}
|
1497
1626
|
|
1498
|
-
# save from public
|
1627
|
+
# inspect the default instance and save validated records from public
|
1499
1628
|
filter_kwargs_current = get_current_filter_kwargs(registry, filter_kwargs)
|
1500
|
-
existing_and_public_records = (
|
1501
|
-
|
1502
|
-
list(values),
|
1503
|
-
field=field,
|
1504
|
-
**filter_kwargs_current,
|
1505
|
-
)
|
1506
|
-
if values
|
1507
|
-
else []
|
1629
|
+
existing_and_public_records = registry.from_values(
|
1630
|
+
list(values), field=field, **filter_kwargs_current
|
1508
1631
|
)
|
1509
|
-
|
1510
|
-
|
1511
|
-
|
1632
|
+
existing_and_public_labels = [
|
1633
|
+
getattr(r, field.field.name) for r in existing_and_public_records
|
1634
|
+
]
|
1635
|
+
# public records that are not already in the database
|
1512
1636
|
public_records = [r for r in existing_and_public_records if r._state.adding]
|
1513
1637
|
# here we check to only save the public records if they are from the specified source
|
1514
1638
|
# we check the uid because r.source and source can be from different instances
|
1515
1639
|
if source:
|
1516
1640
|
public_records = [r for r in public_records if r.source.uid == source.uid]
|
1517
|
-
|
1518
|
-
if public_records:
|
1641
|
+
if len(public_records) > 0:
|
1519
1642
|
settings.verbosity = "info"
|
1520
1643
|
logger.info(f"saving validated records of '{key}'")
|
1521
1644
|
settings.verbosity = "error"
|
1522
|
-
|
1523
|
-
|
1524
|
-
|
1645
|
+
ln_save(public_records)
|
1646
|
+
labels_saved["from public"] = [
|
1647
|
+
getattr(r, field.field.name) for r in public_records
|
1648
|
+
]
|
1649
|
+
# non-validated records from the default instance
|
1650
|
+
non_validated_labels = [
|
1651
|
+
i for i in values if i not in existing_and_public_labels
|
1525
1652
|
]
|
1526
|
-
non_public_labels = [i for i in values if i not in labels_saved["from public"]]
|
1527
|
-
|
1528
|
-
# inspect the default instance
|
1529
|
-
inspect_result_current = standardize_and_inspect(
|
1530
|
-
values=non_public_labels,
|
1531
|
-
field=field,
|
1532
|
-
registry=registry,
|
1533
|
-
standardize=standardize,
|
1534
|
-
exclude=exclude,
|
1535
|
-
**filter_kwargs_current,
|
1536
|
-
)
|
1537
|
-
if not inspect_result_current.non_validated:
|
1538
|
-
all_labels = registry.from_values(
|
1539
|
-
inspect_result_current.validated,
|
1540
|
-
field=field,
|
1541
|
-
**filter_kwargs_current,
|
1542
|
-
)
|
1543
|
-
settings.verbosity = verbosity
|
1544
|
-
return all_labels
|
1545
1653
|
|
1546
|
-
# inspect the using_key instance
|
1654
|
+
# inspect and save validated records the using_key instance
|
1547
1655
|
(
|
1548
1656
|
labels_saved[f"from {using_key}"],
|
1549
1657
|
non_validated_labels,
|
1550
1658
|
) = update_registry_from_using_instance(
|
1551
|
-
|
1659
|
+
non_validated_labels,
|
1552
1660
|
field=field,
|
1553
1661
|
using_key=using_key,
|
1554
1662
|
exclude=exclude,
|
1555
1663
|
**filter_kwargs,
|
1556
1664
|
)
|
1557
1665
|
|
1558
|
-
|
1559
|
-
|
1560
|
-
for i in non_validated_labels
|
1561
|
-
if i not in labels_saved[f"from {using_key}"]
|
1562
|
-
]
|
1563
|
-
|
1564
|
-
# save non-validated records
|
1666
|
+
# save non-validated/new records
|
1667
|
+
labels_saved["new"] = non_validated_labels
|
1565
1668
|
if not validated_only:
|
1566
1669
|
non_validated_records = []
|
1567
1670
|
if df is not None and registry == Feature:
|
@@ -1572,7 +1675,7 @@ def update_registry(
|
|
1572
1675
|
# make sure organism record is saved to the current instance
|
1573
1676
|
filter_kwargs["organism"] = _save_organism(name=organism)
|
1574
1677
|
init_kwargs = {}
|
1575
|
-
for value in labels_saved["
|
1678
|
+
for value in labels_saved["new"]:
|
1576
1679
|
init_kwargs[field.field.name] = value
|
1577
1680
|
if registry == Feature:
|
1578
1681
|
init_kwargs["dtype"] = "cat" if dtype is None else dtype
|
@@ -1585,38 +1688,26 @@ def update_registry(
|
|
1585
1688
|
)
|
1586
1689
|
ln_save(non_validated_records)
|
1587
1690
|
|
1588
|
-
# save parent labels for ulabels
|
1691
|
+
# save parent labels for ulabels, for example a parent label "project" for label "project001"
|
1589
1692
|
if registry == ULabel and field.field.name == "name":
|
1590
|
-
|
1591
|
-
|
1592
|
-
# # get all records that are now validated in the current instance
|
1593
|
-
# all_labels = registry.from_values(
|
1594
|
-
# inspect_result_current.validated + inspect_result_current.non_validated,
|
1595
|
-
# field=field,
|
1596
|
-
# **get_current_filter_kwargs(registry, filter_kwargs),
|
1597
|
-
# )
|
1693
|
+
save_ulabels_parent(values, field=field, key=key)
|
1694
|
+
|
1598
1695
|
finally:
|
1599
1696
|
settings.verbosity = verbosity
|
1600
1697
|
|
1601
1698
|
log_saved_labels(
|
1602
1699
|
labels_saved,
|
1603
1700
|
key=key,
|
1604
|
-
save_function=save_function,
|
1605
1701
|
model_field=f"{registry.__name__}.{field.field.name}",
|
1606
1702
|
validated_only=validated_only,
|
1607
|
-
warning=warning,
|
1608
1703
|
)
|
1609
1704
|
|
1610
|
-
# return all_labels
|
1611
|
-
|
1612
1705
|
|
1613
1706
|
def log_saved_labels(
|
1614
1707
|
labels_saved: dict,
|
1615
1708
|
key: str,
|
1616
|
-
save_function: str,
|
1617
1709
|
model_field: str,
|
1618
1710
|
validated_only: bool = True,
|
1619
|
-
warning: bool = True,
|
1620
1711
|
) -> None:
|
1621
1712
|
"""Log the saved labels."""
|
1622
1713
|
from ._from_values import _print_values
|
@@ -1625,45 +1716,26 @@ def log_saved_labels(
|
|
1625
1716
|
for k, labels in labels_saved.items():
|
1626
1717
|
if not labels:
|
1627
1718
|
continue
|
1628
|
-
|
1629
|
-
if k == "without reference" and validated_only:
|
1719
|
+
if k == "new" and validated_only:
|
1630
1720
|
continue
|
1631
|
-
# msg = colors.yellow(
|
1632
|
-
# f"{len(labels)} non-validated values are not saved in {model_field}: {labels}!"
|
1633
|
-
# )
|
1634
|
-
# lookup_print = (
|
1635
|
-
# f"lookup().{key}" if key.isidentifier() else f".lookup()['{key}']"
|
1636
|
-
# )
|
1637
|
-
|
1638
|
-
# hint = f".add_new_from('{key}')"
|
1639
|
-
# msg += f"\n → to lookup values, use {lookup_print}"
|
1640
|
-
# msg += (
|
1641
|
-
# f"\n → to save, run {colors.yellow(hint)}"
|
1642
|
-
# if save_function == "add_new_from"
|
1643
|
-
# else f"\n → to save, run {colors.yellow(save_function)}"
|
1644
|
-
# )
|
1645
|
-
# if warning:
|
1646
|
-
# logger.warning(msg)
|
1647
|
-
# else:
|
1648
|
-
# logger.info(msg)
|
1649
1721
|
else:
|
1650
|
-
k = "" if k == "
|
1722
|
+
k = "" if k == "new" else f"{colors.green(k)} "
|
1651
1723
|
# the term "transferred" stresses that this is always in the context of transferring
|
1652
1724
|
# labels from a public ontology or a different instance to the present instance
|
1653
1725
|
s = "s" if len(labels) > 1 else ""
|
1654
1726
|
logger.success(
|
1655
|
-
f
|
1727
|
+
f'added {len(labels)} record{s} {k}with {model_field} for "{key}": {_print_values(labels)}'
|
1656
1728
|
)
|
1657
1729
|
|
1658
1730
|
|
1659
|
-
def
|
1731
|
+
def save_ulabels_parent(values: list[str], field: FieldAttr, key: str) -> None:
|
1660
1732
|
"""Save a parent label for the given labels."""
|
1661
1733
|
registry = field.field.model
|
1662
1734
|
assert registry == ULabel # noqa: S101
|
1663
1735
|
all_records = registry.from_values(list(values), field=field)
|
1664
|
-
is_feature = registry.filter(name=f"
|
1736
|
+
is_feature = registry.filter(name=f"{key}").one_or_none()
|
1665
1737
|
if is_feature is None:
|
1666
|
-
is_feature = registry(name=f"
|
1738
|
+
is_feature = registry(name=f"{key}").save()
|
1667
1739
|
logger.important(f"Created a parent ULabel: {is_feature}")
|
1668
1740
|
is_feature.children.add(*all_records)
|
1669
1741
|
|
@@ -1672,7 +1744,6 @@ def update_registry_from_using_instance(
|
|
1672
1744
|
values: list[str],
|
1673
1745
|
field: FieldAttr,
|
1674
1746
|
using_key: str | None = None,
|
1675
|
-
standardize: bool = False,
|
1676
1747
|
exclude: str | list | None = None,
|
1677
1748
|
**kwargs,
|
1678
1749
|
) -> tuple[list[str], list[str]]:
|
@@ -1682,7 +1753,6 @@ def update_registry_from_using_instance(
|
|
1682
1753
|
values: A list of values to be saved as labels.
|
1683
1754
|
field: The FieldAttr object representing the field for which labels are being saved.
|
1684
1755
|
using_key: The name of the instance from which to transfer labels (if applicable).
|
1685
|
-
standardize: Whether to also standardize the values.
|
1686
1756
|
kwargs: Additional keyword arguments to pass to the registry model.
|
1687
1757
|
|
1688
1758
|
Returns:
|
@@ -1694,11 +1764,10 @@ def update_registry_from_using_instance(
|
|
1694
1764
|
if using_key is not None and using_key != "default":
|
1695
1765
|
registry_using = get_registry_instance(field.field.model, using_key)
|
1696
1766
|
|
1697
|
-
inspect_result_using =
|
1767
|
+
inspect_result_using = inspect_instance(
|
1698
1768
|
values=values,
|
1699
1769
|
field=field,
|
1700
1770
|
registry=registry_using,
|
1701
|
-
standardize=standardize,
|
1702
1771
|
exclude=exclude,
|
1703
1772
|
**kwargs,
|
1704
1773
|
)
|
@@ -1713,7 +1782,7 @@ def update_registry_from_using_instance(
|
|
1713
1782
|
return labels_saved, not_saved
|
1714
1783
|
|
1715
1784
|
|
1716
|
-
def _save_organism(name: str):
|
1785
|
+
def _save_organism(name: str):
|
1717
1786
|
"""Save an organism record."""
|
1718
1787
|
import bionty as bt
|
1719
1788
|
|
@@ -1722,8 +1791,8 @@ def _save_organism(name: str): # pragma: no cover
|
|
1722
1791
|
organism = bt.Organism.from_source(name=name)
|
1723
1792
|
if organism is None:
|
1724
1793
|
raise ValidationError(
|
1725
|
-
f
|
1726
|
-
f
|
1794
|
+
f'Organism "{name}" not found\n'
|
1795
|
+
f' → please save it: bt.Organism(name="{name}").save()'
|
1727
1796
|
)
|
1728
1797
|
organism.save()
|
1729
1798
|
return organism
|