lamindb 0.69.9__py3-none-any.whl → 0.69.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +1 -1
- lamindb/_annotate.py +461 -126
- lamindb/_artifact.py +46 -9
- lamindb/_can_validate.py +13 -18
- lamindb/_collection.py +32 -40
- lamindb/_feature_set.py +20 -8
- lamindb/_from_values.py +21 -16
- lamindb/_registry.py +7 -2
- lamindb/core/__init__.py +16 -4
- lamindb/core/_data.py +5 -16
- lamindb/core/_feature_manager.py +72 -23
- lamindb/core/_label_manager.py +1 -1
- lamindb/core/_mapped_collection.py +106 -52
- lamindb/core/datasets/_core.py +41 -1
- lamindb/core/storage/_backed_access.py +8 -4
- lamindb/core/storage/file.py +9 -0
- lamindb/core/storage/object.py +19 -0
- {lamindb-0.69.9.dist-info → lamindb-0.69.10.dist-info}/METADATA +6 -7
- {lamindb-0.69.9.dist-info → lamindb-0.69.10.dist-info}/RECORD +21 -21
- {lamindb-0.69.9.dist-info → lamindb-0.69.10.dist-info}/LICENSE +0 -0
- {lamindb-0.69.9.dist-info → lamindb-0.69.10.dist-info}/WHEEL +0 -0
lamindb/_annotate.py
CHANGED
@@ -1,15 +1,17 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING, Iterable
|
3
|
+
from typing import TYPE_CHECKING, Iterable, Optional
|
4
4
|
|
5
5
|
import anndata as ad
|
6
6
|
import lamindb_setup as ln_setup
|
7
7
|
import pandas as pd
|
8
8
|
from lamin_utils import colors, logger
|
9
|
+
from lamindb_setup.core._docs import doc_args
|
9
10
|
from lnschema_core import Artifact, Collection, Feature, Registry, Run, ULabel
|
10
11
|
|
11
12
|
if TYPE_CHECKING:
|
12
13
|
from lnschema_core.types import FieldAttr
|
14
|
+
from mudata import MuData
|
13
15
|
|
14
16
|
|
15
17
|
class ValidationError(ValueError):
|
@@ -85,10 +87,16 @@ class DataFrameAnnotator:
|
|
85
87
|
df: The DataFrame object to annotate.
|
86
88
|
columns: The field attribute for the feature column.
|
87
89
|
categoricals: A dictionary mapping column names to registry_field.
|
88
|
-
For example:
|
89
|
-
``{"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name}``.
|
90
90
|
using: The reference instance containing registries to validate against.
|
91
91
|
verbosity: The verbosity level.
|
92
|
+
organism: The organism name.
|
93
|
+
|
94
|
+
Examples:
|
95
|
+
>>> import bionty as bt
|
96
|
+
>>> annotate = ln.Annotate.from_df(
|
97
|
+
df,
|
98
|
+
categoricals={"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name}
|
99
|
+
)
|
92
100
|
"""
|
93
101
|
|
94
102
|
def __init__(
|
@@ -98,7 +106,7 @@ class DataFrameAnnotator:
|
|
98
106
|
categoricals: dict[str, FieldAttr] | None = None,
|
99
107
|
using: str | None = None,
|
100
108
|
verbosity: str = "hint",
|
101
|
-
|
109
|
+
organism: str | None = None,
|
102
110
|
) -> None:
|
103
111
|
from lamindb.core._settings import settings
|
104
112
|
|
@@ -110,7 +118,7 @@ class DataFrameAnnotator:
|
|
110
118
|
self._artifact = None
|
111
119
|
self._collection = None
|
112
120
|
self._validated = False
|
113
|
-
self._kwargs:
|
121
|
+
self._kwargs = {"organism": organism} if organism else {}
|
114
122
|
self._save_columns()
|
115
123
|
|
116
124
|
@property
|
@@ -119,7 +127,7 @@ class DataFrameAnnotator:
|
|
119
127
|
return self._fields
|
120
128
|
|
121
129
|
def lookup(self, using: str | None = None) -> AnnotateLookup:
|
122
|
-
"""Lookup
|
130
|
+
"""Lookup categories.
|
123
131
|
|
124
132
|
Args:
|
125
133
|
using: The instance where the lookup is performed.
|
@@ -132,7 +140,7 @@ class DataFrameAnnotator:
|
|
132
140
|
using=using or self._using,
|
133
141
|
)
|
134
142
|
|
135
|
-
def _save_columns(self, validated_only: bool = True) -> None:
|
143
|
+
def _save_columns(self, validated_only: bool = True, **kwargs) -> None:
|
136
144
|
"""Save column name records."""
|
137
145
|
missing_columns = set(self.fields.keys()) - set(self._df.columns)
|
138
146
|
if missing_columns:
|
@@ -148,7 +156,7 @@ class DataFrameAnnotator:
|
|
148
156
|
save_function="add_new_from_columns",
|
149
157
|
using=self._using,
|
150
158
|
validated_only=False,
|
151
|
-
kwargs
|
159
|
+
**kwargs,
|
152
160
|
)
|
153
161
|
|
154
162
|
# Save the rest of the columns based on validated_only
|
@@ -162,36 +170,47 @@ class DataFrameAnnotator:
|
|
162
170
|
using=self._using,
|
163
171
|
validated_only=validated_only,
|
164
172
|
df=self._df, # Get the Feature type from df
|
165
|
-
kwargs
|
173
|
+
**kwargs,
|
166
174
|
)
|
167
175
|
|
168
|
-
def add_validated_from(self, key: str,
|
176
|
+
def add_validated_from(self, key: str, organism: str | None = None):
|
169
177
|
"""Add validated categories.
|
170
178
|
|
171
179
|
Args:
|
172
180
|
key: The key referencing the slot in the DataFrame.
|
173
|
-
|
181
|
+
organism: The organism name.
|
174
182
|
"""
|
175
|
-
self.
|
183
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
184
|
+
self._update_registry(key, validated_only=True, **self._kwargs)
|
176
185
|
|
177
|
-
def add_new_from(self, key: str, **kwargs):
|
186
|
+
def add_new_from(self, key: str, organism: str | None = None, **kwargs):
|
178
187
|
"""Add validated & new categories.
|
179
188
|
|
180
189
|
Args:
|
181
190
|
key: The key referencing the slot in the DataFrame from which to draw terms.
|
182
|
-
|
191
|
+
organism: The organism name.
|
192
|
+
**kwargs: Additional keyword arguments to pass to the registry model.
|
183
193
|
"""
|
184
|
-
|
194
|
+
if len(kwargs) > 0 and key == "all":
|
195
|
+
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
196
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
197
|
+
self._update_registry(key, validated_only=False, **self._kwargs, **kwargs)
|
198
|
+
|
199
|
+
def add_new_from_columns(self, organism: str | None = None, **kwargs):
|
200
|
+
"""Add validated & new column names to its registry.
|
185
201
|
|
186
|
-
|
187
|
-
|
188
|
-
|
202
|
+
Args:
|
203
|
+
organism: The organism name.
|
204
|
+
**kwargs: Additional keyword arguments to pass to the registry model.
|
205
|
+
"""
|
206
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
207
|
+
self._save_columns(validated_only=False, **self._kwargs, **kwargs)
|
189
208
|
|
190
209
|
def _update_registry(self, categorical: str, validated_only: bool = True, **kwargs):
|
191
210
|
if categorical == "all":
|
192
211
|
self._update_registry_all(validated_only=validated_only, **kwargs)
|
193
212
|
elif categorical == "columns":
|
194
|
-
self._save_columns(validated_only=validated_only)
|
213
|
+
self._save_columns(validated_only=validated_only, **kwargs)
|
195
214
|
else:
|
196
215
|
if categorical not in self.fields:
|
197
216
|
raise ValueError(f"Feature {categorical} is not part of the fields!")
|
@@ -201,7 +220,7 @@ class DataFrameAnnotator:
|
|
201
220
|
key=categorical,
|
202
221
|
using=self._using,
|
203
222
|
validated_only=validated_only,
|
204
|
-
kwargs
|
223
|
+
**kwargs,
|
205
224
|
)
|
206
225
|
|
207
226
|
def _update_registry_all(self, validated_only: bool = True, **kwargs):
|
@@ -210,13 +229,13 @@ class DataFrameAnnotator:
|
|
210
229
|
logger.info(f"saving labels for '{name}'")
|
211
230
|
self._update_registry(name, validated_only=validated_only, **kwargs)
|
212
231
|
|
213
|
-
def validate(self,
|
232
|
+
def validate(self, organism: str | None = None) -> bool:
|
214
233
|
"""Validate variables and categorical observations.
|
215
234
|
|
216
235
|
Returns:
|
217
236
|
Whether the DataFrame is validated.
|
218
237
|
"""
|
219
|
-
self._kwargs.update(
|
238
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
220
239
|
self._validated = validate_categories_in_df(
|
221
240
|
self._df,
|
222
241
|
fields=self.fields,
|
@@ -237,7 +256,6 @@ class DataFrameAnnotator:
|
|
237
256
|
"""
|
238
257
|
from lamindb.core._settings import settings
|
239
258
|
|
240
|
-
self._kwargs.update(kwargs)
|
241
259
|
if not self._validated:
|
242
260
|
raise ValidationError(
|
243
261
|
f"Data object is not validated, please run {colors.yellow('validate()')}!"
|
@@ -255,6 +273,7 @@ class DataFrameAnnotator:
|
|
255
273
|
description=description,
|
256
274
|
fields=self.fields,
|
257
275
|
columns_field=self._columns_field,
|
276
|
+
**kwargs,
|
258
277
|
**self._kwargs,
|
259
278
|
)
|
260
279
|
finally:
|
@@ -314,9 +333,18 @@ class AnnDataAnnotator(DataFrameAnnotator):
|
|
314
333
|
adata: The AnnData object to annotate.
|
315
334
|
var_index: The registry field for mapping the ``.var`` index.
|
316
335
|
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
317
|
-
For example:
|
318
|
-
``{"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name}``
|
319
336
|
using: A reference LaminDB instance.
|
337
|
+
verbosity: The verbosity level.
|
338
|
+
organism: The organism name.
|
339
|
+
|
340
|
+
Examples:
|
341
|
+
>>> import bionty as bt
|
342
|
+
>>> annotate = ln.Annotate.from_anndata(
|
343
|
+
adata,
|
344
|
+
var_index=bt.Gene.ensembl_gene_id,
|
345
|
+
categoricals={"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name},
|
346
|
+
organism="human",
|
347
|
+
)
|
320
348
|
"""
|
321
349
|
|
322
350
|
def __init__(
|
@@ -326,7 +354,7 @@ class AnnDataAnnotator(DataFrameAnnotator):
|
|
326
354
|
categoricals: dict[str, FieldAttr],
|
327
355
|
using: str = "default",
|
328
356
|
verbosity: str = "hint",
|
329
|
-
|
357
|
+
organism: str | None = None,
|
330
358
|
) -> None:
|
331
359
|
self._adata = adata
|
332
360
|
self._var_field = var_index
|
@@ -335,10 +363,10 @@ class AnnDataAnnotator(DataFrameAnnotator):
|
|
335
363
|
categoricals=categoricals,
|
336
364
|
using=using,
|
337
365
|
verbosity=verbosity,
|
338
|
-
|
366
|
+
organism=organism,
|
339
367
|
)
|
340
368
|
self._obs_fields = categoricals
|
341
|
-
self._save_from_var_index()
|
369
|
+
self._save_from_var_index(validated_only=True, **self._kwargs)
|
342
370
|
|
343
371
|
@property
|
344
372
|
def var_index(self) -> FieldAttr:
|
@@ -351,16 +379,23 @@ class AnnDataAnnotator(DataFrameAnnotator):
|
|
351
379
|
return self._obs_fields
|
352
380
|
|
353
381
|
def lookup(self, using: str | None = None) -> AnnotateLookup:
|
354
|
-
"""Lookup
|
382
|
+
"""Lookup categories.
|
383
|
+
|
384
|
+
Args:
|
385
|
+
using: The instance where the lookup is performed.
|
386
|
+
if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
|
387
|
+
if "public", the lookup is performed on the public reference.
|
388
|
+
"""
|
355
389
|
return AnnotateLookup(
|
356
390
|
categorials=self._obs_fields,
|
357
391
|
slots={"columns": self._columns_field, "var_index": self._var_field},
|
358
392
|
using=using or self._using,
|
359
393
|
)
|
360
394
|
|
361
|
-
def _save_from_var_index(
|
395
|
+
def _save_from_var_index(
|
396
|
+
self, validated_only: bool = True, organism: str | None = None
|
397
|
+
):
|
362
398
|
"""Save variable records."""
|
363
|
-
self._kwargs.update(kwargs)
|
364
399
|
update_registry(
|
365
400
|
values=self._adata.var.index,
|
366
401
|
field=self.var_index,
|
@@ -368,35 +403,56 @@ class AnnDataAnnotator(DataFrameAnnotator):
|
|
368
403
|
save_function="add_new_from_var_index",
|
369
404
|
using=self._using,
|
370
405
|
validated_only=validated_only,
|
371
|
-
|
406
|
+
organism=organism,
|
372
407
|
)
|
373
408
|
|
374
|
-
def add_new_from_var_index(self, **kwargs):
|
375
|
-
"""Update variable records.
|
376
|
-
self._save_from_var_index(validated_only=False, **kwargs)
|
409
|
+
def add_new_from_var_index(self, organism: str | None = None, **kwargs):
|
410
|
+
"""Update variable records.
|
377
411
|
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
412
|
+
Args:
|
413
|
+
organism: The organism name.
|
414
|
+
**kwargs: Additional keyword arguments to pass to the registry model.
|
415
|
+
"""
|
416
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
417
|
+
self._save_from_var_index(validated_only=False, **self._kwargs, **kwargs)
|
418
|
+
|
419
|
+
def validate(self, organism: str | None = None) -> bool:
|
420
|
+
"""Validate categories.
|
421
|
+
|
422
|
+
Args:
|
423
|
+
organism: The organism name.
|
424
|
+
|
425
|
+
Returns:
|
426
|
+
Whether the AnnData object is validated.
|
427
|
+
"""
|
428
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
429
|
+
if self._using is not None and self._using != "default":
|
430
|
+
logger.important(
|
431
|
+
f"validating metadata using registries of instance {colors.italic(self._using)}"
|
432
|
+
)
|
433
|
+
validated_var = validate_categories(
|
434
|
+
self._adata.var.index,
|
435
|
+
field=self._var_field,
|
436
|
+
key="var_index",
|
437
|
+
using=self._using,
|
385
438
|
**self._kwargs,
|
386
439
|
)
|
440
|
+
validated_obs = validate_categories_in_df(
|
441
|
+
self._adata.obs, fields=self.categoricals, using=self._using, **self._kwargs
|
442
|
+
)
|
443
|
+
self._validated = validated_var and validated_obs
|
387
444
|
return self._validated
|
388
445
|
|
389
446
|
def save_artifact(self, description: str, **kwargs) -> Artifact:
|
390
|
-
"""Save the validated AnnData and metadata.
|
447
|
+
"""Save the validated ``AnnData`` and metadata.
|
391
448
|
|
392
449
|
Args:
|
393
|
-
description: Description of the AnnData object.
|
450
|
+
description: Description of the ``AnnData`` object.
|
394
451
|
**kwargs: Object level metadata.
|
395
452
|
|
396
453
|
Returns:
|
397
454
|
A saved artifact record.
|
398
455
|
"""
|
399
|
-
self._kwargs.update(kwargs)
|
400
456
|
if not self._validated:
|
401
457
|
raise ValidationError("Please run `validate()` first!")
|
402
458
|
|
@@ -406,6 +462,261 @@ class AnnDataAnnotator(DataFrameAnnotator):
|
|
406
462
|
columns_field=self.var_index,
|
407
463
|
fields=self.categoricals,
|
408
464
|
**self._kwargs,
|
465
|
+
**kwargs,
|
466
|
+
)
|
467
|
+
return self._artifact
|
468
|
+
|
469
|
+
|
470
|
+
class MuDataAnnotator:
|
471
|
+
"""Annotation flow for a ``MuData`` object.
|
472
|
+
|
473
|
+
Args:
|
474
|
+
mdata: The MuData object to annotate.
|
475
|
+
var_index: The registry field for mapping the ``.var`` index for each modality.
|
476
|
+
For example:
|
477
|
+
``{"modality_1": bt.Gene.ensembl_gene_id, "modality_2": ln.CellMarker.name}``
|
478
|
+
categoricals: A dictionary mapping ``.obs.columns`` to a registry field.
|
479
|
+
using: A reference LaminDB instance.
|
480
|
+
verbosity: The verbosity level.
|
481
|
+
organism: The organism name.
|
482
|
+
|
483
|
+
Examples:
|
484
|
+
>>> import bionty as bt
|
485
|
+
>>> annotate = ln.Annotate.from_mudata(
|
486
|
+
mdata,
|
487
|
+
var_index={"rna": bt.Gene.ensembl_gene_id, "adt": ln.CellMarker.name},
|
488
|
+
categoricals={"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name},
|
489
|
+
organism="human",
|
490
|
+
)
|
491
|
+
"""
|
492
|
+
|
493
|
+
def __init__(
|
494
|
+
self,
|
495
|
+
mdata: MuData,
|
496
|
+
var_index: dict[str, dict[str, FieldAttr]],
|
497
|
+
categoricals: dict[str, FieldAttr],
|
498
|
+
using: str = "default",
|
499
|
+
verbosity: str = "hint",
|
500
|
+
organism: str | None = None,
|
501
|
+
) -> None:
|
502
|
+
self._mdata = mdata
|
503
|
+
self._kwargs = {"organism": organism} if organism else {}
|
504
|
+
self._var_fields = var_index
|
505
|
+
self._verify_modality(self._var_fields.keys())
|
506
|
+
self._obs_fields = self._parse_categoricals(categoricals)
|
507
|
+
self._modalities = set(self._var_fields.keys()) | set(self._obs_fields.keys())
|
508
|
+
self._using = using
|
509
|
+
self._verbosity = verbosity
|
510
|
+
self._df_annotators = {
|
511
|
+
modality: DataFrameAnnotator(
|
512
|
+
df=mdata[modality].obs if modality != "obs" else mdata.obs,
|
513
|
+
categoricals=self._obs_fields.get(modality, {}),
|
514
|
+
using=using,
|
515
|
+
verbosity=verbosity,
|
516
|
+
**self._kwargs,
|
517
|
+
)
|
518
|
+
for modality in self._modalities
|
519
|
+
}
|
520
|
+
for modality in self._var_fields.keys():
|
521
|
+
self._save_from_var_index_modality(
|
522
|
+
modality=modality, validated_only=True, **self._kwargs
|
523
|
+
)
|
524
|
+
|
525
|
+
@property
|
526
|
+
def var_index(self) -> FieldAttr:
|
527
|
+
"""Return the registry field to validate variables index against."""
|
528
|
+
return self._var_fields
|
529
|
+
|
530
|
+
@property
|
531
|
+
def categoricals(self) -> dict:
|
532
|
+
"""Return the obs fields to validate against."""
|
533
|
+
return self._obs_fields
|
534
|
+
|
535
|
+
def _verify_modality(self, modalities: Iterable[str]):
|
536
|
+
"""Verify the modality exists."""
|
537
|
+
for modality in modalities:
|
538
|
+
if modality not in self._mdata.mod.keys():
|
539
|
+
raise ValueError(f"modality '{modality}' does not exist!")
|
540
|
+
|
541
|
+
def _save_from_var_index_modality(
|
542
|
+
self, modality: str, validated_only: bool = True, **kwargs
|
543
|
+
):
|
544
|
+
"""Save variable records."""
|
545
|
+
update_registry(
|
546
|
+
values=self._mdata[modality].var.index,
|
547
|
+
field=self._var_fields[modality],
|
548
|
+
key="var_index",
|
549
|
+
save_function="add_new_from_var_index",
|
550
|
+
using=self._using,
|
551
|
+
validated_only=validated_only,
|
552
|
+
type="number",
|
553
|
+
**kwargs,
|
554
|
+
)
|
555
|
+
|
556
|
+
def _parse_categoricals(self, categoricals: dict[str, FieldAttr]) -> dict:
|
557
|
+
"""Parse the categorical fields."""
|
558
|
+
prefixes = {f"{k}:" for k in self._mdata.mod.keys()}
|
559
|
+
obs_fields: dict[str, dict[str, FieldAttr]] = {}
|
560
|
+
for k, v in categoricals.items():
|
561
|
+
if k not in self._mdata.obs.columns:
|
562
|
+
raise ValueError(f"column '{k}' does not exist in mdata.obs!")
|
563
|
+
if any(k.startswith(prefix) for prefix in prefixes):
|
564
|
+
modality, col = k.split(":")[0], k.split(":")[1]
|
565
|
+
if modality not in obs_fields.keys():
|
566
|
+
obs_fields[modality] = {}
|
567
|
+
obs_fields[modality][col] = v
|
568
|
+
else:
|
569
|
+
if "obs" not in obs_fields.keys():
|
570
|
+
obs_fields["obs"] = {}
|
571
|
+
obs_fields["obs"][k] = v
|
572
|
+
return obs_fields
|
573
|
+
|
574
|
+
def lookup(self, using: str | None = None) -> AnnotateLookup:
|
575
|
+
"""Lookup categories.
|
576
|
+
|
577
|
+
Args:
|
578
|
+
using: The instance where the lookup is performed.
|
579
|
+
if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
|
580
|
+
if "public", the lookup is performed on the public reference.
|
581
|
+
"""
|
582
|
+
return AnnotateLookup(
|
583
|
+
categorials=self._obs_fields,
|
584
|
+
slots={
|
585
|
+
**self._obs_fields,
|
586
|
+
**{f"{k}_var_index": v for k, v in self._var_fields.items()},
|
587
|
+
},
|
588
|
+
using=using or self._using,
|
589
|
+
)
|
590
|
+
|
591
|
+
def add_new_from_columns(
|
592
|
+
self,
|
593
|
+
modality: str,
|
594
|
+
column_names: list[str] | None = None,
|
595
|
+
organism: str | None = None,
|
596
|
+
**kwargs,
|
597
|
+
):
|
598
|
+
"""Update columns records.
|
599
|
+
|
600
|
+
Args:
|
601
|
+
modality: The modality name.
|
602
|
+
column_names: The column names to save.
|
603
|
+
organism: The organism name.
|
604
|
+
**kwargs: Additional keyword arguments to pass to the registry model.
|
605
|
+
"""
|
606
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
607
|
+
update_registry(
|
608
|
+
values=column_names or self._mdata[modality].obs.columns,
|
609
|
+
field=Feature.name,
|
610
|
+
key=f"{modality} obs columns",
|
611
|
+
using=self._using,
|
612
|
+
validated_only=False,
|
613
|
+
df=self._mdata[modality].obs,
|
614
|
+
**self._kwargs,
|
615
|
+
**kwargs,
|
616
|
+
)
|
617
|
+
|
618
|
+
def add_new_from_var_index(
|
619
|
+
self, modality: str, organism: str | None = None, **kwargs
|
620
|
+
):
|
621
|
+
"""Update variable records.
|
622
|
+
|
623
|
+
Args:
|
624
|
+
modality: The modality name.
|
625
|
+
organism: The organism name.
|
626
|
+
**kwargs: Additional keyword arguments to pass to the registry model.
|
627
|
+
"""
|
628
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
629
|
+
self._save_from_var_index_modality(
|
630
|
+
modality=modality, validated_only=False, **self._kwargs, **kwargs
|
631
|
+
)
|
632
|
+
|
633
|
+
def add_validated_from(
|
634
|
+
self, key: str, modality: str | None = None, organism: str | None = None
|
635
|
+
):
|
636
|
+
"""Add validated categories.
|
637
|
+
|
638
|
+
Args:
|
639
|
+
key: The key referencing the slot in the DataFrame.
|
640
|
+
modality: The modality name.
|
641
|
+
organism: The organism name.
|
642
|
+
"""
|
643
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
644
|
+
modality = modality or "obs"
|
645
|
+
if modality in self._df_annotators:
|
646
|
+
df_annotator = self._df_annotators[modality]
|
647
|
+
df_annotator.add_validated_from(key=key, **self._kwargs)
|
648
|
+
|
649
|
+
def add_new_from(
|
650
|
+
self,
|
651
|
+
key: str,
|
652
|
+
modality: str | None = None,
|
653
|
+
organism: str | None = None,
|
654
|
+
**kwargs,
|
655
|
+
):
|
656
|
+
"""Add validated & new categories.
|
657
|
+
|
658
|
+
Args:
|
659
|
+
key: The key referencing the slot in the DataFrame.
|
660
|
+
modality: The modality name.
|
661
|
+
organism: The organism name.
|
662
|
+
**kwargs: Additional keyword arguments to pass to the registry model.
|
663
|
+
"""
|
664
|
+
if len(kwargs) > 0 and key == "all":
|
665
|
+
raise ValueError("Cannot pass additional arguments to 'all' key!")
|
666
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
667
|
+
modality = modality or "obs"
|
668
|
+
if modality in self._df_annotators:
|
669
|
+
df_annotator = self._df_annotators[modality]
|
670
|
+
df_annotator.add_new_from(key=key, **self._kwargs, **kwargs)
|
671
|
+
|
672
|
+
def validate(self, organism: str | None = None) -> bool:
|
673
|
+
"""Validate categories."""
|
674
|
+
self._kwargs.update({"organism": organism} if organism else {})
|
675
|
+
if self._using is not None and self._using != "default":
|
676
|
+
logger.important(
|
677
|
+
f"validating metadata using registries of instance {colors.italic(self._using)}"
|
678
|
+
)
|
679
|
+
validated_var = True
|
680
|
+
for modality, var_field in self._var_fields.items():
|
681
|
+
validated_var &= validate_categories(
|
682
|
+
self._mdata[modality].var.index,
|
683
|
+
field=var_field,
|
684
|
+
key=f"{modality}_var_index",
|
685
|
+
using=self._using,
|
686
|
+
**self._kwargs,
|
687
|
+
)
|
688
|
+
validated_obs = True
|
689
|
+
for modality, fields in self._obs_fields.items():
|
690
|
+
if modality == "obs":
|
691
|
+
obs = self._mdata.obs
|
692
|
+
else:
|
693
|
+
obs = self._mdata[modality].obs
|
694
|
+
validated_obs &= validate_categories_in_df(
|
695
|
+
obs, fields=fields, using=self._using, **self._kwargs
|
696
|
+
)
|
697
|
+
self._validated = validated_var and validated_obs
|
698
|
+
return self._validated
|
699
|
+
|
700
|
+
def save_artifact(self, description: str, **kwargs) -> Artifact:
|
701
|
+
"""Save the validated ``MuData`` and metadata.
|
702
|
+
|
703
|
+
Args:
|
704
|
+
description: Description of the ``MuData`` object.
|
705
|
+
**kwargs: Object level metadata.
|
706
|
+
|
707
|
+
Returns:
|
708
|
+
A saved artifact record.
|
709
|
+
"""
|
710
|
+
if not self._validated:
|
711
|
+
raise ValidationError("Please run `validate()` first!")
|
712
|
+
|
713
|
+
self._artifact = save_artifact(
|
714
|
+
self._mdata,
|
715
|
+
description=description,
|
716
|
+
columns_field=self.var_index,
|
717
|
+
fields=self.categoricals,
|
718
|
+
**self._kwargs,
|
719
|
+
**kwargs,
|
409
720
|
)
|
410
721
|
return self._artifact
|
411
722
|
|
@@ -414,6 +725,7 @@ class Annotate:
|
|
414
725
|
"""Annotation flow."""
|
415
726
|
|
416
727
|
@classmethod
|
728
|
+
@doc_args(DataFrameAnnotator.__doc__)
|
417
729
|
def from_df(
|
418
730
|
cls,
|
419
731
|
df: pd.DataFrame,
|
@@ -421,18 +733,20 @@ class Annotate:
|
|
421
733
|
columns: FieldAttr = Feature.name,
|
422
734
|
using: str | None = None,
|
423
735
|
verbosity: str = "hint",
|
424
|
-
|
736
|
+
organism: str | None = None,
|
425
737
|
) -> DataFrameAnnotator:
|
738
|
+
"""{}."""
|
426
739
|
return DataFrameAnnotator(
|
427
740
|
df=df,
|
428
741
|
categoricals=categoricals,
|
429
742
|
columns=columns,
|
430
743
|
using=using,
|
431
744
|
verbosity=verbosity,
|
432
|
-
|
745
|
+
organism=organism,
|
433
746
|
)
|
434
747
|
|
435
748
|
@classmethod
|
749
|
+
@doc_args(AnnDataAnnotator.__doc__)
|
436
750
|
def from_anndata(
|
437
751
|
cls,
|
438
752
|
adata: ad.AnnData,
|
@@ -440,15 +754,37 @@ class Annotate:
|
|
440
754
|
categoricals: dict[str, FieldAttr],
|
441
755
|
using: str = "default",
|
442
756
|
verbosity: str = "hint",
|
443
|
-
|
757
|
+
organism: str | None = None,
|
444
758
|
) -> AnnDataAnnotator:
|
759
|
+
"""{}."""
|
445
760
|
return AnnDataAnnotator(
|
446
761
|
adata=adata,
|
447
762
|
var_index=var_index,
|
448
763
|
categoricals=categoricals,
|
449
764
|
using=using,
|
450
765
|
verbosity=verbosity,
|
451
|
-
|
766
|
+
organism=organism,
|
767
|
+
)
|
768
|
+
|
769
|
+
@classmethod
|
770
|
+
@doc_args(MuDataAnnotator.__doc__)
|
771
|
+
def from_mudata(
|
772
|
+
cls,
|
773
|
+
mdata: MuData,
|
774
|
+
var_index: dict[str, dict[str, FieldAttr]],
|
775
|
+
categoricals: dict[str, dict[str, FieldAttr]],
|
776
|
+
using: str = "default",
|
777
|
+
verbosity: str = "hint",
|
778
|
+
organism: str | None = None,
|
779
|
+
) -> MuDataAnnotator:
|
780
|
+
"""{}."""
|
781
|
+
return MuDataAnnotator(
|
782
|
+
mdata=mdata,
|
783
|
+
var_index=var_index,
|
784
|
+
categoricals=categoricals,
|
785
|
+
using=using,
|
786
|
+
verbosity=verbosity,
|
787
|
+
organism=organism,
|
452
788
|
)
|
453
789
|
|
454
790
|
|
@@ -468,9 +804,7 @@ def standardize_and_inspect(
|
|
468
804
|
return registry.inspect(values, field=field, mute=True, **kwargs)
|
469
805
|
|
470
806
|
|
471
|
-
def check_registry_organism(
|
472
|
-
registry: Registry, organism: str | None = None
|
473
|
-
) -> str | None:
|
807
|
+
def check_registry_organism(registry: Registry, organism: str | None = None) -> dict:
|
474
808
|
"""Check if a registry needs an organism and return the organism name."""
|
475
809
|
if hasattr(registry, "organism_id"):
|
476
810
|
import bionty as bt
|
@@ -480,8 +814,8 @@ def check_registry_organism(
|
|
480
814
|
f"{registry.__name__} registry requires an organism!\n"
|
481
815
|
" → please pass an organism name via organism="
|
482
816
|
)
|
483
|
-
return organism or bt.settings.organism.name
|
484
|
-
return
|
817
|
+
return {"organism": organism or bt.settings.organism.name}
|
818
|
+
return {}
|
485
819
|
|
486
820
|
|
487
821
|
def validate_categories(
|
@@ -489,22 +823,21 @@ def validate_categories(
|
|
489
823
|
field: FieldAttr,
|
490
824
|
key: str,
|
491
825
|
using: str | None = None,
|
492
|
-
|
826
|
+
organism: str | None = None,
|
493
827
|
) -> bool:
|
494
828
|
"""Validate ontology terms in a pandas series using LaminDB registries."""
|
495
829
|
from lamindb._from_values import _print_values
|
496
830
|
from lamindb.core._settings import settings
|
497
831
|
|
498
832
|
model_field = f"{field.field.model.__name__}.{field.field.name}"
|
499
|
-
|
500
|
-
|
501
|
-
|
833
|
+
|
834
|
+
def _log_mapping_info():
|
835
|
+
logger.indent = ""
|
836
|
+
logger.info(f"mapping {colors.italic(key)} on {colors.italic(model_field)}")
|
837
|
+
logger.indent = " "
|
502
838
|
|
503
839
|
registry = field.field.model
|
504
|
-
filter_kwargs =
|
505
|
-
organism = check_registry_organism(registry, kwargs.get("organism"))
|
506
|
-
if organism is not None:
|
507
|
-
filter_kwargs["organism"] = organism
|
840
|
+
filter_kwargs = check_registry_organism(registry, organism)
|
508
841
|
|
509
842
|
# Inspect the default instance
|
510
843
|
inspect_result = standardize_and_inspect(
|
@@ -537,6 +870,7 @@ def validate_categories(
|
|
537
870
|
validated_hint_print = f".add_validated_from('{key}')"
|
538
871
|
n_validated = len(values_validated)
|
539
872
|
if n_validated > 0:
|
873
|
+
_log_mapping_info()
|
540
874
|
logger.warning(
|
541
875
|
f"found {colors.yellow(f'{n_validated} terms')} validated terms: "
|
542
876
|
f"{colors.yellow(values_validated)}\n → save terms via "
|
@@ -547,7 +881,8 @@ def validate_categories(
|
|
547
881
|
non_validated = [i for i in non_validated if i not in values_validated]
|
548
882
|
n_non_validated = len(non_validated)
|
549
883
|
if n_non_validated == 0:
|
550
|
-
logger.
|
884
|
+
logger.indent = ""
|
885
|
+
logger.success(f"{key} is validated against {colors.italic(model_field)}")
|
551
886
|
return True
|
552
887
|
else:
|
553
888
|
are = "are" if n_non_validated > 1 else "is"
|
@@ -557,6 +892,8 @@ def validate_categories(
|
|
557
892
|
f"{colors.yellow(print_values)}\n → save terms via "
|
558
893
|
f"{colors.yellow(non_validated_hint_print)}"
|
559
894
|
)
|
895
|
+
if logger.indent == "":
|
896
|
+
_log_mapping_info()
|
560
897
|
logger.warning(warning_message)
|
561
898
|
logger.indent = ""
|
562
899
|
return False
|
@@ -581,37 +918,12 @@ def validate_categories_in_df(
|
|
581
918
|
return validated
|
582
919
|
|
583
920
|
|
584
|
-
def validate_anndata(
|
585
|
-
adata: ad.AnnData,
|
586
|
-
var_field: FieldAttr,
|
587
|
-
obs_fields: dict[str, FieldAttr],
|
588
|
-
using: str | None = None,
|
589
|
-
**kwargs,
|
590
|
-
) -> bool:
|
591
|
-
"""Inspect metadata in an AnnData object using LaminDB registries."""
|
592
|
-
if using is not None and using != "default":
|
593
|
-
logger.important(
|
594
|
-
f"validating metadata using registries of instance {colors.italic(using)}"
|
595
|
-
)
|
596
|
-
|
597
|
-
validated_var = validate_categories(
|
598
|
-
adata.var.index,
|
599
|
-
field=var_field,
|
600
|
-
key="var_index",
|
601
|
-
using=using,
|
602
|
-
**kwargs,
|
603
|
-
)
|
604
|
-
validated_obs = validate_categories_in_df(
|
605
|
-
adata.obs, fields=obs_fields, using=using, **kwargs
|
606
|
-
)
|
607
|
-
return validated_var and validated_obs
|
608
|
-
|
609
|
-
|
610
921
|
def save_artifact(
|
611
|
-
data: pd.DataFrame | ad.AnnData,
|
922
|
+
data: pd.DataFrame | ad.AnnData | MuData,
|
612
923
|
description: str,
|
613
|
-
fields: dict[str, FieldAttr],
|
614
|
-
columns_field: FieldAttr,
|
924
|
+
fields: dict[str, FieldAttr] | dict[str, dict[str, FieldAttr]],
|
925
|
+
columns_field: FieldAttr | dict[str, FieldAttr],
|
926
|
+
organism: str | None = None,
|
615
927
|
**kwargs,
|
616
928
|
) -> Artifact:
|
617
929
|
"""Save all metadata with an Artifact.
|
@@ -621,43 +933,67 @@ def save_artifact(
|
|
621
933
|
description: A description of the artifact.
|
622
934
|
fields: A dictionary mapping obs_column to registry_field.
|
623
935
|
columns_field: The registry field to validate variables index against.
|
936
|
+
organism: The organism name.
|
624
937
|
kwargs: Additional keyword arguments to pass to the registry model.
|
625
938
|
|
626
939
|
Returns:
|
627
940
|
The saved Artifact.
|
628
941
|
"""
|
942
|
+
artifact = None
|
629
943
|
if isinstance(data, ad.AnnData):
|
630
|
-
artifact = Artifact.from_anndata(data, description=description)
|
944
|
+
artifact = Artifact.from_anndata(data, description=description, **kwargs)
|
631
945
|
artifact.n_observations = data.n_obs
|
632
946
|
elif isinstance(data, pd.DataFrame):
|
633
|
-
artifact = Artifact.from_df(data, description=description)
|
947
|
+
artifact = Artifact.from_df(data, description=description, **kwargs)
|
634
948
|
else:
|
635
|
-
|
949
|
+
try:
|
950
|
+
from mudata import MuData
|
951
|
+
|
952
|
+
if isinstance(data, MuData):
|
953
|
+
artifact = Artifact.from_mudata(data, description=description, **kwargs)
|
954
|
+
artifact.n_observations = data.n_obs
|
955
|
+
except ImportError:
|
956
|
+
pass
|
957
|
+
if artifact is None:
|
958
|
+
raise ValueError("data must be a DataFrame, AnnData or MuData object")
|
636
959
|
artifact.save()
|
637
960
|
|
638
|
-
feature_kwargs
|
639
|
-
|
640
|
-
|
961
|
+
feature_kwargs = check_registry_organism(
|
962
|
+
(
|
963
|
+
list(columns_field.values())[0].field.model
|
964
|
+
if isinstance(columns_field, dict)
|
965
|
+
else columns_field.field.model
|
966
|
+
),
|
967
|
+
organism,
|
641
968
|
)
|
642
|
-
if organism is not None:
|
643
|
-
feature_kwargs["organism"] = organism
|
644
969
|
|
645
|
-
if
|
970
|
+
if artifact.accessor == "DataFrame":
|
971
|
+
artifact.features.add_from_df(field=columns_field, **feature_kwargs)
|
972
|
+
elif artifact.accessor == "AnnData":
|
646
973
|
artifact.features.add_from_anndata(var_field=columns_field, **feature_kwargs)
|
974
|
+
elif artifact.accessor == "MuData":
|
975
|
+
artifact.features.add_from_mudata(var_fields=columns_field, **feature_kwargs)
|
647
976
|
else:
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
977
|
+
raise NotImplementedError
|
978
|
+
|
979
|
+
def _add_labels(data, artifact: Artifact, fields: dict[str, FieldAttr]):
|
980
|
+
features = Feature.lookup().dict()
|
981
|
+
for key, field in fields.items():
|
982
|
+
feature = features.get(key)
|
983
|
+
registry = field.field.model
|
984
|
+
filter_kwargs = check_registry_organism(registry, organism)
|
985
|
+
df = data if isinstance(data, pd.DataFrame) else data.obs
|
986
|
+
labels = registry.from_values(df[key], field=field, **filter_kwargs)
|
987
|
+
artifact.labels.add(labels, feature)
|
988
|
+
|
989
|
+
if artifact.accessor == "MuData":
|
990
|
+
for modality, modality_fields in fields.items():
|
991
|
+
if modality == "obs":
|
992
|
+
_add_labels(data, artifact, modality_fields)
|
993
|
+
else:
|
994
|
+
_add_labels(data[modality], artifact, modality_fields)
|
995
|
+
else:
|
996
|
+
_add_labels(data, artifact, fields)
|
661
997
|
|
662
998
|
slug = ln_setup.settings.instance.slug
|
663
999
|
if ln_setup.settings.instance.is_remote:
|
@@ -672,8 +1008,10 @@ def update_registry(
|
|
672
1008
|
save_function: str = "add_new_from",
|
673
1009
|
using: str | None = None,
|
674
1010
|
validated_only: bool = True,
|
675
|
-
kwargs: dict | None = None,
|
676
1011
|
df: pd.DataFrame | None = None,
|
1012
|
+
organism: str | None = None,
|
1013
|
+
type: str | None = None,
|
1014
|
+
**kwargs,
|
677
1015
|
) -> None:
|
678
1016
|
"""Save features or labels records in the default instance from the using instance.
|
679
1017
|
|
@@ -684,18 +1022,16 @@ def update_registry(
|
|
684
1022
|
save_function: The name of the function to save the labels.
|
685
1023
|
using: The name of the instance from which to transfer labels (if applicable).
|
686
1024
|
validated_only: If True, only save validated labels.
|
687
|
-
kwargs: Additional keyword arguments to pass to the registry model.
|
688
1025
|
df: A DataFrame to save labels from.
|
1026
|
+
organism: The organism name.
|
1027
|
+
type: The type of the feature.
|
1028
|
+
kwargs: Additional keyword arguments to pass to the registry model to create new records.
|
689
1029
|
"""
|
690
1030
|
from lamindb._save import save as ln_save
|
691
1031
|
from lamindb.core._settings import settings
|
692
1032
|
|
693
|
-
filter_kwargs = {} if kwargs is None else kwargs.copy()
|
694
1033
|
registry = field.field.model
|
695
|
-
|
696
|
-
organism = check_registry_organism(registry, filter_kwargs.pop("organism", None))
|
697
|
-
if organism is not None:
|
698
|
-
filter_kwargs["organism"] = organism
|
1034
|
+
filter_kwargs = check_registry_organism(registry, organism)
|
699
1035
|
|
700
1036
|
verbosity = settings.verbosity
|
701
1037
|
try:
|
@@ -716,7 +1052,7 @@ def update_registry(
|
|
716
1052
|
inspect_result_current.non_validated,
|
717
1053
|
field=field,
|
718
1054
|
using=using,
|
719
|
-
|
1055
|
+
**filter_kwargs,
|
720
1056
|
)
|
721
1057
|
|
722
1058
|
public_records = (
|
@@ -742,8 +1078,8 @@ def update_registry(
|
|
742
1078
|
for value in labels_saved["without reference"]:
|
743
1079
|
filter_kwargs[field.field.name] = value
|
744
1080
|
if registry == Feature:
|
745
|
-
filter_kwargs["type"] = "category"
|
746
|
-
non_validated_records.append(registry(**filter_kwargs))
|
1081
|
+
filter_kwargs["type"] = "category" if type is None else type
|
1082
|
+
non_validated_records.append(registry(**filter_kwargs, **kwargs))
|
747
1083
|
ln_save(non_validated_records)
|
748
1084
|
|
749
1085
|
if registry == ULabel and field.field.name == "name":
|
@@ -815,7 +1151,7 @@ def update_registry_from_using_instance(
|
|
815
1151
|
values: list[str],
|
816
1152
|
field: FieldAttr,
|
817
1153
|
using: str | None = None,
|
818
|
-
kwargs
|
1154
|
+
**kwargs,
|
819
1155
|
) -> tuple[list[str], list[str]]:
|
820
1156
|
"""Save features or labels records from the using instance.
|
821
1157
|
|
@@ -828,7 +1164,6 @@ def update_registry_from_using_instance(
|
|
828
1164
|
Returns:
|
829
1165
|
A tuple containing the list of saved labels and the list of non-saved labels.
|
830
1166
|
"""
|
831
|
-
kwargs = kwargs or {}
|
832
1167
|
labels_saved = []
|
833
1168
|
not_saved = values
|
834
1169
|
|