lamindb 0.69.2__py3-none-any.whl → 0.69.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +5 -3
- lamindb/_annotate.py +790 -0
- lamindb/_artifact.py +2 -6
- lamindb/_query_set.py +3 -3
- lamindb/core/__init__.py +4 -0
- {lamindb-0.69.2.dist-info → lamindb-0.69.3.dist-info}/METADATA +3 -3
- {lamindb-0.69.2.dist-info → lamindb-0.69.3.dist-info}/RECORD +10 -15
- lamindb/validation/__init__.py +0 -19
- lamindb/validation/_anndata_validator.py +0 -117
- lamindb/validation/_lookup.py +0 -42
- lamindb/validation/_register.py +0 -265
- lamindb/validation/_validate.py +0 -139
- lamindb/validation/_validator.py +0 -221
- /lamindb/{_validate.py → _can_validate.py} +0 -0
- {lamindb-0.69.2.dist-info → lamindb-0.69.3.dist-info}/LICENSE +0 -0
- {lamindb-0.69.2.dist-info → lamindb-0.69.3.dist-info}/WHEEL +0 -0
lamindb/_annotate.py
ADDED
@@ -0,0 +1,790 @@
|
|
1
|
+
from typing import Dict, Iterable, List, Optional, Tuple, Union
|
2
|
+
|
3
|
+
import anndata as ad
|
4
|
+
import lamindb_setup as ln_setup
|
5
|
+
import pandas as pd
|
6
|
+
from lamin_utils import colors, logger
|
7
|
+
from lnschema_core import Artifact, Collection, Feature, Registry, Run, ULabel
|
8
|
+
from lnschema_core.types import FieldAttr
|
9
|
+
|
10
|
+
|
11
|
+
class ValidationError(ValueError):
    """Error raised when a data object has not passed validation."""
|
15
|
+
|
16
|
+
|
17
|
+
class AnnotateLookup:
    """Lookup features and labels from the reference instance.

    Items are accessed by key, e.g. ``lookup["cell_type"]``; each access
    returns the ``lookup()`` object of the registry behind that key.

    Args:
        fields: Mapping from lookup key to the registry field it resolves to.
        using: Instance to look records up in. ``None`` and ``"default"``
            mean the current instance; ``"public"`` uses the registry's
            public reference.
    """

    def __init__(
        self, fields: Dict[str, FieldAttr], using: Optional[str] = None
    ) -> None:
        self._fields = fields
        # "default" is an alias for the current instance and is stored as None.
        self._using = None if using == "default" else using
        # Derive the display name from the normalized value so that "default"
        # is reported as the slug of the current instance.
        self._using_name = self._using or ln_setup.settings.instance.slug
        logger.debug(f"Lookup objects from the {colors.italic(self._using_name)}")

    def __getitem__(self, name):
        """Return the lookup object of the registry registered under ``name``.

        Raises:
            AttributeError: If ``name`` is not one of the known fields.
        """
        if name in self._fields:
            registry = self._fields[name].field.model
            if self._using == "public":
                return registry.public().lookup()
            else:
                return get_registry_instance(registry, self._using).lookup()
        raise AttributeError(
            f"'{self.__class__.__name__}' object has no attribute '{name}'"
        )

    def __repr__(self) -> str:
        if len(self._fields) > 0:
            fields = "\n ".join([str([key]) for key in self._fields.keys()])
            return (
                f"Lookup objects from the {colors.italic(self._using_name)}:\n {colors.green(fields)}\n\n"
                # The example must be valid Python: items are accessed via
                # ``lookup()[key]``, not ``lookup().[key]``.
                "Example:\n → categories = validator.lookup()['cell_type']\n"
                " → categories.alveolar_type_1_fibroblast_cell"
            )
        else:
            return colors.warning("No fields are found!")
|
49
|
+
|
50
|
+
|
51
|
+
class DataFrameAnnotator:
    """Annotation flow for a DataFrame object.

    Args:
        df: The DataFrame object to annotate.
        fields: A dictionary mapping column to registry_field.
            For example:
            {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name}
        feature_field: The field attribute for the feature column.
        using: The reference instance containing registries to validate against.
        verbosity: The verbosity level.
        **kwargs: Extra keyword arguments stored on the annotator and passed
            on to the registry helpers (for example ``organism``).
    """

    def __init__(
        self,
        df: pd.DataFrame,
        fields: Optional[Dict[str, FieldAttr]] = None,
        feature_field: FieldAttr = Feature.name,
        using: Optional[str] = None,
        verbosity: str = "hint",
        **kwargs,
    ) -> None:
        from lamindb.core._settings import settings

        self._df = df
        self._fields = fields or {}
        self._feature_field = feature_field
        self._using = using
        # NOTE: this sets the verbosity on the shared lamindb settings object.
        settings.verbosity = verbosity
        self._artifact = None
        self._collection = None
        self._validated = False
        self._kwargs: Dict = kwargs
        # Features are registered as soon as the annotator is created.
        self.register_features()

    @property
    def fields(self) -> Dict:
        """Return the columns fields to validate against."""
        return self._fields

    def lookup(self, using: Optional[str] = None) -> AnnotateLookup:
        """Lookup features and labels.

        Args:
            using: The instance where the lookup is performed.
                if None (default), the lookup is performed on the instance specified in "using" parameter of the validator.
                if "public", the lookup is performed on the public reference.
        """
        fields = {**{"feature": self._feature_field}, **self.fields}
        return AnnotateLookup(fields=fields, using=using or self._using)

    def register_features(self, validated_only: bool = True) -> None:
        """Register features records.

        Args:
            validated_only: Applies only to columns that are not keys of
                ``fields``; the keys of ``fields`` are always registered.

        Raises:
            ValueError: If a key of ``fields`` is not a column of the DataFrame.
        """
        missing_columns = set(self.fields.keys()) - set(self._df.columns)
        if missing_columns:
            raise ValueError(
                f"Columns {missing_columns} are not found in the data object!"
            )

        # Always register features specified as the fields keys
        update_registry(
            values=list(self.fields.keys()),
            field=self._feature_field,
            feature_name="feature",
            using=self._using,
            validated_only=False,
            kwargs=self._kwargs,
        )

        # Register the rest of the columns based on validated_only
        additional_columns = set(self._df.columns) - set(self.fields.keys())
        if additional_columns:
            update_registry(
                values=list(additional_columns),
                field=self._feature_field,
                feature_name="feature",
                using=self._using,
                validated_only=validated_only,
                df=self._df,  # Get the Feature type from df
                kwargs=self._kwargs,
            )

    def update_registry(self, feature: str, validated_only: bool = True, **kwargs):
        """Register labels for a feature.

        Args:
            feature: The name of the feature to register. ``"all"`` registers
                labels for every key of ``fields``; ``"feature"`` re-runs
                :meth:`register_features`.
            validated_only: Whether to register only validated labels.
            **kwargs: Additional keyword arguments.

        Raises:
            ValueError: If ``feature`` is not a key of ``fields``.
        """
        if feature == "all":
            self._update_registry_all(validated_only=validated_only, **kwargs)
        elif feature == "feature":
            # NOTE: ``kwargs`` are not forwarded here; register_features uses
            # the keyword arguments stored on the annotator.
            self.register_features(validated_only=validated_only)
        else:
            if feature not in self.fields:
                raise ValueError(f"Feature {feature} is not part of the fields!")
            update_registry(
                values=self._df[feature].unique().tolist(),
                field=self.fields[feature],
                feature_name=feature,
                using=self._using,
                validated_only=validated_only,
                kwargs=kwargs,
            )

    def _update_registry_all(self, validated_only: bool = True, **kwargs):
        """Register labels for all features."""
        for name in self.fields.keys():
            logger.info(f"registering labels for '{name}'")
            self.update_registry(feature=name, validated_only=validated_only, **kwargs)

    def validate(self, **kwargs) -> bool:
        """Validate variables and categorical observations.

        Args:
            **kwargs: Merged into the keyword arguments stored on the annotator
                before validating.

        Returns:
            Whether the DataFrame is validated.
        """
        self._kwargs.update(kwargs)
        self._validated = validate_categories_in_df(
            self._df,
            fields=self.fields,
            using=self._using,
            **self._kwargs,
        )
        return self._validated

    def register_artifact(self, description: str, **kwargs) -> Artifact:
        """Register the validated DataFrame and metadata.

        Args:
            description: Description of the DataFrame object.
            **kwargs: Object level metadata.

        Returns:
            A registered artifact record.

        Raises:
            ValidationError: If ``validate()`` has not succeeded yet.
        """
        from lamindb.core._settings import settings

        self._kwargs.update(kwargs)
        if not self._validated:
            raise ValidationError(
                f"Data object is not validated, please run {colors.yellow('validate()')}!"
            )

        # Make sure all labels are registered in the current instance
        verbosity = settings.verbosity
        try:
            # Quieten logging while registering; restored in ``finally``.
            settings.verbosity = "warning"
            self.update_registry("all")

            self._artifact = register_artifact(
                self._df,
                description=description,
                fields=self.fields,
                feature_field=self._feature_field,
                **self._kwargs,
            )
        finally:
            settings.verbosity = verbosity

        return self._artifact

    def register_collection(
        self,
        # ``Union`` rather than ``X | Y``: the annotation is evaluated when the
        # method is defined and the ``|`` form requires Python 3.10+.
        artifact: Union[Artifact, Iterable[Artifact]],
        name: str,
        description: Optional[str] = None,
        reference: Optional[str] = None,
        reference_type: Optional[str] = None,
    ) -> Collection:
        """Register a collection from artifact/artifacts.

        Args:
            artifact: One or several registered Artifacts.
            name: Title of the publication.
            description: Description of the publication.
            reference: Accession number (e.g. GSE#, E-MTAB#, etc.).
            reference_type: Source type (e.g. GEO, ArrayExpress, SRA, etc.).

        Returns:
            The saved collection record.
        """
        collection = Collection(
            artifact,
            name=name,
            description=description,
            reference=reference,
            reference_type=reference_type,
        )
        slug = ln_setup.settings.instance.slug
        # Read the state before saving: it decides which message is logged.
        is_new = collection._state.adding
        collection.save()
        if is_new:
            logger.success(f"registered collection in {colors.italic(slug)}")
        else:
            logger.warning(f"collection already exists in {colors.italic(slug)}!")
        if ln_setup.settings.instance.is_remote:
            logger.print(f"🔗 https://lamin.ai/{slug}/collection/{collection.uid}")
        self._collection = collection
        return collection

    def clean_up_failed_runs(self):
        """Clean up previous failed runs that don't register any outputs."""
        from lamindb.core._run_context import run_context

        if run_context.transform is not None:
            # Delete runs of the current transform without output artifacts,
            # except the current run.
            Run.filter(transform=run_context.transform, output_artifacts=None).exclude(
                uid=run_context.run.uid
            ).delete()
|
258
|
+
|
259
|
+
|
260
|
+
class AnnDataAnnotator(DataFrameAnnotator):
    """Annotation flow for an AnnData object.

    The ``obs`` DataFrame is handled by the parent class; the variables index
    is handled here.

    Args:
        adata: The AnnData object to annotate.
        var_field: The registry field to validate variables index against.
        obs_fields: A dictionary mapping obs_column to registry_field.
            For example:
            {"cell_type_ontology_id": bt.CellType.ontology_id, "donor_id": ln.ULabel.name}
        using: The reference instance containing registries to validate against.
        verbosity: The verbosity level.
        **kwargs: Extra keyword arguments stored on the annotator and passed
            on to the registry helpers (for example ``organism``).
    """

    def __init__(
        self,
        adata: ad.AnnData,
        var_field: FieldAttr,
        obs_fields: Dict[str, FieldAttr],
        using: str = "default",
        verbosity: str = "hint",
        **kwargs,
    ) -> None:
        self._adata = adata
        self._var_field = var_field
        super().__init__(
            df=self._adata.obs,
            fields=obs_fields,
            using=using,
            verbosity=verbosity,
            **kwargs,
        )
        self._obs_fields = obs_fields
        # Variables are registered right after the obs features.
        self._register_variables()

    @property
    def var_field(self) -> FieldAttr:
        """Return the registry field to validate variables index against."""
        return self._var_field

    @property
    def obs_fields(self) -> Dict:
        """Return the obs fields to validate against."""
        return self._obs_fields

    def lookup(self, using: Optional[str] = None) -> AnnotateLookup:
        """Lookup features and labels.

        Args:
            using: The instance where the lookup is performed; falls back to
                the ``using`` value of the annotator when not given.
        """
        fields = {
            **{"feature": Feature.name, "variables": self.var_field},
            **self.obs_fields,
        }
        return AnnotateLookup(fields=fields, using=using or self._using)

    def _register_variables(self, validated_only: bool = True, **kwargs):
        """Register variable records."""
        self._kwargs.update(kwargs)
        update_registry(
            values=self._adata.var_names,
            field=self.var_field,
            feature_name="variables",
            using=self._using,
            validated_only=validated_only,
            kwargs=self._kwargs,
        )

    def validate(self, **kwargs) -> bool:
        """Validate variables and categorical observations.

        Args:
            **kwargs: Merged into the keyword arguments stored on the annotator
                before validating.

        Returns:
            Whether the AnnData object is validated.
        """
        self._kwargs.update(kwargs)
        # Validate against the annotator's reference instance, as the parent
        # class does. A ``using`` passed through kwargs still takes precedence,
        # which keeps the previous calling convention working.
        validate_kwargs = {"using": self._using, **self._kwargs}
        self._validated = validate_anndata(
            self._adata,
            var_field=self.var_field,
            obs_fields=self.obs_fields,
            **validate_kwargs,
        )
        return self._validated

    def update_registry(self, feature: str, validated_only: bool = True, **kwargs):
        """Register labels for a feature.

        ``"variables"`` registers the variables index; every other value is
        handled by the parent class.
        """
        if feature == "variables":
            self._register_variables(validated_only=validated_only, **kwargs)
        else:
            super().update_registry(feature, validated_only, **kwargs)

    def register_artifact(self, description: str, **kwargs) -> Artifact:
        """Register the validated AnnData and metadata.

        Args:
            description: Description of the AnnData object.
            **kwargs: Object level metadata.

        Returns:
            A registered artifact record.

        Raises:
            ValidationError: If ``validate()`` has not succeeded yet.
        """
        self._kwargs.update(kwargs)
        if not self._validated:
            raise ValidationError("Please run `validate()` first!")

        self._artifact = register_artifact(
            self._adata,
            description=description,
            feature_field=self.var_field,
            fields=self.obs_fields,
            **self._kwargs,
        )
        return self._artifact
|
363
|
+
|
364
|
+
|
365
|
+
class Annotate:
    """Annotation flow."""

    @classmethod
    def from_df(
        cls,
        df: pd.DataFrame,
        fields: Optional[Dict[str, FieldAttr]] = None,
        feature_field: FieldAttr = Feature.name,
        using: Optional[str] = None,
        verbosity: str = "hint",
        **kwargs,
    ) -> DataFrameAnnotator:
        """Build a :class:`DataFrameAnnotator`; all arguments are passed through."""
        annotator = DataFrameAnnotator(
            df=df,
            fields=fields,
            feature_field=feature_field,
            using=using,
            verbosity=verbosity,
            **kwargs,
        )
        return annotator

    @classmethod
    def from_anndata(
        cls,
        adata: ad.AnnData,
        var_field: FieldAttr,
        obs_fields: Dict[str, FieldAttr],
        using: str = "default",
        verbosity: str = "hint",
        **kwargs,
    ) -> AnnDataAnnotator:
        """Build an :class:`AnnDataAnnotator`; all arguments are passed through."""
        annotator = AnnDataAnnotator(
            adata=adata,
            var_field=var_field,
            obs_fields=obs_fields,
            using=using,
            verbosity=verbosity,
            **kwargs,
        )
        return annotator
|
405
|
+
|
406
|
+
|
407
|
+
def get_registry_instance(registry: Registry, using: Optional[str] = None) -> Registry:
    """Return ``registry`` bound to the instance named by ``using``.

    ``None`` and ``"default"`` both return ``registry`` unchanged; any other
    value is passed to ``registry.using``.
    """
    if using is None or using == "default":
        return registry
    return registry.using(using)
|
412
|
+
|
413
|
+
|
414
|
+
def standardize_and_inspect(
    values: Iterable[str], field: FieldAttr, registry: Registry, **kwargs
):
    """Standardize ``values`` when the registry supports it, then inspect them.

    Both registry calls are made with ``mute=True``; ``kwargs`` are forwarded
    to each of them.

    Returns:
        Whatever ``registry.inspect`` returns.
    """
    can_standardize = hasattr(registry, "standardize")
    to_inspect = (
        registry.standardize(values, field=field, mute=True, **kwargs)
        if can_standardize
        else values
    )
    return registry.inspect(to_inspect, field=field, mute=True, **kwargs)
|
421
|
+
|
422
|
+
|
423
|
+
def check_registry_organism(
    registry: Registry, organism: Optional[str] = None
) -> Optional[str]:
    """Check if a registry needs an organism and return the organism name.

    Args:
        registry: The registry to check; it needs an organism when it has an
            ``organism_id`` attribute.
        organism: An explicitly requested organism, if any.

    Returns:
        ``None`` when the registry has no ``organism_id`` attribute, otherwise
        ``organism`` when given, otherwise the name of ``bt.settings.organism``.

    Raises:
        ValueError: If an organism is required but neither ``organism`` nor
            ``bt.settings.organism`` provides one.
    """
    if not hasattr(registry, "organism_id"):
        return None
    if organism:
        # An explicit organism wins; bionty is not needed to answer.
        return organism

    import bionty as bt

    default_organism = bt.settings.organism
    if default_organism is None:
        # Also reached for an empty ``organism``, which previously fell
        # through to an attribute access on None.
        raise ValueError(
            f"{registry.__name__} registry requires an organism!\n"
            " → please pass an organism name via organism="
        )
    return default_organism.name
|
437
|
+
|
438
|
+
|
439
|
+
def validate_categories(
    values: Iterable[str],
    field: FieldAttr,
    feature_name: str,
    using: Optional[str] = None,
    **kwargs,
) -> bool:
    """Validate ontology terms in a pandas series using LaminDB registries.

    The values are inspected against the registry of ``field`` in the default
    instance first. Values that remain non-validated are then inspected
    against the ``using`` instance, when one other than ``"default"`` is given.

    Args:
        values: The values to validate.
        field: The registry field to validate against.
        feature_name: Name used in log messages.
        using: The reference instance to fall back to.
        **kwargs: Only ``organism`` is read.

    Returns:
        True when every value is validated, False otherwise.
    """
    from lamindb._from_values import _print_values

    model_field = f"{field.field.model.__name__}.{field.field.name}"
    logger.indent = ""
    logger.info(
        f"inspecting '{colors.bold(feature_name)}' by {colors.italic(model_field)}"
    )
    logger.indent = " "

    # Reset the logger indentation on every exit path, including the success
    # branch and exceptions, so it does not leak into later log messages.
    try:
        registry = field.field.model
        filter_kwargs = {}
        organism = check_registry_organism(registry, kwargs.get("organism"))
        if organism is not None:
            filter_kwargs["organism"] = organism

        # Inspect the default instance
        inspect_result = standardize_and_inspect(
            values=values, field=field, registry=registry, **filter_kwargs
        )
        non_validated = inspect_result.non_validated

        if using is not None and using != "default" and non_validated:
            registry = get_registry_instance(registry, using)
            # Inspect the using instance
            inspect_result = standardize_and_inspect(
                values=non_validated, field=field, registry=registry, **filter_kwargs
            )
            non_validated = inspect_result.non_validated

        n_non_validated = len(non_validated)
        if n_non_validated == 0:
            logger.success(f"all {feature_name}s are validated")
            return True

        # Keep noun and verb in agreement: "1 term is" / "2 terms are".
        if n_non_validated > 1:
            terms, are = "terms", "are"
        else:
            terms, are = "term", "is"
        print_values = _print_values(non_validated)
        feature_name_print = f".update_registry('{feature_name}')"
        warning_message = (
            f"{colors.yellow(f'{n_non_validated} {terms}')} {are} not validated: "
            f"{colors.yellow(print_values)}\n → register terms via "
            f"{colors.yellow(feature_name_print)}"
        )
        logger.warning(warning_message)
        return False
    finally:
        logger.indent = ""
|
492
|
+
|
493
|
+
|
494
|
+
def validate_categories_in_df(
    df: pd.DataFrame,
    fields: Dict[str, FieldAttr],
    using: Optional[str] = None,
    **kwargs,
) -> bool:
    """Validate the DataFrame columns named in ``fields`` against their registries.

    Every column is checked, also after an earlier one has failed.

    Returns:
        True when all columns validate (or ``fields`` is empty).
    """
    # A list, not a generator: each column must be validated and logged.
    outcomes = [
        validate_categories(
            df[column],
            field=registry_field,
            feature_name=column,
            using=using,
            **kwargs,
        )
        for column, registry_field in fields.items()
    ]
    return all(outcomes)
|
511
|
+
|
512
|
+
|
513
|
+
def validate_anndata(
    adata: ad.AnnData,
    var_field: FieldAttr,
    obs_fields: Dict[str, FieldAttr],
    using: Optional[str] = None,
    **kwargs,
) -> bool:
    """Validate the variables index and the obs columns of an AnnData object.

    Both parts are always validated.

    Returns:
        True when the variables and all obs columns validate.
    """
    if using not in (None, "default"):
        logger.important(
            f"validating metadata using registries of instance {colors.italic(using)}"
        )

    var_ok = validate_categories(
        adata.var.index,
        field=var_field,
        feature_name="variables",
        using=using,
        **kwargs,
    )
    obs_ok = validate_categories_in_df(
        adata.obs, fields=obs_fields, using=using, **kwargs
    )
    return var_ok and obs_ok
|
537
|
+
|
538
|
+
|
539
|
+
def register_artifact(
    data: Union[pd.DataFrame, ad.AnnData],
    description: str,
    fields: Dict[str, FieldAttr],
    feature_field: FieldAttr,
    **kwargs,
) -> Artifact:
    """Register all metadata with an Artifact.

    Args:
        data: The DataFrame or AnnData object to register.
        description: A description of the artifact.
        fields: A dictionary mapping obs_column to registry_field.
        feature_field: The registry field to validate variables index against.
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        The registered Artifact.

    Raises:
        ValueError: If ``data`` is neither a DataFrame nor an AnnData object.
    """
    if isinstance(data, ad.AnnData):
        artifact = Artifact.from_anndata(data, description=description)
        artifact.n_observations = data.n_obs
    elif isinstance(data, pd.DataFrame):
        artifact = Artifact.from_df(data, description=description)
    else:
        raise ValueError("data must be a DataFrame or AnnData object")
    artifact.save()

    # Keep the caller's organism separate from the per-registry results.
    # A registry without an organism resolves to None, and that must not
    # discard the requested organism for the registries checked afterwards.
    requested_organism = kwargs.pop("organism", None)

    feature_kwargs: Dict = {}
    feature_organism = check_registry_organism(
        feature_field.field.model, requested_organism
    )
    if feature_organism is not None:
        feature_kwargs["organism"] = feature_organism

    if isinstance(data, ad.AnnData):
        artifact.features.add_from_anndata(var_field=feature_field, **feature_kwargs)
    else:
        artifact.features.add_from_df(field=feature_field, **feature_kwargs)

    # Labels come from ``obs`` for AnnData and from the frame itself otherwise.
    df = data.obs if isinstance(data, ad.AnnData) else data
    features = Feature.lookup().dict()
    for feature_name, field in fields.items():
        feature = features.get(feature_name)
        registry = field.field.model
        filter_kwargs = kwargs.copy()
        registry_organism = check_registry_organism(registry, requested_organism)
        if registry_organism is not None:
            filter_kwargs["organism"] = registry_organism
        labels = registry.from_values(df[feature_name], field=field, **filter_kwargs)
        artifact.labels.add(labels, feature)

    slug = ln_setup.settings.instance.slug
    logger.success(f"registered artifact in {colors.italic(slug)}")
    if ln_setup.settings.instance.is_remote:
        logger.info(f"🔗 https://lamin.ai/{slug}/artifact/{artifact.uid}")

    return artifact
|
597
|
+
|
598
|
+
|
599
|
+
def update_registry(
    values: List[str],
    field: FieldAttr,
    feature_name: str,
    using: Optional[str] = None,
    validated_only: bool = True,
    kwargs: Optional[Dict] = None,
    df: Optional[pd.DataFrame] = None,
) -> None:
    """Register features or labels records in the default instance from the using instance.

    Values that do not validate in the default instance are resolved, in
    order, from the ``using`` instance and from ``registry.from_values``.
    What is still left is only registered when ``validated_only`` is False.

    Args:
        values: A list of values to be registered as labels.
        field: The FieldAttr object representing the field for which labels are being registered.
        feature_name: The name of the feature to register.
        using: The name of the instance from which to transfer labels (if applicable).
        validated_only: If True, only register validated labels. Always
            treated as False for the ``ULabel`` registry.
        kwargs: Additional keyword arguments to pass to the registry model.
        df: A DataFrame to register labels from.
    """
    from lamindb._save import save as ln_save
    from lamindb.core._settings import settings

    # Work on a copy so the caller's dictionary is never modified.
    filter_kwargs = {} if kwargs is None else kwargs.copy()
    registry = field.field.model
    if registry == ULabel:
        validated_only = False

    organism = check_registry_organism(registry, filter_kwargs.pop("organism", None))
    if organism is not None:
        filter_kwargs["organism"] = organism

    verbosity = settings.verbosity
    try:
        settings.verbosity = "error"
        inspect_result_current = standardize_and_inspect(
            values=values, field=field, registry=registry, **filter_kwargs
        )
        if not inspect_result_current.non_validated:
            # Nothing to register; ``finally`` restores the verbosity.
            return

        labels_registered: Dict = {"from public": [], "without reference": []}

        (
            labels_registered[f"from {using}"],
            non_validated_labels,
        ) = update_registry_from_using_instance(
            inspect_result_current.non_validated,
            field=field,
            using=using,
            kwargs=filter_kwargs,
        )

        public_records = (
            registry.from_values(non_validated_labels, field=field, **filter_kwargs)
            if non_validated_labels
            else []
        )
        ln_save(public_records)
        from_public = [getattr(r, field.field.name) for r in public_records]
        labels_registered["from public"] = from_public
        # Set membership keeps the filter linear in the number of labels.
        from_public_lookup = set(from_public)
        labels_registered["without reference"] = [
            i for i in non_validated_labels if i not in from_public_lookup
        ]

        if not validated_only:
            non_validated_records = []
            if df is not None and registry == Feature:
                non_validated_records = Feature.from_df(df)
            else:
                if "organism" in filter_kwargs:
                    # New records take an organism record, not its name.
                    filter_kwargs["organism"] = _register_organism(name=organism)
                for value in labels_registered["without reference"]:
                    filter_kwargs[field.field.name] = value
                    if registry == Feature:
                        filter_kwargs["type"] = "category"
                    non_validated_records.append(registry(**filter_kwargs))
            ln_save(non_validated_records)

        if registry == ULabel and field.field.name == "name":
            register_ulabels_with_parent(values, field=field, feature_name=feature_name)
    finally:
        settings.verbosity = verbosity

    # Logged after the verbosity is restored so the summary is visible.
    log_registered_labels(
        labels_registered,
        feature_name=feature_name,
        model_field=f"{registry.__name__}.{field.field.name}",
        validated_only=validated_only,
    )
|
691
|
+
|
692
|
+
|
693
|
+
def log_registered_labels(
    labels_registered: Dict,
    feature_name: str,
    model_field: str,
    validated_only: bool = True,
) -> None:
    """Log the registered labels.

    Args:
        labels_registered: Mapping from a source description (for example
            ``"from public"`` or ``"without reference"``) to the labels of
            that source. Empty entries are skipped.
        feature_name: The feature the labels belong to; ``"feature"`` means
            the labels are features themselves.
        model_field: ``Registry.field`` text shown in the messages.
        validated_only: When True, labels "without reference" are reported as
            not registered instead of as registered.
    """
    labels_type = "features" if feature_name == "feature" else "labels"
    model_field = colors.italic(model_field)
    for key, labels in labels_registered.items():
        if not labels:
            continue

        if key == "without reference" and validated_only:
            msg = colors.yellow(
                f"{len(labels)} non-validated {labels_type} are not registered with {model_field}: {labels}!"
            )
            # The hint must be valid Python: ``lookup()[key]``, not ``lookup().[key]``.
            lookup_print = f".lookup()['{feature_name}']"
            msg += f"\n → to lookup categories, use {lookup_print}"
            msg += (
                f"\n → to register, run {colors.yellow('register_features(validated_only=False)')}"
                if labels_type == "features"
                else f"\n → to register, set {colors.yellow('validated_only=False')}"
            )
            logger.warning(msg)
        else:
            # Labels without a reference carry no source prefix.
            source = "" if key == "without reference" else f"{colors.green(key)} "
            logger.success(
                f"registered {len(labels)} {labels_type} {source}with {model_field}: {labels}"
            )
|
723
|
+
|
724
|
+
|
725
|
+
def register_ulabels_with_parent(
    values: List[str], field: FieldAttr, feature_name: str
) -> None:
    """Attach the labels for ``values`` as children of an ``is_<feature_name>`` label.

    The parent label is created and saved when it does not exist yet.
    """
    registry = field.field.model
    assert registry == ULabel
    parent_name = f"is_{feature_name}"
    child_records = registry.from_values(values, field=field)
    parent = registry.filter(name=parent_name).one_or_none()
    if parent is None:
        parent = registry(name=parent_name)
        parent.save()
    parent.children.add(*child_records)
|
737
|
+
|
738
|
+
|
739
|
+
def update_registry_from_using_instance(
    values: List[str],
    field: FieldAttr,
    using: Optional[str] = None,
    kwargs: Optional[Dict] = None,
) -> Tuple[List[str], List[str]]:
    """Register features or labels records from the using instance.

    Args:
        values: A list of values to be registered as labels.
        field: The FieldAttr object representing the field for which labels are being registered.
        using: The name of the instance from which to transfer labels (if applicable).
        kwargs: Additional keyword arguments to pass to the registry model.

    Returns:
        A tuple containing the list of registered labels and the list of non-registered labels.
        Without a ``using`` instance (``None`` or ``"default"``) nothing is
        registered and ``values`` is returned as the second item.
    """
    kwargs = kwargs or {}

    # Guard clause: no separate instance to pull records from.
    if using is None or using == "default":
        return [], values

    registry_using = get_registry_instance(field.field.model, using)
    inspect_result_using = standardize_and_inspect(
        values=values, field=field, registry=registry_using, **kwargs
    )
    field_name = field.field.name
    lookup_filter = {f"{field_name}__in": inspect_result_using.validated}
    labels_using = registry_using.filter(**lookup_filter).all()

    labels_registered = []
    for label_using in labels_using:
        label_using.save()
        labels_registered.append(getattr(label_using, field_name))

    return labels_registered, inspect_result_using.non_validated
|
775
|
+
|
776
|
+
|
777
|
+
def _register_organism(name: str):
    """Return the organism record called ``name``, registering it if needed.

    A record found via ``bt.Organism.filter`` is returned as is. Otherwise the
    record is created with ``bt.Organism.from_public`` and saved.

    Raises:
        ValueError: If ``from_public`` returns no record either.
    """
    import bionty as bt

    existing = bt.Organism.filter(name=name).one_or_none()
    if existing is not None:
        return existing

    organism = bt.Organism.from_public(name=name)
    if organism is None:
        raise ValueError(
            f"Organism '{name}' not found\n"
            f" → please register it: bt.Organism(name='{name}').save()"
        )
    # NOTE(review): assumes save() applies only to the record fetched via
    # from_public; the original nesting is not visible in this rendering.
    organism.save()
    return organism
|