lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +89 -49
- lamindb/_finish.py +17 -15
- lamindb/_tracked.py +2 -4
- lamindb/_view.py +1 -1
- lamindb/base/__init__.py +2 -1
- lamindb/base/dtypes.py +76 -0
- lamindb/core/_settings.py +45 -2
- lamindb/core/storage/_anndata_accessor.py +118 -26
- lamindb/core/storage/_backed_access.py +10 -7
- lamindb/core/storage/_spatialdata_accessor.py +15 -4
- lamindb/core/storage/_zarr.py +3 -0
- lamindb/curators/_legacy.py +16 -3
- lamindb/curators/core.py +449 -193
- lamindb/errors.py +6 -0
- lamindb/examples/cellxgene/__init__.py +8 -3
- lamindb/examples/cellxgene/_cellxgene.py +127 -13
- lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
- lamindb/examples/croissant/__init__.py +32 -6
- lamindb/examples/datasets/__init__.py +2 -2
- lamindb/examples/datasets/_core.py +9 -2
- lamindb/examples/datasets/_small.py +66 -22
- lamindb/examples/fixtures/sheets.py +8 -2
- lamindb/integrations/_croissant.py +34 -11
- lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
- lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
- lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
- lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
- lamindb/migrations/0121_recorduser.py +60 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +2 -2
- lamindb/models/_feature_manager.py +131 -71
- lamindb/models/_from_values.py +2 -2
- lamindb/models/_is_versioned.py +4 -4
- lamindb/models/_label_manager.py +4 -4
- lamindb/models/artifact.py +357 -192
- lamindb/models/artifact_set.py +45 -1
- lamindb/models/can_curate.py +1 -2
- lamindb/models/collection.py +3 -34
- lamindb/models/feature.py +111 -7
- lamindb/models/has_parents.py +11 -11
- lamindb/models/project.py +42 -2
- lamindb/models/query_manager.py +16 -7
- lamindb/models/query_set.py +191 -78
- lamindb/models/record.py +30 -5
- lamindb/models/run.py +10 -33
- lamindb/models/save.py +6 -8
- lamindb/models/schema.py +54 -26
- lamindb/models/sqlrecord.py +152 -40
- lamindb/models/storage.py +59 -14
- lamindb/models/transform.py +17 -17
- lamindb/models/ulabel.py +6 -1
- {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/METADATA +11 -16
- {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/RECORD +55 -50
- {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/LICENSE +0 -0
- {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/WHEEL +0 -0
lamindb/curators/core.py
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
|
6
6
|
Curator
|
7
7
|
SlotsCurator
|
8
|
+
ComponentCurator
|
8
9
|
CatVector
|
9
10
|
CatLookup
|
10
11
|
DataFrameCatManager
|
@@ -15,7 +16,6 @@ from __future__ import annotations
|
|
15
16
|
|
16
17
|
import copy
|
17
18
|
import re
|
18
|
-
from collections.abc import Iterable
|
19
19
|
from typing import TYPE_CHECKING, Any, Callable
|
20
20
|
|
21
21
|
import lamindb_setup as ln_setup
|
@@ -24,7 +24,9 @@ import pandas as pd
|
|
24
24
|
import pandera.pandas as pandera
|
25
25
|
from lamin_utils import colors, logger
|
26
26
|
from lamindb_setup.core._docs import doc_args
|
27
|
+
from lamindb_setup.core.upath import LocalPathClasses
|
27
28
|
|
29
|
+
from lamindb.base.dtypes import check_dtype
|
28
30
|
from lamindb.base.types import FieldAttr # noqa
|
29
31
|
from lamindb.models import (
|
30
32
|
Artifact,
|
@@ -48,6 +50,7 @@ from lamindb.models.feature import (
|
|
48
50
|
from ..errors import InvalidArgument, ValidationError
|
49
51
|
|
50
52
|
if TYPE_CHECKING:
|
53
|
+
from collections.abc import Iterable
|
51
54
|
from typing import Any
|
52
55
|
|
53
56
|
from anndata import AnnData
|
@@ -145,6 +148,7 @@ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
|
|
145
148
|
|
146
149
|
SLOTS_DOCSTRING = """Access sub curators by slot."""
|
147
150
|
|
151
|
+
SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures."""
|
148
152
|
|
149
153
|
VALIDATE_DOCSTRING = """Validate dataset against Schema.
|
150
154
|
|
@@ -197,7 +201,21 @@ class Curator:
|
|
197
201
|
"MuData",
|
198
202
|
"SpatialData",
|
199
203
|
}:
|
200
|
-
|
204
|
+
# Open remote AnnData Artifacts
|
205
|
+
if not isinstance(self._artifact.path, LocalPathClasses):
|
206
|
+
if self._artifact.otype in {
|
207
|
+
"AnnData",
|
208
|
+
}:
|
209
|
+
try:
|
210
|
+
self._dataset = self._dataset.open(mode="r")
|
211
|
+
# open can raise various errors. Fall back to loading into memory if open fails
|
212
|
+
except Exception as e:
|
213
|
+
logger.warning(
|
214
|
+
f"Unable to open remote AnnData Artifact: {e}. Falling back to loading into memory."
|
215
|
+
)
|
216
|
+
self._dataset = self._dataset.load(is_run_input=False)
|
217
|
+
else:
|
218
|
+
self._dataset = self._dataset.load(is_run_input=False)
|
201
219
|
self._schema: Schema | None = schema
|
202
220
|
self._is_validated: bool = False
|
203
221
|
|
@@ -284,9 +302,12 @@ class Curator:
|
|
284
302
|
)
|
285
303
|
|
286
304
|
|
305
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
287
306
|
class SlotsCurator(Curator):
|
288
307
|
"""Curator for a dataset with slots.
|
289
308
|
|
309
|
+
{}
|
310
|
+
|
290
311
|
Args:
|
291
312
|
dataset: The dataset to validate & annotate.
|
292
313
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
@@ -298,7 +319,7 @@ class SlotsCurator(Curator):
|
|
298
319
|
schema: Schema,
|
299
320
|
) -> None:
|
300
321
|
super().__init__(dataset=dataset, schema=schema)
|
301
|
-
self._slots: dict[str,
|
322
|
+
self._slots: dict[str, ComponentCurator] = {}
|
302
323
|
|
303
324
|
# used for multimodal data structures (not AnnData)
|
304
325
|
# in form of {table/modality_key: var_field}
|
@@ -308,7 +329,7 @@ class SlotsCurator(Curator):
|
|
308
329
|
|
309
330
|
@property
|
310
331
|
@doc_args(SLOTS_DOCSTRING)
|
311
|
-
def slots(self) -> dict[str,
|
332
|
+
def slots(self) -> dict[str, ComponentCurator]:
|
312
333
|
"""{}""" # noqa: D415
|
313
334
|
return self._slots
|
314
335
|
|
@@ -336,6 +357,10 @@ class SlotsCurator(Curator):
|
|
336
357
|
|
337
358
|
if self._artifact is None:
|
338
359
|
type_mapping = [
|
360
|
+
(
|
361
|
+
lambda dataset: isinstance(dataset, pd.DataFrame),
|
362
|
+
Artifact.from_dataframe,
|
363
|
+
),
|
339
364
|
(
|
340
365
|
lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
|
341
366
|
Artifact.from_anndata,
|
@@ -364,12 +389,13 @@ class SlotsCurator(Curator):
|
|
364
389
|
)
|
365
390
|
break
|
366
391
|
|
367
|
-
self._artifact.schema = self._schema
|
368
|
-
self._artifact.save()
|
369
392
|
cat_vectors = {}
|
370
393
|
for curator in self._slots.values():
|
371
394
|
for key, cat_vector in curator.cat._cat_vectors.items():
|
372
395
|
cat_vectors[key] = cat_vector
|
396
|
+
|
397
|
+
self._artifact.schema = self._schema
|
398
|
+
self._artifact.save()
|
373
399
|
return annotate_artifact( # type: ignore
|
374
400
|
self._artifact,
|
375
401
|
curator=self,
|
@@ -377,92 +403,21 @@ class SlotsCurator(Curator):
|
|
377
403
|
)
|
378
404
|
|
379
405
|
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
return isinstance(value, expected_type)
|
386
|
-
|
387
|
-
|
388
|
-
def check_dtype(expected_type) -> Callable:
|
389
|
-
"""Creates a check function for Pandera that validates a column's dtype.
|
390
|
-
|
391
|
-
Supports both standard dtype checking and mixed list/single values for the same type.
|
392
|
-
For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
|
393
|
-
|
394
|
-
Args:
|
395
|
-
expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
|
396
|
-
|
397
|
-
Returns:
|
398
|
-
A function that checks if a series has the expected dtype or contains mixed types
|
399
|
-
"""
|
400
|
-
|
401
|
-
def check_function(series):
|
402
|
-
# first check if the series is entirely of the expected dtype (fast path)
|
403
|
-
if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
|
404
|
-
return True
|
405
|
-
elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
|
406
|
-
return True
|
407
|
-
elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
|
408
|
-
return True
|
409
|
-
elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
|
410
|
-
return True
|
411
|
-
elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
|
412
|
-
return True
|
413
|
-
|
414
|
-
# if we're here, it might be a mixed column with object dtype
|
415
|
-
# need to check each value individually
|
416
|
-
if series.dtype == "object" and expected_type.startswith("list"):
|
417
|
-
expected_type_member = expected_type.replace("list[", "").removesuffix("]")
|
418
|
-
if expected_type_member == "int":
|
419
|
-
return series.apply(lambda x: is_list_of_type(x, int)).all()
|
420
|
-
elif expected_type_member == "float":
|
421
|
-
return series.apply(lambda x: is_list_of_type(x, float)).all()
|
422
|
-
elif expected_type_member == "num":
|
423
|
-
# for numeric, accept either int or float
|
424
|
-
return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
|
425
|
-
elif (
|
426
|
-
expected_type_member == "str"
|
427
|
-
or expected_type_member == "path"
|
428
|
-
or expected_type_member.startswith("cat[")
|
429
|
-
):
|
430
|
-
return series.apply(lambda x: is_list_of_type(x, str)).all()
|
431
|
-
|
432
|
-
# if we get here, the validation failed
|
433
|
-
return False
|
434
|
-
|
435
|
-
return check_function
|
436
|
-
|
437
|
-
|
438
|
-
# this is also currently used as DictCurator
|
439
|
-
class DataFrameCurator(Curator):
|
440
|
-
# the example in the docstring is tested in test_curators_quickstart_example
|
406
|
+
# This is also currently used as DictCurator by flattening dictionaries into wide DataFrames.
|
407
|
+
# Such an approach was never intended and there is room for a DictCurator in the future.
|
408
|
+
# For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and
|
409
|
+
# https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836
|
410
|
+
class ComponentCurator(Curator):
|
441
411
|
"""Curator for `DataFrame`.
|
442
412
|
|
413
|
+
Provides all key functionality to validate Pandas DataFrames.
|
414
|
+
This class is not user facing unlike :class:`~lamindb.curators.DataFrameCurator` which extends this
|
415
|
+
class with functionality to validate the `attrs` slot.
|
416
|
+
|
443
417
|
Args:
|
444
418
|
dataset: The DataFrame-like object to validate & annotate.
|
445
419
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
446
420
|
slot: Indicate the slot in a composite curator for a composite data structure.
|
447
|
-
|
448
|
-
Example:
|
449
|
-
|
450
|
-
For simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.
|
451
|
-
|
452
|
-
Here is an example that enforces a minimal set of columns in the dataframe.
|
453
|
-
|
454
|
-
.. literalinclude:: scripts/curate_dataframe_minimal_errors.py
|
455
|
-
:language: python
|
456
|
-
|
457
|
-
Under-the-hood, this used the following schema.
|
458
|
-
|
459
|
-
.. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
|
460
|
-
:language: python
|
461
|
-
|
462
|
-
Valid features & labels were defined as:
|
463
|
-
|
464
|
-
.. literalinclude:: scripts/define_mini_immuno_features_labels.py
|
465
|
-
:language: python
|
466
421
|
"""
|
467
422
|
|
468
423
|
def __init__(
|
@@ -478,18 +433,18 @@ class DataFrameCurator(Curator):
|
|
478
433
|
feature_ids: set[int] = set()
|
479
434
|
|
480
435
|
if schema.flexible:
|
481
|
-
features += Feature.filter(name__in=self._dataset.keys()).
|
436
|
+
features += Feature.filter(name__in=self._dataset.keys()).to_list()
|
482
437
|
feature_ids = {feature.id for feature in features}
|
483
438
|
|
484
439
|
if schema.n > 0:
|
485
440
|
if schema._index_feature_uid is not None:
|
486
441
|
schema_features = [
|
487
442
|
feature
|
488
|
-
for feature in schema.members.
|
443
|
+
for feature in schema.members.to_list()
|
489
444
|
if feature.uid != schema._index_feature_uid # type: ignore
|
490
445
|
]
|
491
446
|
else:
|
492
|
-
schema_features = schema.members.
|
447
|
+
schema_features = schema.members.to_list() # type: ignore
|
493
448
|
if feature_ids:
|
494
449
|
features.extend(
|
495
450
|
feature
|
@@ -580,9 +535,13 @@ class DataFrameCurator(Curator):
|
|
580
535
|
# in the DataFrameCatManager, we use the
|
581
536
|
# actual columns of the dataset, not the pandera columns
|
582
537
|
# the pandera columns might have additional optional columns
|
538
|
+
if schema.itype == "Composite":
|
539
|
+
columns_field = Feature.name
|
540
|
+
else:
|
541
|
+
columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"]
|
583
542
|
self._cat_manager = DataFrameCatManager(
|
584
543
|
self._dataset,
|
585
|
-
columns_field=
|
544
|
+
columns_field=columns_field,
|
586
545
|
categoricals=categoricals,
|
587
546
|
index=schema.index,
|
588
547
|
slot=slot,
|
@@ -601,6 +560,11 @@ class DataFrameCurator(Curator):
|
|
601
560
|
- Adds missing columns for features
|
602
561
|
- Fills missing values for features with default values
|
603
562
|
"""
|
563
|
+
if self._artifact is not None:
|
564
|
+
raise RuntimeError(
|
565
|
+
"Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
|
566
|
+
)
|
567
|
+
|
604
568
|
for feature in self._schema.members:
|
605
569
|
if feature.name not in self._dataset.columns:
|
606
570
|
if feature.default_value is not None or feature.nullable:
|
@@ -679,35 +643,262 @@ class DataFrameCurator(Curator):
|
|
679
643
|
if not self._is_validated:
|
680
644
|
self.validate() # raises ValidationError if doesn't validate
|
681
645
|
if self._artifact is None:
|
682
|
-
self._artifact = Artifact.
|
646
|
+
self._artifact = Artifact.from_dataframe(
|
683
647
|
self._dataset,
|
684
648
|
key=key,
|
685
649
|
description=description,
|
686
650
|
revises=revises,
|
687
651
|
run=run,
|
688
|
-
format=".csv" if key.endswith(".csv") else None,
|
652
|
+
format=".csv" if key is not None and key.endswith(".csv") else None,
|
689
653
|
)
|
690
|
-
|
691
|
-
|
654
|
+
|
655
|
+
self._artifact.schema = self._schema
|
656
|
+
self._artifact.save()
|
692
657
|
return annotate_artifact( # type: ignore
|
693
658
|
self._artifact,
|
694
659
|
cat_vectors=self.cat._cat_vectors,
|
695
660
|
)
|
696
661
|
|
697
662
|
|
663
|
+
class DataFrameCurator(SlotsCurator):
|
664
|
+
# the example in the docstring is tested in test_curators_quickstart_example
|
665
|
+
"""Curator for `DataFrame`.
|
666
|
+
|
667
|
+
Args:
|
668
|
+
dataset: The DataFrame-like object to validate & annotate.
|
669
|
+
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
670
|
+
slot: Indicate the slot in a composite curator for a composite data structure.
|
671
|
+
|
672
|
+
Examples:
|
673
|
+
|
674
|
+
For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_dataframe`.
|
675
|
+
|
676
|
+
Here is an example that enforces a minimal set of columns in the dataframe.
|
677
|
+
|
678
|
+
.. literalinclude:: scripts/curate_dataframe_minimal_errors.py
|
679
|
+
:language: python
|
680
|
+
|
681
|
+
Under-the-hood, this used the following schema.
|
682
|
+
|
683
|
+
.. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
|
684
|
+
:language: python
|
685
|
+
|
686
|
+
Valid features & labels were defined as:
|
687
|
+
|
688
|
+
.. literalinclude:: scripts/define_mini_immuno_features_labels.py
|
689
|
+
:language: python
|
690
|
+
|
691
|
+
It is also possible to curate the `attrs` slot.
|
692
|
+
|
693
|
+
.. literalinclude:: scripts/curate_dataframe_attrs.py
|
694
|
+
:language: python
|
695
|
+
"""
|
696
|
+
|
697
|
+
def __init__(
|
698
|
+
self,
|
699
|
+
dataset: pd.DataFrame | Artifact,
|
700
|
+
schema: Schema,
|
701
|
+
slot: str | None = None,
|
702
|
+
) -> None:
|
703
|
+
super().__init__(dataset=dataset, schema=schema)
|
704
|
+
|
705
|
+
# Create atomic curator for features only
|
706
|
+
if len(self._schema.features.all()) > 0:
|
707
|
+
self._atomic_curator = ComponentCurator(
|
708
|
+
dataset=dataset,
|
709
|
+
schema=schema,
|
710
|
+
slot=slot,
|
711
|
+
)
|
712
|
+
|
713
|
+
# Handle (nested) attrs
|
714
|
+
if slot is None and schema.slots:
|
715
|
+
for slot_name, slot_schema in schema.slots.items():
|
716
|
+
if slot_name.startswith("attrs"):
|
717
|
+
path_parts = slot_name.split(":")
|
718
|
+
attrs_dict = getattr(self._dataset, "attrs", None)
|
719
|
+
if attrs_dict is not None:
|
720
|
+
if len(path_parts) == 1:
|
721
|
+
data = attrs_dict
|
722
|
+
else:
|
723
|
+
deeper_keys = path_parts[1:]
|
724
|
+
data = _resolve_schema_slot_path(
|
725
|
+
attrs_dict, deeper_keys, slot_name, "attrs"
|
726
|
+
)
|
727
|
+
df = pd.DataFrame([data])
|
728
|
+
self._slots[slot_name] = ComponentCurator(
|
729
|
+
df, slot_schema, slot=slot_name
|
730
|
+
)
|
731
|
+
else:
|
732
|
+
raise ValueError(
|
733
|
+
f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'."
|
734
|
+
)
|
735
|
+
|
736
|
+
@property
|
737
|
+
def cat(self) -> DataFrameCatManager:
|
738
|
+
"""Manage categoricals by updating registries."""
|
739
|
+
if hasattr(self, "_atomic_curator"):
|
740
|
+
return self._atomic_curator.cat
|
741
|
+
raise AttributeError("cat is only available for slots DataFrameCurator")
|
742
|
+
|
743
|
+
def standardize(self) -> None:
|
744
|
+
"""Standardize the dataset.
|
745
|
+
|
746
|
+
- Adds missing columns for features
|
747
|
+
- Fills missing values for features with default values
|
748
|
+
"""
|
749
|
+
if hasattr(self, "_atomic_curator"):
|
750
|
+
self._atomic_curator.standardize()
|
751
|
+
else:
|
752
|
+
for slot_curator in self._slots.values():
|
753
|
+
slot_curator.standardize()
|
754
|
+
|
755
|
+
@doc_args(VALIDATE_DOCSTRING)
|
756
|
+
def validate(self) -> None:
|
757
|
+
"""{}."""
|
758
|
+
if hasattr(self, "_atomic_curator"):
|
759
|
+
self._atomic_curator.validate()
|
760
|
+
self._is_validated = self._atomic_curator._is_validated
|
761
|
+
if self._schema.itype == "Composite":
|
762
|
+
super().validate()
|
763
|
+
|
764
|
+
@doc_args(SAVE_ARTIFACT_DOCSTRING)
|
765
|
+
def save_artifact(
|
766
|
+
self, *, key=None, description=None, revises=None, run=None
|
767
|
+
) -> Artifact:
|
768
|
+
"""{}."""
|
769
|
+
if not self._is_validated:
|
770
|
+
self.validate()
|
771
|
+
|
772
|
+
if self._slots:
|
773
|
+
self._slots["columns"] = self._atomic_curator
|
774
|
+
try:
|
775
|
+
return super().save_artifact(
|
776
|
+
key=key, description=description, revises=revises, run=run
|
777
|
+
)
|
778
|
+
finally:
|
779
|
+
del self._slots["columns"]
|
780
|
+
else:
|
781
|
+
return self._atomic_curator.save_artifact(
|
782
|
+
key=key, description=description, revises=revises, run=run
|
783
|
+
)
|
784
|
+
|
785
|
+
|
786
|
+
def _resolve_schema_slot_path(
|
787
|
+
target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str
|
788
|
+
) -> Any:
|
789
|
+
"""Resolve a schema slot path by traversing nested dictionary keys.
|
790
|
+
|
791
|
+
Args:
|
792
|
+
target_dict: Root dictionary to traverse
|
793
|
+
slot_keys: Sequence of keys defining the paths to traverse
|
794
|
+
slot_name: Schema slot identifier for error context
|
795
|
+
base_path: Base path string for error context
|
796
|
+
|
797
|
+
Returns:
|
798
|
+
The value at the resolved path
|
799
|
+
"""
|
800
|
+
current = target_dict
|
801
|
+
|
802
|
+
for key in slot_keys:
|
803
|
+
base_path += f"['{key}']"
|
804
|
+
try:
|
805
|
+
current = current[key]
|
806
|
+
except KeyError:
|
807
|
+
available = (
|
808
|
+
list(current.keys()) if isinstance(current, dict) else "not a dict"
|
809
|
+
)
|
810
|
+
raise InvalidArgument(
|
811
|
+
f"Schema slot '{slot}' requires keys {base_path} but key '{key}' "
|
812
|
+
f"not found. Available keys at this level: {available}"
|
813
|
+
) from None
|
814
|
+
|
815
|
+
return current
|
816
|
+
|
817
|
+
|
818
|
+
def _handle_dict_slots(
|
819
|
+
dataset: ScverseDataStructures, slot: str
|
820
|
+
) -> tuple[pd.DataFrame | None, str | None, str | None]:
|
821
|
+
"""Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators.
|
822
|
+
|
823
|
+
Supports two patterns:
|
824
|
+
- Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key"
|
825
|
+
- Modality dict access: "modality:uns"
|
826
|
+
|
827
|
+
Args:
|
828
|
+
dataset: The scverse datastructure object
|
829
|
+
slot: The slot path string to parse like 'uns:path:to'.
|
830
|
+
|
831
|
+
Returns:
|
832
|
+
tuple: (dataframe, modality_key, remaining_slot_path)
|
833
|
+
- dataframe: Single-row DataFrame containing the resolved data
|
834
|
+
- modality_key: Modality identifier if slot targets modality dict, else None
|
835
|
+
- remaining_slot_path: The dict attribute and nested keys as string
|
836
|
+
"""
|
837
|
+
path_parts = slot.split(":")
|
838
|
+
|
839
|
+
# Handle direct dict slots: "uns", "attrs", "uns:key1:key2:..."
|
840
|
+
if len(path_parts) >= 1 and path_parts[0] in ["uns", "attrs"]:
|
841
|
+
dict_attr = getattr(dataset, path_parts[0], None)
|
842
|
+
if dict_attr is not None:
|
843
|
+
if len(path_parts) == 1:
|
844
|
+
return pd.DataFrame([dict_attr]), None, path_parts[0]
|
845
|
+
|
846
|
+
deeper_keys = path_parts[1:]
|
847
|
+
data = _resolve_schema_slot_path(
|
848
|
+
dict_attr, deeper_keys, slot, path_parts[0]
|
849
|
+
)
|
850
|
+
return pd.DataFrame([data]), None, ":".join(path_parts[1:])
|
851
|
+
|
852
|
+
# Handle modality dict slots: "modality:uns", "modality:uns:key1:key2"
|
853
|
+
elif len(path_parts) >= 2 and path_parts[1] in ["uns", "attrs"]:
|
854
|
+
modality, dict_name = path_parts[0], path_parts[1]
|
855
|
+
try:
|
856
|
+
modality_dataset = dataset[modality]
|
857
|
+
dict_attr = getattr(modality_dataset, dict_name, None)
|
858
|
+
if dict_attr is not None:
|
859
|
+
if len(path_parts) == 2:
|
860
|
+
return pd.DataFrame([dict_attr]), modality, dict_name
|
861
|
+
|
862
|
+
deeper_keys = path_parts[2:]
|
863
|
+
data = _resolve_schema_slot_path(
|
864
|
+
dict_attr, deeper_keys, slot, f"{modality}.{dict_name}"
|
865
|
+
)
|
866
|
+
return pd.DataFrame([data]), modality, ":".join(path_parts[1:])
|
867
|
+
except (KeyError, AttributeError):
|
868
|
+
pass
|
869
|
+
else:
|
870
|
+
raise InvalidArgument(
|
871
|
+
f"Invalid dict slot pattern '{slot}'. Expected formats: "
|
872
|
+
f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'"
|
873
|
+
)
|
874
|
+
|
875
|
+
return None, None, None
|
876
|
+
|
877
|
+
|
878
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
698
879
|
class AnnDataCurator(SlotsCurator):
|
699
880
|
"""Curator for `AnnData`.
|
700
881
|
|
882
|
+
{}
|
883
|
+
|
701
884
|
Args:
|
702
885
|
dataset: The AnnData-like object to validate & annotate.
|
703
886
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
704
887
|
|
705
|
-
|
888
|
+
Examples:
|
889
|
+
|
890
|
+
Curate Ensembl gene IDs and valid features in obs:
|
706
891
|
|
707
892
|
.. literalinclude:: scripts/curate_anndata_flexible.py
|
708
893
|
:language: python
|
709
894
|
:caption: curate_anndata_flexible.py
|
710
895
|
|
896
|
+
Curate `uns` dictionary:
|
897
|
+
|
898
|
+
.. literalinclude:: scripts/curate_anndata_uns.py
|
899
|
+
:language: python
|
900
|
+
:caption: curate_anndata_uns.py
|
901
|
+
|
711
902
|
See Also:
|
712
903
|
:meth:`~lamindb.Artifact.from_anndata`.
|
713
904
|
"""
|
@@ -720,34 +911,37 @@ class AnnDataCurator(SlotsCurator):
|
|
720
911
|
super().__init__(dataset=dataset, schema=schema)
|
721
912
|
if not data_is_scversedatastructure(self._dataset, "AnnData"):
|
722
913
|
raise InvalidArgument("dataset must be AnnData-like.")
|
723
|
-
if schema.otype != "AnnData":
|
914
|
+
if schema.otype and schema.otype != "AnnData":
|
724
915
|
raise InvalidArgument("Schema otype must be 'AnnData'.")
|
725
|
-
|
726
|
-
|
727
|
-
|
916
|
+
|
917
|
+
for slot, slot_schema in schema.slots.items():
|
918
|
+
if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"):
|
919
|
+
raise ValueError(
|
920
|
+
f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}"
|
921
|
+
)
|
922
|
+
if slot.startswith("uns"):
|
923
|
+
df, _, _ = _handle_dict_slots(self._dataset, slot)
|
924
|
+
elif slot in {"obs", "var", "var.T"}:
|
925
|
+
df = (
|
728
926
|
getattr(self._dataset, slot.strip(".T")).T
|
729
927
|
if slot == "var.T"
|
730
928
|
or (
|
731
|
-
# backward compat
|
732
929
|
slot == "var"
|
733
930
|
and schema.slots["var"].itype not in {None, "Feature"}
|
734
931
|
)
|
735
932
|
else getattr(self._dataset, slot)
|
736
|
-
)
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
"var"
|
749
|
-
].cat._cat_vectors.pop("columns")
|
750
|
-
self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
|
933
|
+
)
|
934
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
|
935
|
+
|
936
|
+
# Handle var index naming for backward compat
|
937
|
+
if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}:
|
938
|
+
logger.warning(
|
939
|
+
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
940
|
+
)
|
941
|
+
self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
|
942
|
+
"var"
|
943
|
+
].cat._cat_vectors.pop("columns")
|
944
|
+
self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
|
751
945
|
|
752
946
|
|
753
947
|
def _assign_var_fields_categoricals_multimodal(
|
@@ -757,11 +951,10 @@ def _assign_var_fields_categoricals_multimodal(
|
|
757
951
|
slot_schema: Schema,
|
758
952
|
var_fields: dict[str, FieldAttr],
|
759
953
|
cat_vectors: dict[str, dict[str, CatVector]],
|
760
|
-
slots: dict[str,
|
954
|
+
slots: dict[str, ComponentCurator],
|
761
955
|
) -> None:
|
762
956
|
"""Assigns var_fields and categoricals for multimodal data curators."""
|
763
957
|
if modality is not None:
|
764
|
-
# Makes sure that all tables are present
|
765
958
|
var_fields[modality] = None
|
766
959
|
cat_vectors[modality] = {}
|
767
960
|
|
@@ -782,15 +975,17 @@ def _assign_var_fields_categoricals_multimodal(
|
|
782
975
|
cat_vectors[modality] = obs_fields
|
783
976
|
|
784
977
|
|
978
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
785
979
|
class MuDataCurator(SlotsCurator):
|
786
980
|
"""Curator for `MuData`.
|
787
981
|
|
982
|
+
{}
|
983
|
+
|
788
984
|
Args:
|
789
985
|
dataset: The MuData-like object to validate & annotate.
|
790
986
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
791
987
|
|
792
988
|
Example:
|
793
|
-
|
794
989
|
.. literalinclude:: scripts/curate_mudata.py
|
795
990
|
:language: python
|
796
991
|
:caption: curate_mudata.py
|
@@ -811,12 +1006,32 @@ class MuDataCurator(SlotsCurator):
|
|
811
1006
|
raise InvalidArgument("Schema otype must be 'MuData'.")
|
812
1007
|
|
813
1008
|
for slot, slot_schema in schema.slots.items():
|
814
|
-
|
815
|
-
|
816
|
-
|
1009
|
+
# Handle slots: "mdata.uns", "modality:uns"
|
1010
|
+
if "uns" in slot:
|
1011
|
+
df, modality, modality_slot = _handle_dict_slots(self._dataset, slot)
|
817
1012
|
else:
|
818
|
-
|
819
|
-
|
1013
|
+
# Handle slots: "modality:obs", "modality:var"
|
1014
|
+
parts = slot.split(":")
|
1015
|
+
if len(parts) == 2:
|
1016
|
+
modality, modality_slot = parts
|
1017
|
+
try:
|
1018
|
+
schema_dataset = self._dataset[modality]
|
1019
|
+
df = getattr(schema_dataset, modality_slot.rstrip(".T"))
|
1020
|
+
except KeyError:
|
1021
|
+
raise InvalidArgument(
|
1022
|
+
f"Modality '{modality}' not found in MuData"
|
1023
|
+
) from None
|
1024
|
+
except AttributeError:
|
1025
|
+
raise InvalidArgument(
|
1026
|
+
f"Attribute '{modality_slot}' not found on modality '{modality}'"
|
1027
|
+
) from None
|
1028
|
+
else:
|
1029
|
+
# Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above)
|
1030
|
+
modality, modality_slot = None, slot
|
1031
|
+
schema_dataset = self._dataset
|
1032
|
+
df = getattr(schema_dataset, modality_slot.rstrip(".T"))
|
1033
|
+
|
1034
|
+
# Transpose var if necessary
|
820
1035
|
if modality_slot == "var" and schema.slots[slot].itype not in {
|
821
1036
|
None,
|
822
1037
|
"Feature",
|
@@ -824,19 +1039,12 @@ class MuDataCurator(SlotsCurator):
|
|
824
1039
|
logger.warning(
|
825
1040
|
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
826
1041
|
)
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
modality_slot == "var"
|
834
|
-
and schema.slots[slot].itype not in {None, "Feature"}
|
835
|
-
)
|
836
|
-
else getattr(schema_dataset, modality_slot)
|
837
|
-
),
|
838
|
-
slot_schema,
|
839
|
-
)
|
1042
|
+
df = df.T
|
1043
|
+
elif modality_slot == "var.T":
|
1044
|
+
df = df.T
|
1045
|
+
|
1046
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
|
1047
|
+
|
840
1048
|
_assign_var_fields_categoricals_multimodal(
|
841
1049
|
modality=modality,
|
842
1050
|
slot_type=modality_slot,
|
@@ -846,18 +1054,21 @@ class MuDataCurator(SlotsCurator):
|
|
846
1054
|
cat_vectors=self._cat_vectors,
|
847
1055
|
slots=self._slots,
|
848
1056
|
)
|
1057
|
+
|
849
1058
|
self._columns_field = self._var_fields
|
850
1059
|
|
851
1060
|
|
1061
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
852
1062
|
class SpatialDataCurator(SlotsCurator):
|
853
1063
|
"""Curator for `SpatialData`.
|
854
1064
|
|
1065
|
+
{}
|
1066
|
+
|
855
1067
|
Args:
|
856
1068
|
dataset: The SpatialData-like object to validate & annotate.
|
857
1069
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
858
1070
|
|
859
1071
|
Example:
|
860
|
-
|
861
1072
|
.. literalinclude:: scripts/curate_spatialdata.py
|
862
1073
|
:language: python
|
863
1074
|
:caption: curate_spatialdata.py
|
@@ -878,69 +1089,75 @@ class SpatialDataCurator(SlotsCurator):
|
|
878
1089
|
raise InvalidArgument("Schema otype must be 'SpatialData'.")
|
879
1090
|
|
880
1091
|
for slot, slot_schema in schema.slots.items():
|
881
|
-
|
882
|
-
if (
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
1092
|
+
# Handle slots: "sdata:attrs"
|
1093
|
+
if slot.startswith("attrs"):
|
1094
|
+
df, table_key, table_slot = _handle_dict_slots(self._dataset, slot)
|
1095
|
+
else:
|
1096
|
+
parts = slot.split(":")
|
1097
|
+
# Handle slots: "tables:table_key:obs", "tables:table_key:var"
|
1098
|
+
if len(parts) == 3 and parts[0] == "tables":
|
1099
|
+
table_key, table_slot = parts[1], parts[2]
|
1100
|
+
try:
|
1101
|
+
slot_object = self._dataset.tables[table_key]
|
1102
|
+
df = getattr(slot_object, table_slot.rstrip(".T"))
|
1103
|
+
except KeyError:
|
1104
|
+
raise InvalidArgument(
|
1105
|
+
f"Table '{table_key}' not found in sdata.tables"
|
1106
|
+
) from None
|
1107
|
+
except AttributeError:
|
1108
|
+
raise InvalidArgument(
|
1109
|
+
f"Attribute '{table_slot}' not found on table '{table_key}'"
|
1110
|
+
) from None
|
890
1111
|
else:
|
891
|
-
|
892
|
-
|
893
|
-
if sub_slot == "var" and schema.slots[slot].itype not in {
|
894
|
-
None,
|
895
|
-
"Feature",
|
896
|
-
}:
|
897
|
-
logger.warning(
|
898
|
-
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
899
|
-
)
|
900
|
-
data_object = (
|
901
|
-
getattr(slot_object, sub_slot.rstrip(".T")).T
|
902
|
-
if sub_slot == "var.T"
|
903
|
-
or (
|
904
|
-
# backward compat
|
905
|
-
sub_slot == "var"
|
906
|
-
and schema.slots[slot].itype not in {None, "Feature"}
|
907
|
-
)
|
908
|
-
else getattr(slot_object, sub_slot)
|
909
|
-
)
|
910
|
-
elif len(split_result) == 1 or (
|
911
|
-
len(split_result) > 1 and split_result[0] == "attrs"
|
912
|
-
):
|
913
|
-
table_key = None
|
914
|
-
if len(split_result) == 1:
|
915
|
-
if split_result[0] != "attrs":
|
1112
|
+
# Handle legacy single keys for backward compatibility
|
1113
|
+
if len(parts) == 1 and parts[0] != "attrs":
|
916
1114
|
logger.warning(
|
917
1115
|
f"please prefix slot {slot} with 'attrs:' going forward"
|
918
1116
|
)
|
919
|
-
|
920
|
-
|
1117
|
+
try:
|
1118
|
+
df = pd.DataFrame([self._dataset.attrs[slot]])
|
1119
|
+
table_key = None
|
1120
|
+
table_slot = slot
|
1121
|
+
except KeyError:
|
1122
|
+
raise InvalidArgument(
|
1123
|
+
f"Slot '{slot}' not found in sdata.attrs"
|
1124
|
+
) from None
|
921
1125
|
else:
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
1126
|
+
raise InvalidArgument(f"Unrecognized slot format: {slot}")
|
1127
|
+
|
1128
|
+
# Handle var transposition logic
|
1129
|
+
if table_slot == "var" and schema.slots[slot].itype not in {
|
1130
|
+
None,
|
1131
|
+
"Feature",
|
1132
|
+
}:
|
1133
|
+
logger.warning(
|
1134
|
+
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
1135
|
+
)
|
1136
|
+
df = df.T
|
1137
|
+
elif table_slot == "var.T":
|
1138
|
+
df = df.T
|
1139
|
+
|
1140
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot)
|
1141
|
+
|
929
1142
|
_assign_var_fields_categoricals_multimodal(
|
930
1143
|
modality=table_key,
|
931
|
-
slot_type=
|
1144
|
+
slot_type=table_slot,
|
932
1145
|
slot=slot,
|
933
1146
|
slot_schema=slot_schema,
|
934
1147
|
var_fields=self._var_fields,
|
935
1148
|
cat_vectors=self._cat_vectors,
|
936
1149
|
slots=self._slots,
|
937
1150
|
)
|
1151
|
+
|
938
1152
|
self._columns_field = self._var_fields
|
939
1153
|
|
940
1154
|
|
1155
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
941
1156
|
class TiledbsomaExperimentCurator(SlotsCurator):
|
942
1157
|
"""Curator for `tiledbsoma.Experiment`.
|
943
1158
|
|
1159
|
+
{}
|
1160
|
+
|
944
1161
|
Args:
|
945
1162
|
dataset: The `tiledbsoma.Experiment` object.
|
946
1163
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
@@ -977,7 +1194,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
|
|
977
1194
|
.drop("soma_joinid", axis=1, errors="ignore")
|
978
1195
|
)
|
979
1196
|
|
980
|
-
self._slots[slot] =
|
1197
|
+
self._slots[slot] = ComponentCurator(
|
981
1198
|
(schema_dataset.T if modality_slot == "var.T" else schema_dataset),
|
982
1199
|
slot_schema,
|
983
1200
|
)
|
@@ -990,7 +1207,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
|
|
990
1207
|
.to_pandas()
|
991
1208
|
.drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
|
992
1209
|
)
|
993
|
-
self._slots[slot] =
|
1210
|
+
self._slots[slot] = ComponentCurator(
|
994
1211
|
schema_dataset,
|
995
1212
|
slot_schema,
|
996
1213
|
)
|
@@ -1040,9 +1257,12 @@ class CatVector:
|
|
1040
1257
|
self._maximal_set = maximal_set
|
1041
1258
|
|
1042
1259
|
self._all_filters = {"source": self._source, "organism": self._organism}
|
1260
|
+
|
1043
1261
|
if self._subtype_str and "=" in self._subtype_str:
|
1044
1262
|
self._all_filters.update(
|
1045
|
-
resolve_relation_filters(
|
1263
|
+
resolve_relation_filters(
|
1264
|
+
parse_filter_string(self._subtype_str), self._field.field.model
|
1265
|
+
) # type: ignore
|
1046
1266
|
)
|
1047
1267
|
|
1048
1268
|
if hasattr(field.field.model, "_name_field"):
|
@@ -1241,7 +1461,7 @@ class CatVector:
|
|
1241
1461
|
type_record = registry.get(name=self._subtype_str)
|
1242
1462
|
if df is not None and registry == Feature:
|
1243
1463
|
nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
|
1244
|
-
non_validated_records = Feature.
|
1464
|
+
non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns])
|
1245
1465
|
else:
|
1246
1466
|
if (
|
1247
1467
|
self._organism
|
@@ -1343,7 +1563,7 @@ class CatVector:
|
|
1343
1563
|
warning_message += "\n for remaining terms:\n"
|
1344
1564
|
warning_message += f" → fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}"
|
1345
1565
|
if self._subtype_query_set is not None:
|
1346
|
-
warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.
|
1566
|
+
warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.to_list('name')}"
|
1347
1567
|
logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
|
1348
1568
|
logger.warning(warning_message)
|
1349
1569
|
if self._cat_manager is not None:
|
@@ -1493,6 +1713,30 @@ class DataFrameCatManager:
|
|
1493
1713
|
"""The categorical features."""
|
1494
1714
|
return self._categoricals
|
1495
1715
|
|
1716
|
+
def __repr__(self) -> str:
|
1717
|
+
cls_name = colors.green(self.__class__.__name__)
|
1718
|
+
|
1719
|
+
status_str = (
|
1720
|
+
f"{colors.green('validated')}"
|
1721
|
+
if self._is_validated
|
1722
|
+
else f"{colors.yellow('unvalidated')}"
|
1723
|
+
)
|
1724
|
+
|
1725
|
+
info_parts = []
|
1726
|
+
|
1727
|
+
cat_count = len(self._categoricals)
|
1728
|
+
if cat_count > 0:
|
1729
|
+
info_parts.append(f"categorical_features={cat_count}")
|
1730
|
+
|
1731
|
+
if self._slot:
|
1732
|
+
info_parts.append(f"slot: {colors.italic(self._slot)}")
|
1733
|
+
|
1734
|
+
info_str = ", ".join(info_parts)
|
1735
|
+
if info_str:
|
1736
|
+
return f"{cls_name}({info_str}, {status_str})"
|
1737
|
+
else:
|
1738
|
+
return f"{cls_name}({status_str})"
|
1739
|
+
|
1496
1740
|
def lookup(self, public: bool = False) -> CatLookup:
|
1497
1741
|
"""Lookup categories.
|
1498
1742
|
|
@@ -1537,7 +1781,9 @@ class DataFrameCatManager:
|
|
1537
1781
|
key: The key referencing the column in the DataFrame to standardize.
|
1538
1782
|
"""
|
1539
1783
|
if self._artifact is not None:
|
1540
|
-
raise RuntimeError(
|
1784
|
+
raise RuntimeError(
|
1785
|
+
"Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
|
1786
|
+
)
|
1541
1787
|
|
1542
1788
|
if key == "all":
|
1543
1789
|
logger.warning(
|
@@ -1610,7 +1856,7 @@ def get_organism_kwargs(
|
|
1610
1856
|
def annotate_artifact(
|
1611
1857
|
artifact: Artifact,
|
1612
1858
|
*,
|
1613
|
-
curator:
|
1859
|
+
curator: SlotsCurator | None = None,
|
1614
1860
|
cat_vectors: dict[str, CatVector] | None = None,
|
1615
1861
|
) -> Artifact:
|
1616
1862
|
from .. import settings
|
@@ -1643,7 +1889,9 @@ def annotate_artifact(
|
|
1643
1889
|
)
|
1644
1890
|
|
1645
1891
|
# annotate with inferred schemas aka feature sets
|
1646
|
-
if
|
1892
|
+
if (
|
1893
|
+
artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None
|
1894
|
+
): # Prevent overwriting user-defined schemas that contain slots
|
1647
1895
|
features = cat_vectors["columns"].records
|
1648
1896
|
if features is not None:
|
1649
1897
|
index_feature = artifact.schema.index
|
@@ -1663,7 +1911,11 @@ def annotate_artifact(
|
|
1663
1911
|
logger.important(
|
1664
1912
|
f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
|
1665
1913
|
)
|
1666
|
-
itype =
|
1914
|
+
itype = (
|
1915
|
+
Feature.name
|
1916
|
+
if artifact.schema.itype == "Composite"
|
1917
|
+
else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
|
1918
|
+
)
|
1667
1919
|
feature_set = Schema(itype=itype, n=len(features))
|
1668
1920
|
artifact.feature_sets.add(
|
1669
1921
|
feature_set.save(), through_defaults={"slot": "columns"}
|
@@ -1698,9 +1950,13 @@ def annotate_artifact(
|
|
1698
1950
|
logger.important(
|
1699
1951
|
f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
|
1700
1952
|
)
|
1701
|
-
itype =
|
1702
|
-
|
1703
|
-
|
1953
|
+
itype = (
|
1954
|
+
Feature.name
|
1955
|
+
if artifact.schema.slots[slot].itype == "Composite"
|
1956
|
+
else parse_cat_dtype(
|
1957
|
+
artifact.schema.slots[slot].itype, is_itype=True
|
1958
|
+
)["field"]
|
1959
|
+
)
|
1704
1960
|
feature_set = Schema(itype=itype, n=len(features))
|
1705
1961
|
artifact.feature_sets.add(
|
1706
1962
|
feature_set.save(), through_defaults={"slot": slot}
|