lamindb 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +89 -49
- lamindb/_finish.py +17 -15
- lamindb/_tracked.py +2 -4
- lamindb/_view.py +1 -1
- lamindb/base/__init__.py +2 -1
- lamindb/base/dtypes.py +76 -0
- lamindb/core/_settings.py +2 -2
- lamindb/core/storage/_anndata_accessor.py +29 -9
- lamindb/curators/_legacy.py +16 -3
- lamindb/curators/core.py +442 -188
- lamindb/errors.py +6 -0
- lamindb/examples/cellxgene/__init__.py +8 -3
- lamindb/examples/cellxgene/_cellxgene.py +127 -13
- lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
- lamindb/examples/croissant/__init__.py +32 -6
- lamindb/examples/datasets/__init__.py +2 -2
- lamindb/examples/datasets/_core.py +9 -2
- lamindb/examples/datasets/_small.py +66 -22
- lamindb/examples/fixtures/sheets.py +8 -2
- lamindb/integrations/_croissant.py +34 -11
- lamindb/migrations/0119_squashed.py +5 -2
- lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
- lamindb/migrations/0121_recorduser.py +60 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +2 -2
- lamindb/models/_feature_manager.py +131 -71
- lamindb/models/_from_values.py +2 -2
- lamindb/models/_is_versioned.py +4 -4
- lamindb/models/_label_manager.py +4 -4
- lamindb/models/artifact.py +326 -172
- lamindb/models/artifact_set.py +45 -1
- lamindb/models/can_curate.py +1 -2
- lamindb/models/collection.py +3 -34
- lamindb/models/feature.py +111 -7
- lamindb/models/has_parents.py +11 -11
- lamindb/models/project.py +18 -0
- lamindb/models/query_manager.py +16 -7
- lamindb/models/query_set.py +191 -78
- lamindb/models/record.py +30 -5
- lamindb/models/run.py +10 -33
- lamindb/models/save.py +6 -8
- lamindb/models/schema.py +54 -26
- lamindb/models/sqlrecord.py +152 -40
- lamindb/models/storage.py +59 -14
- lamindb/models/transform.py +17 -17
- lamindb/models/ulabel.py +6 -1
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/METADATA +12 -18
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/RECORD +50 -47
- {lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/WHEEL +1 -1
- {lamindb-1.10.2.dist-info/licenses → lamindb-1.11.0.dist-info}/LICENSE +0 -0
lamindb/curators/core.py
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
|
6
6
|
Curator
|
7
7
|
SlotsCurator
|
8
|
+
ComponentCurator
|
8
9
|
CatVector
|
9
10
|
CatLookup
|
10
11
|
DataFrameCatManager
|
@@ -15,7 +16,6 @@ from __future__ import annotations
|
|
15
16
|
|
16
17
|
import copy
|
17
18
|
import re
|
18
|
-
from collections.abc import Iterable
|
19
19
|
from typing import TYPE_CHECKING, Any, Callable
|
20
20
|
|
21
21
|
import lamindb_setup as ln_setup
|
@@ -24,7 +24,9 @@ import pandas as pd
|
|
24
24
|
import pandera.pandas as pandera
|
25
25
|
from lamin_utils import colors, logger
|
26
26
|
from lamindb_setup.core._docs import doc_args
|
27
|
+
from lamindb_setup.core.upath import LocalPathClasses
|
27
28
|
|
29
|
+
from lamindb.base.dtypes import check_dtype
|
28
30
|
from lamindb.base.types import FieldAttr # noqa
|
29
31
|
from lamindb.models import (
|
30
32
|
Artifact,
|
@@ -48,6 +50,7 @@ from lamindb.models.feature import (
|
|
48
50
|
from ..errors import InvalidArgument, ValidationError
|
49
51
|
|
50
52
|
if TYPE_CHECKING:
|
53
|
+
from collections.abc import Iterable
|
51
54
|
from typing import Any
|
52
55
|
|
53
56
|
from anndata import AnnData
|
@@ -145,6 +148,7 @@ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
|
|
145
148
|
|
146
149
|
SLOTS_DOCSTRING = """Access sub curators by slot."""
|
147
150
|
|
151
|
+
SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures."""
|
148
152
|
|
149
153
|
VALIDATE_DOCSTRING = """Validate dataset against Schema.
|
150
154
|
|
@@ -197,7 +201,21 @@ class Curator:
|
|
197
201
|
"MuData",
|
198
202
|
"SpatialData",
|
199
203
|
}:
|
200
|
-
|
204
|
+
# Open remote AnnData Artifacts
|
205
|
+
if not isinstance(self._artifact.path, LocalPathClasses):
|
206
|
+
if self._artifact.otype in {
|
207
|
+
"AnnData",
|
208
|
+
}:
|
209
|
+
try:
|
210
|
+
self._dataset = self._dataset.open(mode="r")
|
211
|
+
# open can raise various errors. Fall back to loading into memory if open fails
|
212
|
+
except Exception as e:
|
213
|
+
logger.warning(
|
214
|
+
f"Unable to open remote AnnData Artifact: {e}. Falling back to loading into memory."
|
215
|
+
)
|
216
|
+
self._dataset = self._dataset.load(is_run_input=False)
|
217
|
+
else:
|
218
|
+
self._dataset = self._dataset.load(is_run_input=False)
|
201
219
|
self._schema: Schema | None = schema
|
202
220
|
self._is_validated: bool = False
|
203
221
|
|
@@ -284,9 +302,12 @@ class Curator:
|
|
284
302
|
)
|
285
303
|
|
286
304
|
|
305
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
287
306
|
class SlotsCurator(Curator):
|
288
307
|
"""Curator for a dataset with slots.
|
289
308
|
|
309
|
+
{}
|
310
|
+
|
290
311
|
Args:
|
291
312
|
dataset: The dataset to validate & annotate.
|
292
313
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
@@ -298,7 +319,7 @@ class SlotsCurator(Curator):
|
|
298
319
|
schema: Schema,
|
299
320
|
) -> None:
|
300
321
|
super().__init__(dataset=dataset, schema=schema)
|
301
|
-
self._slots: dict[str,
|
322
|
+
self._slots: dict[str, ComponentCurator] = {}
|
302
323
|
|
303
324
|
# used for multimodal data structures (not AnnData)
|
304
325
|
# in form of {table/modality_key: var_field}
|
@@ -308,7 +329,7 @@ class SlotsCurator(Curator):
|
|
308
329
|
|
309
330
|
@property
|
310
331
|
@doc_args(SLOTS_DOCSTRING)
|
311
|
-
def slots(self) -> dict[str,
|
332
|
+
def slots(self) -> dict[str, ComponentCurator]:
|
312
333
|
"""{}""" # noqa: D415
|
313
334
|
return self._slots
|
314
335
|
|
@@ -336,6 +357,10 @@ class SlotsCurator(Curator):
|
|
336
357
|
|
337
358
|
if self._artifact is None:
|
338
359
|
type_mapping = [
|
360
|
+
(
|
361
|
+
lambda dataset: isinstance(dataset, pd.DataFrame),
|
362
|
+
Artifact.from_dataframe,
|
363
|
+
),
|
339
364
|
(
|
340
365
|
lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
|
341
366
|
Artifact.from_anndata,
|
@@ -378,92 +403,21 @@ class SlotsCurator(Curator):
|
|
378
403
|
)
|
379
404
|
|
380
405
|
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
return isinstance(value, expected_type)
|
387
|
-
|
388
|
-
|
389
|
-
def check_dtype(expected_type) -> Callable:
|
390
|
-
"""Creates a check function for Pandera that validates a column's dtype.
|
391
|
-
|
392
|
-
Supports both standard dtype checking and mixed list/single values for the same type.
|
393
|
-
For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
|
394
|
-
|
395
|
-
Args:
|
396
|
-
expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
|
397
|
-
|
398
|
-
Returns:
|
399
|
-
A function that checks if a series has the expected dtype or contains mixed types
|
400
|
-
"""
|
401
|
-
|
402
|
-
def check_function(series):
|
403
|
-
# first check if the series is entirely of the expected dtype (fast path)
|
404
|
-
if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
|
405
|
-
return True
|
406
|
-
elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
|
407
|
-
return True
|
408
|
-
elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
|
409
|
-
return True
|
410
|
-
elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
|
411
|
-
return True
|
412
|
-
elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
|
413
|
-
return True
|
414
|
-
|
415
|
-
# if we're here, it might be a mixed column with object dtype
|
416
|
-
# need to check each value individually
|
417
|
-
if series.dtype == "object" and expected_type.startswith("list"):
|
418
|
-
expected_type_member = expected_type.replace("list[", "").removesuffix("]")
|
419
|
-
if expected_type_member == "int":
|
420
|
-
return series.apply(lambda x: is_list_of_type(x, int)).all()
|
421
|
-
elif expected_type_member == "float":
|
422
|
-
return series.apply(lambda x: is_list_of_type(x, float)).all()
|
423
|
-
elif expected_type_member == "num":
|
424
|
-
# for numeric, accept either int or float
|
425
|
-
return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
|
426
|
-
elif (
|
427
|
-
expected_type_member == "str"
|
428
|
-
or expected_type_member == "path"
|
429
|
-
or expected_type_member.startswith("cat[")
|
430
|
-
):
|
431
|
-
return series.apply(lambda x: is_list_of_type(x, str)).all()
|
432
|
-
|
433
|
-
# if we get here, the validation failed
|
434
|
-
return False
|
435
|
-
|
436
|
-
return check_function
|
437
|
-
|
438
|
-
|
439
|
-
# this is also currently used as DictCurator
|
440
|
-
class DataFrameCurator(Curator):
|
441
|
-
# the example in the docstring is tested in test_curators_quickstart_example
|
406
|
+
# This is also currently used as DictCurator by flattening dictionaries into wide DataFrames.
|
407
|
+
# Such an approach was never intended and there is room for a DictCurator in the future.
|
408
|
+
# For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and
|
409
|
+
# https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836
|
410
|
+
class ComponentCurator(Curator):
|
442
411
|
"""Curator for `DataFrame`.
|
443
412
|
|
413
|
+
Provides all key functionality to validate Pandas DataFrames.
|
414
|
+
This class is not user facing unlike :class:`~lamindb.curators.DataFrameCurator` which extends this
|
415
|
+
class with functionality to validate the `attrs` slot.
|
416
|
+
|
444
417
|
Args:
|
445
418
|
dataset: The DataFrame-like object to validate & annotate.
|
446
419
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
447
420
|
slot: Indicate the slot in a composite curator for a composite data structure.
|
448
|
-
|
449
|
-
Example:
|
450
|
-
|
451
|
-
For simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.
|
452
|
-
|
453
|
-
Here is an example that enforces a minimal set of columns in the dataframe.
|
454
|
-
|
455
|
-
.. literalinclude:: scripts/curate_dataframe_minimal_errors.py
|
456
|
-
:language: python
|
457
|
-
|
458
|
-
Under-the-hood, this used the following schema.
|
459
|
-
|
460
|
-
.. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
|
461
|
-
:language: python
|
462
|
-
|
463
|
-
Valid features & labels were defined as:
|
464
|
-
|
465
|
-
.. literalinclude:: scripts/define_mini_immuno_features_labels.py
|
466
|
-
:language: python
|
467
421
|
"""
|
468
422
|
|
469
423
|
def __init__(
|
@@ -479,18 +433,18 @@ class DataFrameCurator(Curator):
|
|
479
433
|
feature_ids: set[int] = set()
|
480
434
|
|
481
435
|
if schema.flexible:
|
482
|
-
features += Feature.filter(name__in=self._dataset.keys()).
|
436
|
+
features += Feature.filter(name__in=self._dataset.keys()).to_list()
|
483
437
|
feature_ids = {feature.id for feature in features}
|
484
438
|
|
485
439
|
if schema.n > 0:
|
486
440
|
if schema._index_feature_uid is not None:
|
487
441
|
schema_features = [
|
488
442
|
feature
|
489
|
-
for feature in schema.members.
|
443
|
+
for feature in schema.members.to_list()
|
490
444
|
if feature.uid != schema._index_feature_uid # type: ignore
|
491
445
|
]
|
492
446
|
else:
|
493
|
-
schema_features = schema.members.
|
447
|
+
schema_features = schema.members.to_list() # type: ignore
|
494
448
|
if feature_ids:
|
495
449
|
features.extend(
|
496
450
|
feature
|
@@ -581,9 +535,13 @@ class DataFrameCurator(Curator):
|
|
581
535
|
# in the DataFrameCatManager, we use the
|
582
536
|
# actual columns of the dataset, not the pandera columns
|
583
537
|
# the pandera columns might have additional optional columns
|
538
|
+
if schema.itype == "Composite":
|
539
|
+
columns_field = Feature.name
|
540
|
+
else:
|
541
|
+
columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"]
|
584
542
|
self._cat_manager = DataFrameCatManager(
|
585
543
|
self._dataset,
|
586
|
-
columns_field=
|
544
|
+
columns_field=columns_field,
|
587
545
|
categoricals=categoricals,
|
588
546
|
index=schema.index,
|
589
547
|
slot=slot,
|
@@ -602,6 +560,11 @@ class DataFrameCurator(Curator):
|
|
602
560
|
- Adds missing columns for features
|
603
561
|
- Fills missing values for features with default values
|
604
562
|
"""
|
563
|
+
if self._artifact is not None:
|
564
|
+
raise RuntimeError(
|
565
|
+
"Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
|
566
|
+
)
|
567
|
+
|
605
568
|
for feature in self._schema.members:
|
606
569
|
if feature.name not in self._dataset.columns:
|
607
570
|
if feature.default_value is not None or feature.nullable:
|
@@ -680,7 +643,7 @@ class DataFrameCurator(Curator):
|
|
680
643
|
if not self._is_validated:
|
681
644
|
self.validate() # raises ValidationError if doesn't validate
|
682
645
|
if self._artifact is None:
|
683
|
-
self._artifact = Artifact.
|
646
|
+
self._artifact = Artifact.from_dataframe(
|
684
647
|
self._dataset,
|
685
648
|
key=key,
|
686
649
|
description=description,
|
@@ -697,19 +660,245 @@ class DataFrameCurator(Curator):
|
|
697
660
|
)
|
698
661
|
|
699
662
|
|
663
|
+
class DataFrameCurator(SlotsCurator):
    # the example in the docstring is tested in test_curators_quickstart_example
    """Curator for `DataFrame`.

    The columns of the dataframe are validated by an internal
    :class:`ComponentCurator` (only created when the schema has features).
    Schema slots whose name starts with `attrs` are validated by additional
    :class:`ComponentCurator` objects built from the dataframe's `attrs` dict.

    Args:
        dataset: The DataFrame-like object to validate & annotate.
        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
        slot: Indicate the slot in a composite curator for a composite data structure.

    Examples:

        For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_dataframe`.

        Here is an example that enforces a minimal set of columns in the dataframe.

        .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
            :language: python

        Under-the-hood, this used the following schema.

        .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
            :language: python

        Valid features & labels were defined as:

        .. literalinclude:: scripts/define_mini_immuno_features_labels.py
            :language: python

        It is also possible to curate the `attrs` slot.

        .. literalinclude:: scripts/curate_dataframe_attrs.py
            :language: python
    """

    def __init__(
        self,
        dataset: pd.DataFrame | Artifact,
        schema: Schema,
        slot: str | None = None,
    ) -> None:
        super().__init__(dataset=dataset, schema=schema)

        # Create atomic curator for features only; it stays undefined for
        # schemas without features, hence the `hasattr`/`getattr` guards below.
        if len(self._schema.features.all()) > 0:
            self._atomic_curator = ComponentCurator(
                dataset=dataset,
                schema=schema,
                slot=slot,
            )

        # Handle (nested) attrs
        if slot is None and schema.slots:
            for slot_name, slot_schema in schema.slots.items():
                if not slot_name.startswith("attrs"):
                    raise ValueError(
                        f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'."
                    )
                attrs_dict = getattr(self._dataset, "attrs", None)
                if attrs_dict is None:
                    # nothing to curate for this slot
                    continue
                path_parts = slot_name.split(":")
                if len(path_parts) == 1:
                    data = attrs_dict
                else:
                    # "attrs:key1:key2" -> attrs["key1"]["key2"]
                    data = _resolve_schema_slot_path(
                        attrs_dict, path_parts[1:], slot_name, "attrs"
                    )
                # the dict is flattened into a single-row DataFrame for validation
                df = pd.DataFrame([data])
                self._slots[slot_name] = ComponentCurator(
                    df, slot_schema, slot=slot_name
                )

    @property
    def cat(self) -> DataFrameCatManager:
        """Manage categoricals by updating registries."""
        if hasattr(self, "_atomic_curator"):
            return self._atomic_curator.cat
        raise AttributeError("cat is only available for slots DataFrameCurator")

    def standardize(self) -> None:
        """Standardize the dataset.

        - Adds missing columns for features
        - Fills missing values for features with default values
        """
        if hasattr(self, "_atomic_curator"):
            self._atomic_curator.standardize()
        else:
            for slot_curator in self._slots.values():
                slot_curator.standardize()

    @doc_args(VALIDATE_DOCSTRING)
    def validate(self) -> None:
        """{}."""
        if hasattr(self, "_atomic_curator"):
            self._atomic_curator.validate()
            self._is_validated = self._atomic_curator._is_validated
        if self._schema.itype == "Composite":
            super().validate()

    @doc_args(SAVE_ARTIFACT_DOCSTRING)
    def save_artifact(
        self, *, key=None, description=None, revises=None, run=None
    ) -> Artifact:
        """{}."""
        if not self._is_validated:
            self.validate()

        # The column curator only exists when the schema has features.
        atomic_curator = getattr(self, "_atomic_curator", None)

        if atomic_curator is not None and not self._slots:
            return atomic_curator.save_artifact(
                key=key, description=description, revises=revises, run=run
            )

        # Slots are present (or there is no column curator at all): delegate to
        # the SlotsCurator implementation. The column curator, if any, is
        # temporarily registered under "columns" for the duration of the call.
        # NOTE(review): the branch without features and without slots previously
        # raised AttributeError; it now relies on the parent implementation.
        if atomic_curator is not None:
            self._slots["columns"] = atomic_curator
        try:
            return super().save_artifact(
                key=key, description=description, revises=revises, run=run
            )
        finally:
            self._slots.pop("columns", None)
|
784
|
+
|
785
|
+
|
786
|
+
def _resolve_schema_slot_path(
|
787
|
+
target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str
|
788
|
+
) -> Any:
|
789
|
+
"""Resolve a schema slot path by traversing nested dictionary keys.
|
790
|
+
|
791
|
+
Args:
|
792
|
+
target_dict: Root dictionary to traverse
|
793
|
+
slot_keys: Sequence of keys defining the paths to traverse
|
794
|
+
slot_name: Schema slot identifier for error context
|
795
|
+
base_path: Base path string for error context
|
796
|
+
|
797
|
+
Returns:
|
798
|
+
The value at the resolved path
|
799
|
+
"""
|
800
|
+
current = target_dict
|
801
|
+
|
802
|
+
for key in slot_keys:
|
803
|
+
base_path += f"['{key}']"
|
804
|
+
try:
|
805
|
+
current = current[key]
|
806
|
+
except KeyError:
|
807
|
+
available = (
|
808
|
+
list(current.keys()) if isinstance(current, dict) else "not a dict"
|
809
|
+
)
|
810
|
+
raise InvalidArgument(
|
811
|
+
f"Schema slot '{slot}' requires keys {base_path} but key '{key}' "
|
812
|
+
f"not found. Available keys at this level: {available}"
|
813
|
+
) from None
|
814
|
+
|
815
|
+
return current
|
816
|
+
|
817
|
+
|
818
|
+
def _handle_dict_slots(
|
819
|
+
dataset: ScverseDataStructures, slot: str
|
820
|
+
) -> tuple[pd.DataFrame | None, str | None, str | None]:
|
821
|
+
"""Handle dict-based slot paths (uns/attrs standalone or of modalities) for all ScverseCurators.
|
822
|
+
|
823
|
+
Supports two patterns:
|
824
|
+
- Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key"
|
825
|
+
- Modality dict access: "modality:uns"
|
826
|
+
|
827
|
+
Args:
|
828
|
+
dataset: The scverse datastructure object
|
829
|
+
slot: The slot path string to parse like 'uns:path:to'.
|
830
|
+
|
831
|
+
Returns:
|
832
|
+
tuple: (dataframe, modality_key, remaining_slot_path)
|
833
|
+
- dataframe: Single-row DataFrame containing the resolved data
|
834
|
+
- modality_key: Modality identifier if slot targets modality dict, else None
|
835
|
+
- remaining_slot_path: The dict attribute and nested keys as string
|
836
|
+
"""
|
837
|
+
path_parts = slot.split(":")
|
838
|
+
|
839
|
+
# Handle direct dict slots: "uns", "attrs", "uns:key1:key2:..."
|
840
|
+
if len(path_parts) >= 1 and path_parts[0] in ["uns", "attrs"]:
|
841
|
+
dict_attr = getattr(dataset, path_parts[0], None)
|
842
|
+
if dict_attr is not None:
|
843
|
+
if len(path_parts) == 1:
|
844
|
+
return pd.DataFrame([dict_attr]), None, path_parts[0]
|
845
|
+
|
846
|
+
deeper_keys = path_parts[1:]
|
847
|
+
data = _resolve_schema_slot_path(
|
848
|
+
dict_attr, deeper_keys, slot, path_parts[0]
|
849
|
+
)
|
850
|
+
return pd.DataFrame([data]), None, ":".join(path_parts[1:])
|
851
|
+
|
852
|
+
# Handle modality dict slots: "modality:uns", "modality:uns:key1:key2"
|
853
|
+
elif len(path_parts) >= 2 and path_parts[1] in ["uns", "attrs"]:
|
854
|
+
modality, dict_name = path_parts[0], path_parts[1]
|
855
|
+
try:
|
856
|
+
modality_dataset = dataset[modality]
|
857
|
+
dict_attr = getattr(modality_dataset, dict_name, None)
|
858
|
+
if dict_attr is not None:
|
859
|
+
if len(path_parts) == 2:
|
860
|
+
return pd.DataFrame([dict_attr]), modality, dict_name
|
861
|
+
|
862
|
+
deeper_keys = path_parts[2:]
|
863
|
+
data = _resolve_schema_slot_path(
|
864
|
+
dict_attr, deeper_keys, slot, f"{modality}.{dict_name}"
|
865
|
+
)
|
866
|
+
return pd.DataFrame([data]), modality, ":".join(path_parts[1:])
|
867
|
+
except (KeyError, AttributeError):
|
868
|
+
pass
|
869
|
+
else:
|
870
|
+
raise InvalidArgument(
|
871
|
+
f"Invalid dict slot pattern '{slot}'. Expected formats: "
|
872
|
+
f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'"
|
873
|
+
)
|
874
|
+
|
875
|
+
return None, None, None
|
876
|
+
|
877
|
+
|
878
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
700
879
|
class AnnDataCurator(SlotsCurator):
|
701
880
|
"""Curator for `AnnData`.
|
702
881
|
|
882
|
+
{}
|
883
|
+
|
703
884
|
Args:
|
704
885
|
dataset: The AnnData-like object to validate & annotate.
|
705
886
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
706
887
|
|
707
|
-
|
888
|
+
Examples:
|
889
|
+
|
890
|
+
Curate Ensembl gene IDs and valid features in obs:
|
708
891
|
|
709
892
|
.. literalinclude:: scripts/curate_anndata_flexible.py
|
710
893
|
:language: python
|
711
894
|
:caption: curate_anndata_flexible.py
|
712
895
|
|
896
|
+
Curate `uns` dictionary:
|
897
|
+
|
898
|
+
.. literalinclude:: scripts/curate_anndata_uns.py
|
899
|
+
:language: python
|
900
|
+
:caption: curate_anndata_uns.py
|
901
|
+
|
713
902
|
See Also:
|
714
903
|
:meth:`~lamindb.Artifact.from_anndata`.
|
715
904
|
"""
|
@@ -722,34 +911,37 @@ class AnnDataCurator(SlotsCurator):
|
|
722
911
|
super().__init__(dataset=dataset, schema=schema)
|
723
912
|
if not data_is_scversedatastructure(self._dataset, "AnnData"):
|
724
913
|
raise InvalidArgument("dataset must be AnnData-like.")
|
725
|
-
if schema.otype != "AnnData":
|
914
|
+
if schema.otype and schema.otype != "AnnData":
|
726
915
|
raise InvalidArgument("Schema otype must be 'AnnData'.")
|
727
|
-
|
728
|
-
|
729
|
-
|
916
|
+
|
917
|
+
for slot, slot_schema in schema.slots.items():
|
918
|
+
if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"):
|
919
|
+
raise ValueError(
|
920
|
+
f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}"
|
921
|
+
)
|
922
|
+
if slot.startswith("uns"):
|
923
|
+
df, _, _ = _handle_dict_slots(self._dataset, slot)
|
924
|
+
elif slot in {"obs", "var", "var.T"}:
|
925
|
+
df = (
|
730
926
|
getattr(self._dataset, slot.strip(".T")).T
|
731
927
|
if slot == "var.T"
|
732
928
|
or (
|
733
|
-
# backward compat
|
734
929
|
slot == "var"
|
735
930
|
and schema.slots["var"].itype not in {None, "Feature"}
|
736
931
|
)
|
737
932
|
else getattr(self._dataset, slot)
|
738
|
-
)
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
"var"
|
751
|
-
].cat._cat_vectors.pop("columns")
|
752
|
-
self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
|
933
|
+
)
|
934
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
|
935
|
+
|
936
|
+
# Handle var index naming for backward compat
|
937
|
+
if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}:
|
938
|
+
logger.warning(
|
939
|
+
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
940
|
+
)
|
941
|
+
self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
|
942
|
+
"var"
|
943
|
+
].cat._cat_vectors.pop("columns")
|
944
|
+
self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
|
753
945
|
|
754
946
|
|
755
947
|
def _assign_var_fields_categoricals_multimodal(
|
@@ -759,11 +951,10 @@ def _assign_var_fields_categoricals_multimodal(
|
|
759
951
|
slot_schema: Schema,
|
760
952
|
var_fields: dict[str, FieldAttr],
|
761
953
|
cat_vectors: dict[str, dict[str, CatVector]],
|
762
|
-
slots: dict[str,
|
954
|
+
slots: dict[str, ComponentCurator],
|
763
955
|
) -> None:
|
764
956
|
"""Assigns var_fields and categoricals for multimodal data curators."""
|
765
957
|
if modality is not None:
|
766
|
-
# Makes sure that all tables are present
|
767
958
|
var_fields[modality] = None
|
768
959
|
cat_vectors[modality] = {}
|
769
960
|
|
@@ -784,15 +975,17 @@ def _assign_var_fields_categoricals_multimodal(
|
|
784
975
|
cat_vectors[modality] = obs_fields
|
785
976
|
|
786
977
|
|
978
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
787
979
|
class MuDataCurator(SlotsCurator):
|
788
980
|
"""Curator for `MuData`.
|
789
981
|
|
982
|
+
{}
|
983
|
+
|
790
984
|
Args:
|
791
985
|
dataset: The MuData-like object to validate & annotate.
|
792
986
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
793
987
|
|
794
988
|
Example:
|
795
|
-
|
796
989
|
.. literalinclude:: scripts/curate_mudata.py
|
797
990
|
:language: python
|
798
991
|
:caption: curate_mudata.py
|
@@ -813,12 +1006,32 @@ class MuDataCurator(SlotsCurator):
|
|
813
1006
|
raise InvalidArgument("Schema otype must be 'MuData'.")
|
814
1007
|
|
815
1008
|
for slot, slot_schema in schema.slots.items():
|
816
|
-
|
817
|
-
|
818
|
-
|
1009
|
+
# Handle slots: "mdata.uns", "modality:uns"
|
1010
|
+
if "uns" in slot:
|
1011
|
+
df, modality, modality_slot = _handle_dict_slots(self._dataset, slot)
|
819
1012
|
else:
|
820
|
-
|
821
|
-
|
1013
|
+
# Handle slots: "modality:obs", "modality:var"
|
1014
|
+
parts = slot.split(":")
|
1015
|
+
if len(parts) == 2:
|
1016
|
+
modality, modality_slot = parts
|
1017
|
+
try:
|
1018
|
+
schema_dataset = self._dataset[modality]
|
1019
|
+
df = getattr(schema_dataset, modality_slot.rstrip(".T"))
|
1020
|
+
except KeyError:
|
1021
|
+
raise InvalidArgument(
|
1022
|
+
f"Modality '{modality}' not found in MuData"
|
1023
|
+
) from None
|
1024
|
+
except AttributeError:
|
1025
|
+
raise InvalidArgument(
|
1026
|
+
f"Attribute '{modality_slot}' not found on modality '{modality}'"
|
1027
|
+
) from None
|
1028
|
+
else:
|
1029
|
+
# Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above)
|
1030
|
+
modality, modality_slot = None, slot
|
1031
|
+
schema_dataset = self._dataset
|
1032
|
+
df = getattr(schema_dataset, modality_slot.rstrip(".T"))
|
1033
|
+
|
1034
|
+
# Transpose var if necessary
|
822
1035
|
if modality_slot == "var" and schema.slots[slot].itype not in {
|
823
1036
|
None,
|
824
1037
|
"Feature",
|
@@ -826,19 +1039,12 @@ class MuDataCurator(SlotsCurator):
|
|
826
1039
|
logger.warning(
|
827
1040
|
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
828
1041
|
)
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
835
|
-
modality_slot == "var"
|
836
|
-
and schema.slots[slot].itype not in {None, "Feature"}
|
837
|
-
)
|
838
|
-
else getattr(schema_dataset, modality_slot)
|
839
|
-
),
|
840
|
-
slot_schema,
|
841
|
-
)
|
1042
|
+
df = df.T
|
1043
|
+
elif modality_slot == "var.T":
|
1044
|
+
df = df.T
|
1045
|
+
|
1046
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
|
1047
|
+
|
842
1048
|
_assign_var_fields_categoricals_multimodal(
|
843
1049
|
modality=modality,
|
844
1050
|
slot_type=modality_slot,
|
@@ -848,18 +1054,21 @@ class MuDataCurator(SlotsCurator):
|
|
848
1054
|
cat_vectors=self._cat_vectors,
|
849
1055
|
slots=self._slots,
|
850
1056
|
)
|
1057
|
+
|
851
1058
|
self._columns_field = self._var_fields
|
852
1059
|
|
853
1060
|
|
1061
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
854
1062
|
class SpatialDataCurator(SlotsCurator):
|
855
1063
|
"""Curator for `SpatialData`.
|
856
1064
|
|
1065
|
+
{}
|
1066
|
+
|
857
1067
|
Args:
|
858
1068
|
dataset: The SpatialData-like object to validate & annotate.
|
859
1069
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
860
1070
|
|
861
1071
|
Example:
|
862
|
-
|
863
1072
|
.. literalinclude:: scripts/curate_spatialdata.py
|
864
1073
|
:language: python
|
865
1074
|
:caption: curate_spatialdata.py
|
@@ -880,69 +1089,75 @@ class SpatialDataCurator(SlotsCurator):
|
|
880
1089
|
raise InvalidArgument("Schema otype must be 'SpatialData'.")
|
881
1090
|
|
882
1091
|
for slot, slot_schema in schema.slots.items():
|
883
|
-
|
884
|
-
if (
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
1092
|
+
# Handle slots: "sdata:attrs"
|
1093
|
+
if slot.startswith("attrs"):
|
1094
|
+
df, table_key, table_slot = _handle_dict_slots(self._dataset, slot)
|
1095
|
+
else:
|
1096
|
+
parts = slot.split(":")
|
1097
|
+
# Handle slots: "tables:table_key:obs", "tables:table_key:var"
|
1098
|
+
if len(parts) == 3 and parts[0] == "tables":
|
1099
|
+
table_key, table_slot = parts[1], parts[2]
|
1100
|
+
try:
|
1101
|
+
slot_object = self._dataset.tables[table_key]
|
1102
|
+
df = getattr(slot_object, table_slot.rstrip(".T"))
|
1103
|
+
except KeyError:
|
1104
|
+
raise InvalidArgument(
|
1105
|
+
f"Table '{table_key}' not found in sdata.tables"
|
1106
|
+
) from None
|
1107
|
+
except AttributeError:
|
1108
|
+
raise InvalidArgument(
|
1109
|
+
f"Attribute '{table_slot}' not found on table '{table_key}'"
|
1110
|
+
) from None
|
892
1111
|
else:
|
893
|
-
|
894
|
-
|
895
|
-
if sub_slot == "var" and schema.slots[slot].itype not in {
|
896
|
-
None,
|
897
|
-
"Feature",
|
898
|
-
}:
|
899
|
-
logger.warning(
|
900
|
-
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
901
|
-
)
|
902
|
-
data_object = (
|
903
|
-
getattr(slot_object, sub_slot.rstrip(".T")).T
|
904
|
-
if sub_slot == "var.T"
|
905
|
-
or (
|
906
|
-
# backward compat
|
907
|
-
sub_slot == "var"
|
908
|
-
and schema.slots[slot].itype not in {None, "Feature"}
|
909
|
-
)
|
910
|
-
else getattr(slot_object, sub_slot)
|
911
|
-
)
|
912
|
-
elif len(split_result) == 1 or (
|
913
|
-
len(split_result) > 1 and split_result[0] == "attrs"
|
914
|
-
):
|
915
|
-
table_key = None
|
916
|
-
if len(split_result) == 1:
|
917
|
-
if split_result[0] != "attrs":
|
1112
|
+
# Handle legacy single keys for backward compatibility
|
1113
|
+
if len(parts) == 1 and parts[0] != "attrs":
|
918
1114
|
logger.warning(
|
919
1115
|
f"please prefix slot {slot} with 'attrs:' going forward"
|
920
1116
|
)
|
921
|
-
|
922
|
-
|
1117
|
+
try:
|
1118
|
+
df = pd.DataFrame([self._dataset.attrs[slot]])
|
1119
|
+
table_key = None
|
1120
|
+
table_slot = slot
|
1121
|
+
except KeyError:
|
1122
|
+
raise InvalidArgument(
|
1123
|
+
f"Slot '{slot}' not found in sdata.attrs"
|
1124
|
+
) from None
|
923
1125
|
else:
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
1126
|
+
raise InvalidArgument(f"Unrecognized slot format: {slot}")
|
1127
|
+
|
1128
|
+
# Handle var transposition logic
|
1129
|
+
if table_slot == "var" and schema.slots[slot].itype not in {
|
1130
|
+
None,
|
1131
|
+
"Feature",
|
1132
|
+
}:
|
1133
|
+
logger.warning(
|
1134
|
+
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
1135
|
+
)
|
1136
|
+
df = df.T
|
1137
|
+
elif table_slot == "var.T":
|
1138
|
+
df = df.T
|
1139
|
+
|
1140
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot)
|
1141
|
+
|
931
1142
|
_assign_var_fields_categoricals_multimodal(
|
932
1143
|
modality=table_key,
|
933
|
-
slot_type=
|
1144
|
+
slot_type=table_slot,
|
934
1145
|
slot=slot,
|
935
1146
|
slot_schema=slot_schema,
|
936
1147
|
var_fields=self._var_fields,
|
937
1148
|
cat_vectors=self._cat_vectors,
|
938
1149
|
slots=self._slots,
|
939
1150
|
)
|
1151
|
+
|
940
1152
|
self._columns_field = self._var_fields
|
941
1153
|
|
942
1154
|
|
1155
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
943
1156
|
class TiledbsomaExperimentCurator(SlotsCurator):
|
944
1157
|
"""Curator for `tiledbsoma.Experiment`.
|
945
1158
|
|
1159
|
+
{}
|
1160
|
+
|
946
1161
|
Args:
|
947
1162
|
dataset: The `tiledbsoma.Experiment` object.
|
948
1163
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
@@ -979,7 +1194,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
|
|
979
1194
|
.drop("soma_joinid", axis=1, errors="ignore")
|
980
1195
|
)
|
981
1196
|
|
982
|
-
self._slots[slot] =
|
1197
|
+
self._slots[slot] = ComponentCurator(
|
983
1198
|
(schema_dataset.T if modality_slot == "var.T" else schema_dataset),
|
984
1199
|
slot_schema,
|
985
1200
|
)
|
@@ -992,7 +1207,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
|
|
992
1207
|
.to_pandas()
|
993
1208
|
.drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
|
994
1209
|
)
|
995
|
-
self._slots[slot] =
|
1210
|
+
self._slots[slot] = ComponentCurator(
|
996
1211
|
schema_dataset,
|
997
1212
|
slot_schema,
|
998
1213
|
)
|
@@ -1042,9 +1257,12 @@ class CatVector:
|
|
1042
1257
|
self._maximal_set = maximal_set
|
1043
1258
|
|
1044
1259
|
self._all_filters = {"source": self._source, "organism": self._organism}
|
1260
|
+
|
1045
1261
|
if self._subtype_str and "=" in self._subtype_str:
|
1046
1262
|
self._all_filters.update(
|
1047
|
-
resolve_relation_filters(
|
1263
|
+
resolve_relation_filters(
|
1264
|
+
parse_filter_string(self._subtype_str), self._field.field.model
|
1265
|
+
) # type: ignore
|
1048
1266
|
)
|
1049
1267
|
|
1050
1268
|
if hasattr(field.field.model, "_name_field"):
|
@@ -1243,7 +1461,7 @@ class CatVector:
|
|
1243
1461
|
type_record = registry.get(name=self._subtype_str)
|
1244
1462
|
if df is not None and registry == Feature:
|
1245
1463
|
nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
|
1246
|
-
non_validated_records = Feature.
|
1464
|
+
non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns])
|
1247
1465
|
else:
|
1248
1466
|
if (
|
1249
1467
|
self._organism
|
@@ -1345,7 +1563,7 @@ class CatVector:
|
|
1345
1563
|
warning_message += "\n for remaining terms:\n"
|
1346
1564
|
warning_message += f" → fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}"
|
1347
1565
|
if self._subtype_query_set is not None:
|
1348
|
-
warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.
|
1566
|
+
warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.to_list('name')}"
|
1349
1567
|
logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
|
1350
1568
|
logger.warning(warning_message)
|
1351
1569
|
if self._cat_manager is not None:
|
@@ -1495,6 +1713,30 @@ class DataFrameCatManager:
|
|
1495
1713
|
"""The categorical features."""
|
1496
1714
|
return self._categoricals
|
1497
1715
|
|
1716
|
+
def __repr__(self) -> str:
|
1717
|
+
cls_name = colors.green(self.__class__.__name__)
|
1718
|
+
|
1719
|
+
status_str = (
|
1720
|
+
f"{colors.green('validated')}"
|
1721
|
+
if self._is_validated
|
1722
|
+
else f"{colors.yellow('unvalidated')}"
|
1723
|
+
)
|
1724
|
+
|
1725
|
+
info_parts = []
|
1726
|
+
|
1727
|
+
cat_count = len(self._categoricals)
|
1728
|
+
if cat_count > 0:
|
1729
|
+
info_parts.append(f"categorical_features={cat_count}")
|
1730
|
+
|
1731
|
+
if self._slot:
|
1732
|
+
info_parts.append(f"slot: {colors.italic(self._slot)}")
|
1733
|
+
|
1734
|
+
info_str = ", ".join(info_parts)
|
1735
|
+
if info_str:
|
1736
|
+
return f"{cls_name}({info_str}, {status_str})"
|
1737
|
+
else:
|
1738
|
+
return f"{cls_name}({status_str})"
|
1739
|
+
|
1498
1740
|
def lookup(self, public: bool = False) -> CatLookup:
|
1499
1741
|
"""Lookup categories.
|
1500
1742
|
|
@@ -1539,7 +1781,9 @@ class DataFrameCatManager:
|
|
1539
1781
|
key: The key referencing the column in the DataFrame to standardize.
|
1540
1782
|
"""
|
1541
1783
|
if self._artifact is not None:
|
1542
|
-
raise RuntimeError(
|
1784
|
+
raise RuntimeError(
|
1785
|
+
"Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
|
1786
|
+
)
|
1543
1787
|
|
1544
1788
|
if key == "all":
|
1545
1789
|
logger.warning(
|
@@ -1612,7 +1856,7 @@ def get_organism_kwargs(
|
|
1612
1856
|
def annotate_artifact(
|
1613
1857
|
artifact: Artifact,
|
1614
1858
|
*,
|
1615
|
-
curator:
|
1859
|
+
curator: SlotsCurator | None = None,
|
1616
1860
|
cat_vectors: dict[str, CatVector] | None = None,
|
1617
1861
|
) -> Artifact:
|
1618
1862
|
from .. import settings
|
@@ -1645,7 +1889,9 @@ def annotate_artifact(
|
|
1645
1889
|
)
|
1646
1890
|
|
1647
1891
|
# annotate with inferred schemas aka feature sets
|
1648
|
-
if
|
1892
|
+
if (
|
1893
|
+
artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None
|
1894
|
+
): # Prevent overwriting user-defined schemas that contain slots
|
1649
1895
|
features = cat_vectors["columns"].records
|
1650
1896
|
if features is not None:
|
1651
1897
|
index_feature = artifact.schema.index
|
@@ -1665,7 +1911,11 @@ def annotate_artifact(
|
|
1665
1911
|
logger.important(
|
1666
1912
|
f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
|
1667
1913
|
)
|
1668
|
-
itype =
|
1914
|
+
itype = (
|
1915
|
+
Feature.name
|
1916
|
+
if artifact.schema.itype == "Composite"
|
1917
|
+
else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
|
1918
|
+
)
|
1669
1919
|
feature_set = Schema(itype=itype, n=len(features))
|
1670
1920
|
artifact.feature_sets.add(
|
1671
1921
|
feature_set.save(), through_defaults={"slot": "columns"}
|
@@ -1700,9 +1950,13 @@ def annotate_artifact(
|
|
1700
1950
|
logger.important(
|
1701
1951
|
f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
|
1702
1952
|
)
|
1703
|
-
itype =
|
1704
|
-
|
1705
|
-
|
1953
|
+
itype = (
|
1954
|
+
Feature.name
|
1955
|
+
if artifact.schema.slots[slot].itype == "Composite"
|
1956
|
+
else parse_cat_dtype(
|
1957
|
+
artifact.schema.slots[slot].itype, is_itype=True
|
1958
|
+
)["field"]
|
1959
|
+
)
|
1706
1960
|
feature_set = Schema(itype=itype, n=len(features))
|
1707
1961
|
artifact.feature_sets.add(
|
1708
1962
|
feature_set.save(), through_defaults={"slot": slot}
|