lamindb 1.10.1__py3-none-any.whl → 1.11a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +89 -49
- lamindb/_finish.py +14 -12
- lamindb/_tracked.py +2 -4
- lamindb/_view.py +1 -1
- lamindb/base/__init__.py +2 -1
- lamindb/base/dtypes.py +76 -0
- lamindb/core/_settings.py +45 -2
- lamindb/core/storage/_anndata_accessor.py +118 -26
- lamindb/core/storage/_backed_access.py +10 -7
- lamindb/core/storage/_spatialdata_accessor.py +15 -4
- lamindb/core/storage/_zarr.py +3 -0
- lamindb/curators/_legacy.py +16 -3
- lamindb/curators/core.py +439 -191
- lamindb/examples/cellxgene/__init__.py +8 -3
- lamindb/examples/cellxgene/_cellxgene.py +127 -13
- lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
- lamindb/examples/croissant/__init__.py +12 -2
- lamindb/examples/datasets/__init__.py +2 -2
- lamindb/examples/datasets/_core.py +1 -1
- lamindb/examples/datasets/_small.py +66 -22
- lamindb/examples/datasets/mini_immuno.py +1 -0
- lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
- lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
- lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
- lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
- lamindb/migrations/0121_recorduser.py +53 -0
- lamindb/models/__init__.py +3 -1
- lamindb/models/_describe.py +2 -2
- lamindb/models/_feature_manager.py +53 -53
- lamindb/models/_from_values.py +2 -2
- lamindb/models/_is_versioned.py +4 -4
- lamindb/models/_label_manager.py +4 -4
- lamindb/models/artifact.py +336 -136
- lamindb/models/artifact_set.py +36 -1
- lamindb/models/can_curate.py +1 -2
- lamindb/models/collection.py +3 -34
- lamindb/models/feature.py +111 -7
- lamindb/models/has_parents.py +11 -11
- lamindb/models/project.py +42 -2
- lamindb/models/query_manager.py +16 -7
- lamindb/models/query_set.py +59 -34
- lamindb/models/record.py +25 -4
- lamindb/models/run.py +8 -6
- lamindb/models/schema.py +54 -26
- lamindb/models/sqlrecord.py +123 -25
- lamindb/models/storage.py +59 -14
- lamindb/models/transform.py +17 -17
- lamindb/models/ulabel.py +6 -1
- {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/METADATA +3 -3
- {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/RECORD +52 -47
- {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/LICENSE +0 -0
- {lamindb-1.10.1.dist-info → lamindb-1.11a1.dist-info}/WHEEL +0 -0
lamindb/curators/core.py
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
|
6
6
|
Curator
|
7
7
|
SlotsCurator
|
8
|
+
ComponentCurator
|
8
9
|
CatVector
|
9
10
|
CatLookup
|
10
11
|
DataFrameCatManager
|
@@ -15,7 +16,6 @@ from __future__ import annotations
|
|
15
16
|
|
16
17
|
import copy
|
17
18
|
import re
|
18
|
-
from collections.abc import Iterable
|
19
19
|
from typing import TYPE_CHECKING, Any, Callable
|
20
20
|
|
21
21
|
import lamindb_setup as ln_setup
|
@@ -24,7 +24,9 @@ import pandas as pd
|
|
24
24
|
import pandera.pandas as pandera
|
25
25
|
from lamin_utils import colors, logger
|
26
26
|
from lamindb_setup.core._docs import doc_args
|
27
|
+
from lamindb_setup.core.upath import LocalPathClasses
|
27
28
|
|
29
|
+
from lamindb.base.dtypes import check_dtype
|
28
30
|
from lamindb.base.types import FieldAttr # noqa
|
29
31
|
from lamindb.models import (
|
30
32
|
Artifact,
|
@@ -48,6 +50,7 @@ from lamindb.models.feature import (
|
|
48
50
|
from ..errors import InvalidArgument, ValidationError
|
49
51
|
|
50
52
|
if TYPE_CHECKING:
|
53
|
+
from collections.abc import Iterable
|
51
54
|
from typing import Any
|
52
55
|
|
53
56
|
from anndata import AnnData
|
@@ -145,6 +148,7 @@ CAT_MANAGER_DOCSTRING = """Manage categoricals by updating registries."""
|
|
145
148
|
|
146
149
|
SLOTS_DOCSTRING = """Access sub curators by slot."""
|
147
150
|
|
151
|
+
SLOTS_DETAILS_DOCSTRING = """Uses **slots** to specify which component contains which schema. Slots are keys that identify where features are stored within composite data structures."""
|
148
152
|
|
149
153
|
VALIDATE_DOCSTRING = """Validate dataset against Schema.
|
150
154
|
|
@@ -197,7 +201,21 @@ class Curator:
|
|
197
201
|
"MuData",
|
198
202
|
"SpatialData",
|
199
203
|
}:
|
200
|
-
|
204
|
+
# Open remote AnnData Artifacts
|
205
|
+
if not isinstance(self._artifact.path, LocalPathClasses):
|
206
|
+
if self._artifact.otype in {
|
207
|
+
"AnnData",
|
208
|
+
}:
|
209
|
+
try:
|
210
|
+
self._dataset = self._dataset.open(mode="r")
|
211
|
+
# open can raise various errors. Fall back to loading into memory if open fails
|
212
|
+
except Exception as e:
|
213
|
+
logger.warning(
|
214
|
+
f"Unable to open remote AnnData Artifact: {e}. Falling back to loading into memory."
|
215
|
+
)
|
216
|
+
self._dataset = self._dataset.load(is_run_input=False)
|
217
|
+
else:
|
218
|
+
self._dataset = self._dataset.load(is_run_input=False)
|
201
219
|
self._schema: Schema | None = schema
|
202
220
|
self._is_validated: bool = False
|
203
221
|
|
@@ -284,9 +302,12 @@ class Curator:
|
|
284
302
|
)
|
285
303
|
|
286
304
|
|
305
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
287
306
|
class SlotsCurator(Curator):
|
288
307
|
"""Curator for a dataset with slots.
|
289
308
|
|
309
|
+
{}
|
310
|
+
|
290
311
|
Args:
|
291
312
|
dataset: The dataset to validate & annotate.
|
292
313
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
@@ -298,7 +319,7 @@ class SlotsCurator(Curator):
|
|
298
319
|
schema: Schema,
|
299
320
|
) -> None:
|
300
321
|
super().__init__(dataset=dataset, schema=schema)
|
301
|
-
self._slots: dict[str,
|
322
|
+
self._slots: dict[str, ComponentCurator] = {}
|
302
323
|
|
303
324
|
# used for multimodal data structures (not AnnData)
|
304
325
|
# in form of {table/modality_key: var_field}
|
@@ -308,7 +329,7 @@ class SlotsCurator(Curator):
|
|
308
329
|
|
309
330
|
@property
|
310
331
|
@doc_args(SLOTS_DOCSTRING)
|
311
|
-
def slots(self) -> dict[str,
|
332
|
+
def slots(self) -> dict[str, ComponentCurator]:
|
312
333
|
"""{}""" # noqa: D415
|
313
334
|
return self._slots
|
314
335
|
|
@@ -336,6 +357,10 @@ class SlotsCurator(Curator):
|
|
336
357
|
|
337
358
|
if self._artifact is None:
|
338
359
|
type_mapping = [
|
360
|
+
(
|
361
|
+
lambda dataset: isinstance(dataset, pd.DataFrame),
|
362
|
+
Artifact.from_dataframe,
|
363
|
+
),
|
339
364
|
(
|
340
365
|
lambda dataset: data_is_scversedatastructure(dataset, "AnnData"),
|
341
366
|
Artifact.from_anndata,
|
@@ -364,12 +389,13 @@ class SlotsCurator(Curator):
|
|
364
389
|
)
|
365
390
|
break
|
366
391
|
|
367
|
-
self._artifact.schema = self._schema
|
368
|
-
self._artifact.save()
|
369
392
|
cat_vectors = {}
|
370
393
|
for curator in self._slots.values():
|
371
394
|
for key, cat_vector in curator.cat._cat_vectors.items():
|
372
395
|
cat_vectors[key] = cat_vector
|
396
|
+
|
397
|
+
self._artifact.schema = self._schema
|
398
|
+
self._artifact.save()
|
373
399
|
return annotate_artifact( # type: ignore
|
374
400
|
self._artifact,
|
375
401
|
curator=self,
|
@@ -377,92 +403,21 @@ class SlotsCurator(Curator):
|
|
377
403
|
)
|
378
404
|
|
379
405
|
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
return isinstance(value, expected_type)
|
386
|
-
|
387
|
-
|
388
|
-
def check_dtype(expected_type) -> Callable:
|
389
|
-
"""Creates a check function for Pandera that validates a column's dtype.
|
390
|
-
|
391
|
-
Supports both standard dtype checking and mixed list/single values for the same type.
|
392
|
-
For example, a column with expected_type 'float' would also accept a mix of float values and lists of floats.
|
393
|
-
|
394
|
-
Args:
|
395
|
-
expected_type: String identifier for the expected type ('int', 'float', 'num', 'str')
|
396
|
-
|
397
|
-
Returns:
|
398
|
-
A function that checks if a series has the expected dtype or contains mixed types
|
399
|
-
"""
|
400
|
-
|
401
|
-
def check_function(series):
|
402
|
-
# first check if the series is entirely of the expected dtype (fast path)
|
403
|
-
if expected_type == "int" and pd.api.types.is_integer_dtype(series.dtype):
|
404
|
-
return True
|
405
|
-
elif expected_type == "float" and pd.api.types.is_float_dtype(series.dtype):
|
406
|
-
return True
|
407
|
-
elif expected_type == "num" and pd.api.types.is_numeric_dtype(series.dtype):
|
408
|
-
return True
|
409
|
-
elif expected_type == "str" and pd.api.types.is_string_dtype(series.dtype):
|
410
|
-
return True
|
411
|
-
elif expected_type == "path" and pd.api.types.is_string_dtype(series.dtype):
|
412
|
-
return True
|
413
|
-
|
414
|
-
# if we're here, it might be a mixed column with object dtype
|
415
|
-
# need to check each value individually
|
416
|
-
if series.dtype == "object" and expected_type.startswith("list"):
|
417
|
-
expected_type_member = expected_type.replace("list[", "").removesuffix("]")
|
418
|
-
if expected_type_member == "int":
|
419
|
-
return series.apply(lambda x: is_list_of_type(x, int)).all()
|
420
|
-
elif expected_type_member == "float":
|
421
|
-
return series.apply(lambda x: is_list_of_type(x, float)).all()
|
422
|
-
elif expected_type_member == "num":
|
423
|
-
# for numeric, accept either int or float
|
424
|
-
return series.apply(lambda x: is_list_of_type(x, (int, float))).all()
|
425
|
-
elif (
|
426
|
-
expected_type_member == "str"
|
427
|
-
or expected_type_member == "path"
|
428
|
-
or expected_type_member.startswith("cat[")
|
429
|
-
):
|
430
|
-
return series.apply(lambda x: is_list_of_type(x, str)).all()
|
431
|
-
|
432
|
-
# if we get here, the validation failed
|
433
|
-
return False
|
434
|
-
|
435
|
-
return check_function
|
436
|
-
|
437
|
-
|
438
|
-
# this is also currently used as DictCurator
|
439
|
-
class DataFrameCurator(Curator):
|
440
|
-
# the example in the docstring is tested in test_curators_quickstart_example
|
406
|
+
# This is also currently used as DictCurator by flattening dictionaries into wide DataFrames.
|
407
|
+
# Such an approach was never intended and there is room for a DictCurator in the future.
|
408
|
+
# For more context, read https://laminlabs.slack.com/archives/C07DB677JF6/p1753994077716099 and
|
409
|
+
# https://www.notion.so/laminlabs/Add-a-DictCurator-2422aeaa55e180b9a513f91d13970836
|
410
|
+
class ComponentCurator(Curator):
|
441
411
|
"""Curator for `DataFrame`.
|
442
412
|
|
413
|
+
Provides all key functionality to validate Pandas DataFrames.
|
414
|
+
This class is not user-facing, unlike :class:`~lamindb.DataFrameCurator`, which wraps this
|
415
|
+
class and adds functionality to validate the `attrs` slot.
|
416
|
+
|
443
417
|
Args:
|
444
418
|
dataset: The DataFrame-like object to validate & annotate.
|
445
419
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
446
420
|
slot: Indicate the slot in a composite curator for a composite data structure.
|
447
|
-
|
448
|
-
Example:
|
449
|
-
|
450
|
-
For simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.
|
451
|
-
|
452
|
-
Here is an example that enforces a minimal set of columns in the dataframe.
|
453
|
-
|
454
|
-
.. literalinclude:: scripts/curate_dataframe_minimal_errors.py
|
455
|
-
:language: python
|
456
|
-
|
457
|
-
Under-the-hood, this used the following schema.
|
458
|
-
|
459
|
-
.. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
|
460
|
-
:language: python
|
461
|
-
|
462
|
-
Valid features & labels were defined as:
|
463
|
-
|
464
|
-
.. literalinclude:: scripts/define_mini_immuno_features_labels.py
|
465
|
-
:language: python
|
466
421
|
"""
|
467
422
|
|
468
423
|
def __init__(
|
@@ -478,18 +433,18 @@ class DataFrameCurator(Curator):
|
|
478
433
|
feature_ids: set[int] = set()
|
479
434
|
|
480
435
|
if schema.flexible:
|
481
|
-
features += Feature.filter(name__in=self._dataset.keys()).
|
436
|
+
features += Feature.filter(name__in=self._dataset.keys()).to_list()
|
482
437
|
feature_ids = {feature.id for feature in features}
|
483
438
|
|
484
439
|
if schema.n > 0:
|
485
440
|
if schema._index_feature_uid is not None:
|
486
441
|
schema_features = [
|
487
442
|
feature
|
488
|
-
for feature in schema.members.
|
443
|
+
for feature in schema.members.to_list()
|
489
444
|
if feature.uid != schema._index_feature_uid # type: ignore
|
490
445
|
]
|
491
446
|
else:
|
492
|
-
schema_features = schema.members.
|
447
|
+
schema_features = schema.members.to_list() # type: ignore
|
493
448
|
if feature_ids:
|
494
449
|
features.extend(
|
495
450
|
feature
|
@@ -580,9 +535,13 @@ class DataFrameCurator(Curator):
|
|
580
535
|
# in the DataFrameCatManager, we use the
|
581
536
|
# actual columns of the dataset, not the pandera columns
|
582
537
|
# the pandera columns might have additional optional columns
|
538
|
+
if schema.itype == "Composite":
|
539
|
+
columns_field = Feature.name
|
540
|
+
else:
|
541
|
+
columns_field = parse_cat_dtype(schema.itype, is_itype=True)["field"]
|
583
542
|
self._cat_manager = DataFrameCatManager(
|
584
543
|
self._dataset,
|
585
|
-
columns_field=
|
544
|
+
columns_field=columns_field,
|
586
545
|
categoricals=categoricals,
|
587
546
|
index=schema.index,
|
588
547
|
slot=slot,
|
@@ -601,6 +560,11 @@ class DataFrameCurator(Curator):
|
|
601
560
|
- Adds missing columns for features
|
602
561
|
- Fills missing values for features with default values
|
603
562
|
"""
|
563
|
+
if self._artifact is not None:
|
564
|
+
raise RuntimeError(
|
565
|
+
"Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
|
566
|
+
)
|
567
|
+
|
604
568
|
for feature in self._schema.members:
|
605
569
|
if feature.name not in self._dataset.columns:
|
606
570
|
if feature.default_value is not None or feature.nullable:
|
@@ -679,25 +643,244 @@ class DataFrameCurator(Curator):
|
|
679
643
|
if not self._is_validated:
|
680
644
|
self.validate() # raises ValidationError if doesn't validate
|
681
645
|
if self._artifact is None:
|
682
|
-
self._artifact = Artifact.
|
646
|
+
self._artifact = Artifact.from_dataframe(
|
683
647
|
self._dataset,
|
684
648
|
key=key,
|
685
649
|
description=description,
|
686
650
|
revises=revises,
|
687
651
|
run=run,
|
688
|
-
format=".csv" if key.endswith(".csv") else None,
|
652
|
+
format=".csv" if key is not None and key.endswith(".csv") else None,
|
689
653
|
)
|
690
|
-
|
691
|
-
|
654
|
+
|
655
|
+
self._artifact.schema = self._schema
|
656
|
+
self._artifact.save()
|
692
657
|
return annotate_artifact( # type: ignore
|
693
658
|
self._artifact,
|
694
659
|
cat_vectors=self.cat._cat_vectors,
|
695
660
|
)
|
696
661
|
|
697
662
|
|
663
|
+
class DataFrameCurator(SlotsCurator):
    # the example in the docstring is tested in test_curators_quickstart_example
    """Curator for `DataFrame`.

    Column features are validated by an internal :class:`ComponentCurator`
    (only created if the schema defines features); `attrs` slots of the schema
    are validated by one additional :class:`ComponentCurator` per slot.

    Args:
        dataset: The DataFrame-like object to validate & annotate.
        schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
        slot: Indicate the slot in a composite curator for a composite data structure.

    Examples:

        For a simple example using a flexible schema, see :meth:`~lamindb.Artifact.from_df`.

        Here is an example that enforces a minimal set of columns in the dataframe.

        .. literalinclude:: scripts/curate_dataframe_minimal_errors.py
            :language: python

        Under-the-hood, this used the following schema.

        .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
            :language: python

        Valid features & labels were defined as:

        .. literalinclude:: scripts/define_mini_immuno_features_labels.py
            :language: python

        It is also possible to curate the `attrs` slot.

        .. literalinclude:: scripts/curate_dataframe_attrs.py
            :language: python
    """

    def __init__(
        self,
        dataset: pd.DataFrame | Artifact,
        schema: Schema,
        slot: str | None = None,
    ) -> None:
        super().__init__(dataset=dataset, schema=schema)

        # Create atomic curator for features only; the attribute is deliberately
        # left unset when the schema has no features, the other methods check for it
        if len(self._schema.features.all()) > 0:
            self._atomic_curator = ComponentCurator(
                dataset=dataset,
                schema=schema,
                slot=slot,
            )

        # Handle (nested) attrs, only when this curator is not itself a slot curator
        if slot is None and schema.slots:
            for slot_name, slot_schema in schema.slots.items():
                if slot_name.startswith("attrs"):
                    path_parts = slot_name.split(":")
                    attrs_dict = getattr(self._dataset, "attrs", None)
                    if attrs_dict is not None:
                        if len(path_parts) == 1:
                            data = attrs_dict
                        else:
                            # "attrs:key1:key2" -> follow key1, key2 inside attrs
                            deeper_keys = path_parts[1:]
                            data = _resolve_schema_slot_path(
                                attrs_dict, deeper_keys, slot_name, "attrs"
                            )
                        # the dict is validated as a single-row DataFrame
                        df = pd.DataFrame([data])
                        self._slots[slot_name] = ComponentCurator(
                            df, slot_schema, slot=slot_name
                        )
                else:
                    raise ValueError(
                        f"Slot '{slot_name}' is not supported for DataFrameCurator. Must be 'attrs'."
                    )

    @property
    def cat(self) -> DataFrameCatManager:
        """Manage categoricals by updating registries."""
        if hasattr(self, "_atomic_curator"):
            return self._atomic_curator.cat
        raise AttributeError("cat is only available for slots DataFrameCurator")

    def standardize(self) -> None:
        """Standardize the dataset.

        - Adds missing columns for features
        - Fills missing values for features with default values
        """
        if hasattr(self, "_atomic_curator"):
            self._atomic_curator.standardize()
        else:
            for slot_curator in self._slots.values():
                slot_curator.standardize()

    @doc_args(VALIDATE_DOCSTRING)
    def validate(self) -> None:
        """{}."""
        if hasattr(self, "_atomic_curator"):
            self._atomic_curator.validate()
            self._is_validated = self._atomic_curator._is_validated
        if self._schema.itype == "Composite":
            super().validate()

    @doc_args(SAVE_ARTIFACT_DOCSTRING)
    def save_artifact(
        self, *, key=None, description=None, revises=None, run=None
    ) -> Artifact:
        """{}."""
        if not self._is_validated:
            self.validate()

        # `_atomic_curator` only exists if the schema defines features (see `__init__`),
        # so it must not be accessed unconditionally
        atomic_curator = getattr(self, "_atomic_curator", None)

        if atomic_curator is None:
            # attrs-only schema: there is no column curator to register or delegate to
            return super().save_artifact(
                key=key, description=description, revises=revises, run=run
            )

        if not self._slots:
            # plain DataFrame schema without `attrs` slots
            return atomic_curator.save_artifact(
                key=key, description=description, revises=revises, run=run
            )

        # temporarily expose the column curator as a slot for the parent
        # implementation, then restore the user-visible slots
        self._slots["columns"] = atomic_curator
        try:
            return super().save_artifact(
                key=key, description=description, revises=revises, run=run
            )
        finally:
            self._slots.pop("columns", None)
|
784
|
+
|
785
|
+
|
786
|
+
def _resolve_schema_slot_path(
|
787
|
+
target_dict: dict[str, Any], slot_keys: Iterable[str], slot: str, base_path: str
|
788
|
+
) -> Any:
|
789
|
+
"""Resolve a schema slot path by traversing nested dictionary keys.
|
790
|
+
|
791
|
+
Args:
|
792
|
+
target_dict: Root dictionary to traverse
|
793
|
+
slot_keys: Sequence of keys defining the paths to traverse
|
794
|
+
slot_name: Schema slot identifier for error context
|
795
|
+
base_path: Base path string for error context
|
796
|
+
|
797
|
+
Returns:
|
798
|
+
The value at the resolved path
|
799
|
+
"""
|
800
|
+
current = target_dict
|
801
|
+
|
802
|
+
for key in slot_keys:
|
803
|
+
base_path += f"['{key}']"
|
804
|
+
try:
|
805
|
+
current = current[key]
|
806
|
+
except KeyError:
|
807
|
+
available = (
|
808
|
+
list(current.keys()) if isinstance(current, dict) else "not a dict"
|
809
|
+
)
|
810
|
+
raise InvalidArgument(
|
811
|
+
f"Schema slot '{slot}' requires keys {base_path} but key '{key}' "
|
812
|
+
f"not found. Available keys at this level: {available}"
|
813
|
+
) from None
|
814
|
+
|
815
|
+
return current
|
816
|
+
|
817
|
+
|
818
|
+
def _handle_dict_slots(
    dataset: ScverseDataStructures, slot: str
) -> tuple[pd.DataFrame | None, str | None, str | None]:
    """Turn a dict-based slot (`uns` / `attrs`) into a single-row DataFrame.

    Two slot patterns are understood:

    - Direct dict access: "uns", "attrs", "uns:key1:key2", "attrs:key"
    - Modality dict access: "modality:uns", "modality:uns:key1:key2"

    Args:
        dataset: The scverse datastructure object.
        slot: The slot path string to parse like 'uns:path:to'.

    Returns:
        A tuple `(dataframe, modality_key, remaining_slot_path)`:

        - dataframe: Single-row DataFrame built from the resolved value.
        - modality_key: The modality name for the modality pattern, else None.
        - remaining_slot_path: For the direct pattern, the dict attribute name if
          the slot has no nested keys, otherwise only the nested keys joined by ":".
          For the modality pattern, the dict attribute name followed by any nested keys.

        `(None, None, None)` is returned if the dict attribute is missing or None,
        or if a `KeyError` / `AttributeError` occurs while handling the modality pattern.

    Raises:
        InvalidArgument: If `slot` matches neither pattern.
    """
    parts = slot.split(":")
    head = parts[0]

    # Direct pattern: the dict lives on the dataset itself
    if head in ["uns", "attrs"]:
        container = getattr(dataset, head, None)
        if container is None:
            return None, None, None
        if len(parts) == 1:
            return pd.DataFrame([container]), None, head
        nested_keys = parts[1:]
        resolved = _resolve_schema_slot_path(container, nested_keys, slot, head)
        return pd.DataFrame([resolved]), None, ":".join(nested_keys)

    # Anything else has to follow the modality pattern
    if len(parts) < 2 or parts[1] not in ["uns", "attrs"]:
        raise InvalidArgument(
            f"Invalid dict slot pattern '{slot}'. Expected formats: "
            f"'uns', 'attrs', 'uns:key', 'attrs:key', 'modality:uns'"
        )

    modality = parts[0]
    dict_name = parts[1]
    try:
        container = getattr(dataset[modality], dict_name, None)
        if container is not None:
            if len(parts) == 2:
                return pd.DataFrame([container]), modality, dict_name
            resolved = _resolve_schema_slot_path(
                container, parts[2:], slot, f"{modality}.{dict_name}"
            )
            return pd.DataFrame([resolved]), modality, ":".join(parts[1:])
    except (KeyError, AttributeError):
        # unknown modality or missing attribute: reported as "nothing found"
        pass

    return None, None, None
|
876
|
+
|
877
|
+
|
878
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
698
879
|
class AnnDataCurator(SlotsCurator):
|
699
880
|
"""Curator for `AnnData`.
|
700
881
|
|
882
|
+
{}
|
883
|
+
|
701
884
|
Args:
|
702
885
|
dataset: The AnnData-like object to validate & annotate.
|
703
886
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
@@ -722,32 +905,35 @@ class AnnDataCurator(SlotsCurator):
|
|
722
905
|
raise InvalidArgument("dataset must be AnnData-like.")
|
723
906
|
if schema.otype != "AnnData":
|
724
907
|
raise InvalidArgument("Schema otype must be 'AnnData'.")
|
725
|
-
|
726
|
-
|
727
|
-
|
908
|
+
|
909
|
+
for slot, slot_schema in schema.slots.items():
|
910
|
+
if slot not in {"var", "var.T", "obs"} and not slot.startswith("uns"):
|
911
|
+
raise ValueError(
|
912
|
+
f"AnnDataCurator currently only supports the slots 'var', 'var.T', 'obs', and 'uns', not {slot}"
|
913
|
+
)
|
914
|
+
if slot.startswith("uns"):
|
915
|
+
df, _, _ = _handle_dict_slots(self._dataset, slot)
|
916
|
+
elif slot in {"obs", "var", "var.T"}:
|
917
|
+
df = (
|
728
918
|
getattr(self._dataset, slot.strip(".T")).T
|
729
919
|
if slot == "var.T"
|
730
920
|
or (
|
731
|
-
# backward compat
|
732
921
|
slot == "var"
|
733
922
|
and schema.slots["var"].itype not in {None, "Feature"}
|
734
923
|
)
|
735
924
|
else getattr(self._dataset, slot)
|
736
|
-
)
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
"var"
|
749
|
-
].cat._cat_vectors.pop("columns")
|
750
|
-
self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
|
925
|
+
)
|
926
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
|
927
|
+
|
928
|
+
# Handle var index naming for backward compat
|
929
|
+
if slot == "var" and schema.slots["var"].itype not in {None, "Feature"}:
|
930
|
+
logger.warning(
|
931
|
+
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
932
|
+
)
|
933
|
+
self._slots["var"].cat._cat_vectors["var_index"] = self._slots[
|
934
|
+
"var"
|
935
|
+
].cat._cat_vectors.pop("columns")
|
936
|
+
self._slots["var"].cat._cat_vectors["var_index"]._key = "var_index"
|
751
937
|
|
752
938
|
|
753
939
|
def _assign_var_fields_categoricals_multimodal(
|
@@ -757,11 +943,10 @@ def _assign_var_fields_categoricals_multimodal(
|
|
757
943
|
slot_schema: Schema,
|
758
944
|
var_fields: dict[str, FieldAttr],
|
759
945
|
cat_vectors: dict[str, dict[str, CatVector]],
|
760
|
-
slots: dict[str,
|
946
|
+
slots: dict[str, ComponentCurator],
|
761
947
|
) -> None:
|
762
948
|
"""Assigns var_fields and categoricals for multimodal data curators."""
|
763
949
|
if modality is not None:
|
764
|
-
# Makes sure that all tables are present
|
765
950
|
var_fields[modality] = None
|
766
951
|
cat_vectors[modality] = {}
|
767
952
|
|
@@ -782,15 +967,17 @@ def _assign_var_fields_categoricals_multimodal(
|
|
782
967
|
cat_vectors[modality] = obs_fields
|
783
968
|
|
784
969
|
|
970
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
785
971
|
class MuDataCurator(SlotsCurator):
|
786
972
|
"""Curator for `MuData`.
|
787
973
|
|
974
|
+
{}
|
975
|
+
|
788
976
|
Args:
|
789
977
|
dataset: The MuData-like object to validate & annotate.
|
790
978
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
791
979
|
|
792
980
|
Example:
|
793
|
-
|
794
981
|
.. literalinclude:: scripts/curate_mudata.py
|
795
982
|
:language: python
|
796
983
|
:caption: curate_mudata.py
|
@@ -811,12 +998,32 @@ class MuDataCurator(SlotsCurator):
|
|
811
998
|
raise InvalidArgument("Schema otype must be 'MuData'.")
|
812
999
|
|
813
1000
|
for slot, slot_schema in schema.slots.items():
|
814
|
-
|
815
|
-
|
816
|
-
|
1001
|
+
# Handle slots: "mdata.uns", "modality:uns"
|
1002
|
+
if "uns" in slot:
|
1003
|
+
df, modality, modality_slot = _handle_dict_slots(self._dataset, slot)
|
817
1004
|
else:
|
818
|
-
|
819
|
-
|
1005
|
+
# Handle slots: "modality:obs", "modality:var"
|
1006
|
+
parts = slot.split(":")
|
1007
|
+
if len(parts) == 2:
|
1008
|
+
modality, modality_slot = parts
|
1009
|
+
try:
|
1010
|
+
schema_dataset = self._dataset[modality]
|
1011
|
+
df = getattr(schema_dataset, modality_slot.rstrip(".T"))
|
1012
|
+
except KeyError:
|
1013
|
+
raise InvalidArgument(
|
1014
|
+
f"Modality '{modality}' not found in MuData"
|
1015
|
+
) from None
|
1016
|
+
except AttributeError:
|
1017
|
+
raise InvalidArgument(
|
1018
|
+
f"Attribute '{modality_slot}' not found on modality '{modality}'"
|
1019
|
+
) from None
|
1020
|
+
else:
|
1021
|
+
# Handle slots: "mdata:obs", "mdata:var" (uns is a dictionary and gets handled above)
|
1022
|
+
modality, modality_slot = None, slot
|
1023
|
+
schema_dataset = self._dataset
|
1024
|
+
df = getattr(schema_dataset, modality_slot.rstrip(".T"))
|
1025
|
+
|
1026
|
+
# Transpose var if necessary
|
820
1027
|
if modality_slot == "var" and schema.slots[slot].itype not in {
|
821
1028
|
None,
|
822
1029
|
"Feature",
|
@@ -824,19 +1031,12 @@ class MuDataCurator(SlotsCurator):
|
|
824
1031
|
logger.warning(
|
825
1032
|
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
826
1033
|
)
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
modality_slot == "var"
|
834
|
-
and schema.slots[slot].itype not in {None, "Feature"}
|
835
|
-
)
|
836
|
-
else getattr(schema_dataset, modality_slot)
|
837
|
-
),
|
838
|
-
slot_schema,
|
839
|
-
)
|
1034
|
+
df = df.T
|
1035
|
+
elif modality_slot == "var.T":
|
1036
|
+
df = df.T
|
1037
|
+
|
1038
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot=slot)
|
1039
|
+
|
840
1040
|
_assign_var_fields_categoricals_multimodal(
|
841
1041
|
modality=modality,
|
842
1042
|
slot_type=modality_slot,
|
@@ -846,18 +1046,21 @@ class MuDataCurator(SlotsCurator):
|
|
846
1046
|
cat_vectors=self._cat_vectors,
|
847
1047
|
slots=self._slots,
|
848
1048
|
)
|
1049
|
+
|
849
1050
|
self._columns_field = self._var_fields
|
850
1051
|
|
851
1052
|
|
1053
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
852
1054
|
class SpatialDataCurator(SlotsCurator):
|
853
1055
|
"""Curator for `SpatialData`.
|
854
1056
|
|
1057
|
+
{}
|
1058
|
+
|
855
1059
|
Args:
|
856
1060
|
dataset: The SpatialData-like object to validate & annotate.
|
857
1061
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
858
1062
|
|
859
1063
|
Example:
|
860
|
-
|
861
1064
|
.. literalinclude:: scripts/curate_spatialdata.py
|
862
1065
|
:language: python
|
863
1066
|
:caption: curate_spatialdata.py
|
@@ -878,69 +1081,75 @@ class SpatialDataCurator(SlotsCurator):
|
|
878
1081
|
raise InvalidArgument("Schema otype must be 'SpatialData'.")
|
879
1082
|
|
880
1083
|
for slot, slot_schema in schema.slots.items():
|
881
|
-
|
882
|
-
if (
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
1084
|
+
# Handle slots: "sdata:attrs"
|
1085
|
+
if slot.startswith("attrs"):
|
1086
|
+
df, table_key, table_slot = _handle_dict_slots(self._dataset, slot)
|
1087
|
+
else:
|
1088
|
+
parts = slot.split(":")
|
1089
|
+
# Handle slots: "tables:table_key:obs", "tables:table_key:var"
|
1090
|
+
if len(parts) == 3 and parts[0] == "tables":
|
1091
|
+
table_key, table_slot = parts[1], parts[2]
|
1092
|
+
try:
|
1093
|
+
slot_object = self._dataset.tables[table_key]
|
1094
|
+
df = getattr(slot_object, table_slot.rstrip(".T"))
|
1095
|
+
except KeyError:
|
1096
|
+
raise InvalidArgument(
|
1097
|
+
f"Table '{table_key}' not found in sdata.tables"
|
1098
|
+
) from None
|
1099
|
+
except AttributeError:
|
1100
|
+
raise InvalidArgument(
|
1101
|
+
f"Attribute '{table_slot}' not found on table '{table_key}'"
|
1102
|
+
) from None
|
890
1103
|
else:
|
891
|
-
|
892
|
-
|
893
|
-
if sub_slot == "var" and schema.slots[slot].itype not in {
|
894
|
-
None,
|
895
|
-
"Feature",
|
896
|
-
}:
|
897
|
-
logger.warning(
|
898
|
-
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
899
|
-
)
|
900
|
-
data_object = (
|
901
|
-
getattr(slot_object, sub_slot.rstrip(".T")).T
|
902
|
-
if sub_slot == "var.T"
|
903
|
-
or (
|
904
|
-
# backward compat
|
905
|
-
sub_slot == "var"
|
906
|
-
and schema.slots[slot].itype not in {None, "Feature"}
|
907
|
-
)
|
908
|
-
else getattr(slot_object, sub_slot)
|
909
|
-
)
|
910
|
-
elif len(split_result) == 1 or (
|
911
|
-
len(split_result) > 1 and split_result[0] == "attrs"
|
912
|
-
):
|
913
|
-
table_key = None
|
914
|
-
if len(split_result) == 1:
|
915
|
-
if split_result[0] != "attrs":
|
1104
|
+
# Handle legacy single keys for backward compatibility
|
1105
|
+
if len(parts) == 1 and parts[0] != "attrs":
|
916
1106
|
logger.warning(
|
917
1107
|
f"please prefix slot {slot} with 'attrs:' going forward"
|
918
1108
|
)
|
919
|
-
|
920
|
-
|
1109
|
+
try:
|
1110
|
+
df = pd.DataFrame([self._dataset.attrs[slot]])
|
1111
|
+
table_key = None
|
1112
|
+
table_slot = slot
|
1113
|
+
except KeyError:
|
1114
|
+
raise InvalidArgument(
|
1115
|
+
f"Slot '{slot}' not found in sdata.attrs"
|
1116
|
+
) from None
|
921
1117
|
else:
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
1118
|
+
raise InvalidArgument(f"Unrecognized slot format: {slot}")
|
1119
|
+
|
1120
|
+
# Handle var transposition logic
|
1121
|
+
if table_slot == "var" and schema.slots[slot].itype not in {
|
1122
|
+
None,
|
1123
|
+
"Feature",
|
1124
|
+
}:
|
1125
|
+
logger.warning(
|
1126
|
+
"auto-transposed `var` for backward compat, please indicate transposition in the schema definition by calling out `.T`: slots={'var.T': itype=bt.Gene.ensembl_gene_id}"
|
1127
|
+
)
|
1128
|
+
df = df.T
|
1129
|
+
elif table_slot == "var.T":
|
1130
|
+
df = df.T
|
1131
|
+
|
1132
|
+
self._slots[slot] = ComponentCurator(df, slot_schema, slot)
|
1133
|
+
|
929
1134
|
_assign_var_fields_categoricals_multimodal(
|
930
1135
|
modality=table_key,
|
931
|
-
slot_type=
|
1136
|
+
slot_type=table_slot,
|
932
1137
|
slot=slot,
|
933
1138
|
slot_schema=slot_schema,
|
934
1139
|
var_fields=self._var_fields,
|
935
1140
|
cat_vectors=self._cat_vectors,
|
936
1141
|
slots=self._slots,
|
937
1142
|
)
|
1143
|
+
|
938
1144
|
self._columns_field = self._var_fields
|
939
1145
|
|
940
1146
|
|
1147
|
+
@doc_args(SLOTS_DETAILS_DOCSTRING)
|
941
1148
|
class TiledbsomaExperimentCurator(SlotsCurator):
|
942
1149
|
"""Curator for `tiledbsoma.Experiment`.
|
943
1150
|
|
1151
|
+
{}
|
1152
|
+
|
944
1153
|
Args:
|
945
1154
|
dataset: The `tiledbsoma.Experiment` object.
|
946
1155
|
schema: A :class:`~lamindb.Schema` object that defines the validation constraints.
|
@@ -977,7 +1186,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
|
|
977
1186
|
.drop("soma_joinid", axis=1, errors="ignore")
|
978
1187
|
)
|
979
1188
|
|
980
|
-
self._slots[slot] =
|
1189
|
+
self._slots[slot] = ComponentCurator(
|
981
1190
|
(schema_dataset.T if modality_slot == "var.T" else schema_dataset),
|
982
1191
|
slot_schema,
|
983
1192
|
)
|
@@ -990,7 +1199,7 @@ class TiledbsomaExperimentCurator(SlotsCurator):
|
|
990
1199
|
.to_pandas()
|
991
1200
|
.drop(["soma_joinid", "obs_id"], axis=1, errors="ignore")
|
992
1201
|
)
|
993
|
-
self._slots[slot] =
|
1202
|
+
self._slots[slot] = ComponentCurator(
|
994
1203
|
schema_dataset,
|
995
1204
|
slot_schema,
|
996
1205
|
)
|
@@ -1040,9 +1249,12 @@ class CatVector:
|
|
1040
1249
|
self._maximal_set = maximal_set
|
1041
1250
|
|
1042
1251
|
self._all_filters = {"source": self._source, "organism": self._organism}
|
1252
|
+
|
1043
1253
|
if self._subtype_str and "=" in self._subtype_str:
|
1044
1254
|
self._all_filters.update(
|
1045
|
-
resolve_relation_filters(
|
1255
|
+
resolve_relation_filters(
|
1256
|
+
parse_filter_string(self._subtype_str), self._field.field.model
|
1257
|
+
) # type: ignore
|
1046
1258
|
)
|
1047
1259
|
|
1048
1260
|
if hasattr(field.field.model, "_name_field"):
|
@@ -1241,7 +1453,7 @@ class CatVector:
|
|
1241
1453
|
type_record = registry.get(name=self._subtype_str)
|
1242
1454
|
if df is not None and registry == Feature:
|
1243
1455
|
nonval_columns = Feature.inspect(df.columns, mute=True).non_validated
|
1244
|
-
non_validated_records = Feature.
|
1456
|
+
non_validated_records = Feature.from_dataframe(df.loc[:, nonval_columns])
|
1245
1457
|
else:
|
1246
1458
|
if (
|
1247
1459
|
self._organism
|
@@ -1343,7 +1555,7 @@ class CatVector:
|
|
1343
1555
|
warning_message += "\n for remaining terms:\n"
|
1344
1556
|
warning_message += f" → fix typos, remove non-existent values, or save terms via: {colors.cyan(non_validated_hint_print)}"
|
1345
1557
|
if self._subtype_query_set is not None:
|
1346
|
-
warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.
|
1558
|
+
warning_message += f"\n → a valid label for subtype '{self._subtype_str}' has to be one of {self._subtype_query_set.to_list('name')}"
|
1347
1559
|
logger.info(f'mapping "{self._key}" on {colors.italic(model_field)}')
|
1348
1560
|
logger.warning(warning_message)
|
1349
1561
|
if self._cat_manager is not None:
|
@@ -1493,6 +1705,30 @@ class DataFrameCatManager:
|
|
1493
1705
|
"""The categorical features."""
|
1494
1706
|
return self._categoricals
|
1495
1707
|
|
1708
|
+
def __repr__(self) -> str:
|
1709
|
+
cls_name = colors.green(self.__class__.__name__)
|
1710
|
+
|
1711
|
+
status_str = (
|
1712
|
+
f"{colors.green('validated')}"
|
1713
|
+
if self._is_validated
|
1714
|
+
else f"{colors.yellow('unvalidated')}"
|
1715
|
+
)
|
1716
|
+
|
1717
|
+
info_parts = []
|
1718
|
+
|
1719
|
+
cat_count = len(self._categoricals)
|
1720
|
+
if cat_count > 0:
|
1721
|
+
info_parts.append(f"categorical_features={cat_count}")
|
1722
|
+
|
1723
|
+
if self._slot:
|
1724
|
+
info_parts.append(f"slot: {colors.italic(self._slot)}")
|
1725
|
+
|
1726
|
+
info_str = ", ".join(info_parts)
|
1727
|
+
if info_str:
|
1728
|
+
return f"{cls_name}({info_str}, {status_str})"
|
1729
|
+
else:
|
1730
|
+
return f"{cls_name}({status_str})"
|
1731
|
+
|
1496
1732
|
def lookup(self, public: bool = False) -> CatLookup:
|
1497
1733
|
"""Lookup categories.
|
1498
1734
|
|
@@ -1537,7 +1773,9 @@ class DataFrameCatManager:
|
|
1537
1773
|
key: The key referencing the column in the DataFrame to standardize.
|
1538
1774
|
"""
|
1539
1775
|
if self._artifact is not None:
|
1540
|
-
raise RuntimeError(
|
1776
|
+
raise RuntimeError(
|
1777
|
+
"Cannot mutate the dataset when an artifact is passed! Please load the dataset into memory using `dataset.load()` and pass it to a curator."
|
1778
|
+
)
|
1541
1779
|
|
1542
1780
|
if key == "all":
|
1543
1781
|
logger.warning(
|
@@ -1610,7 +1848,7 @@ def get_organism_kwargs(
|
|
1610
1848
|
def annotate_artifact(
|
1611
1849
|
artifact: Artifact,
|
1612
1850
|
*,
|
1613
|
-
curator:
|
1851
|
+
curator: SlotsCurator | None = None,
|
1614
1852
|
cat_vectors: dict[str, CatVector] | None = None,
|
1615
1853
|
) -> Artifact:
|
1616
1854
|
from .. import settings
|
@@ -1643,7 +1881,9 @@ def annotate_artifact(
|
|
1643
1881
|
)
|
1644
1882
|
|
1645
1883
|
# annotate with inferred schemas aka feature sets
|
1646
|
-
if
|
1884
|
+
if (
|
1885
|
+
artifact.otype == "DataFrame" and getattr(curator, "_schema", None) is None
|
1886
|
+
): # Prevent overwriting user-defined schemas that contain slots
|
1647
1887
|
features = cat_vectors["columns"].records
|
1648
1888
|
if features is not None:
|
1649
1889
|
index_feature = artifact.schema.index
|
@@ -1663,7 +1903,11 @@ def annotate_artifact(
|
|
1663
1903
|
logger.important(
|
1664
1904
|
f"not annotating with {len(features)} features as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
|
1665
1905
|
)
|
1666
|
-
itype =
|
1906
|
+
itype = (
|
1907
|
+
Feature.name
|
1908
|
+
if artifact.schema.itype == "Composite"
|
1909
|
+
else parse_cat_dtype(artifact.schema.itype, is_itype=True)["field"]
|
1910
|
+
)
|
1667
1911
|
feature_set = Schema(itype=itype, n=len(features))
|
1668
1912
|
artifact.feature_sets.add(
|
1669
1913
|
feature_set.save(), through_defaults={"slot": "columns"}
|
@@ -1698,9 +1942,13 @@ def annotate_artifact(
|
|
1698
1942
|
logger.important(
|
1699
1943
|
f"not annotating with {len(features)} features for slot {slot} as it exceeds {settings.annotation.n_max_records} (ln.settings.annotation.n_max_records)"
|
1700
1944
|
)
|
1701
|
-
itype =
|
1702
|
-
|
1703
|
-
|
1945
|
+
itype = (
|
1946
|
+
Feature.name
|
1947
|
+
if artifact.schema.slots[slot].itype == "Composite"
|
1948
|
+
else parse_cat_dtype(
|
1949
|
+
artifact.schema.slots[slot].itype, is_itype=True
|
1950
|
+
)["field"]
|
1951
|
+
)
|
1704
1952
|
feature_set = Schema(itype=itype, n=len(features))
|
1705
1953
|
artifact.feature_sets.add(
|
1706
1954
|
feature_set.save(), through_defaults={"slot": slot}
|