lamindb 0.77.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. lamindb/__init__.py +39 -32
  2. lamindb/_artifact.py +95 -64
  3. lamindb/_can_curate.py +13 -6
  4. lamindb/_collection.py +51 -49
  5. lamindb/_feature.py +9 -9
  6. lamindb/_finish.py +92 -79
  7. lamindb/_from_values.py +13 -10
  8. lamindb/_is_versioned.py +2 -1
  9. lamindb/_parents.py +23 -16
  10. lamindb/_query_manager.py +3 -3
  11. lamindb/_query_set.py +85 -18
  12. lamindb/_record.py +114 -41
  13. lamindb/_run.py +3 -3
  14. lamindb/_save.py +5 -6
  15. lamindb/{_feature_set.py → _schema.py} +34 -31
  16. lamindb/_storage.py +2 -1
  17. lamindb/_transform.py +51 -23
  18. lamindb/_ulabel.py +17 -8
  19. lamindb/_view.py +13 -13
  20. lamindb/base/__init__.py +24 -0
  21. lamindb/base/fields.py +281 -0
  22. lamindb/base/ids.py +103 -0
  23. lamindb/base/types.py +51 -0
  24. lamindb/base/users.py +30 -0
  25. lamindb/base/validation.py +67 -0
  26. lamindb/core/__init__.py +18 -15
  27. lamindb/core/_context.py +295 -224
  28. lamindb/core/_data.py +44 -49
  29. lamindb/core/_describe.py +41 -31
  30. lamindb/core/_django.py +29 -27
  31. lamindb/core/_feature_manager.py +130 -129
  32. lamindb/core/_label_manager.py +7 -8
  33. lamindb/core/_mapped_collection.py +17 -14
  34. lamindb/core/_settings.py +1 -12
  35. lamindb/core/_sync_git.py +56 -9
  36. lamindb/core/_track_environment.py +1 -1
  37. lamindb/core/datasets/_core.py +5 -6
  38. lamindb/core/exceptions.py +0 -7
  39. lamindb/core/fields.py +1 -1
  40. lamindb/core/loaders.py +0 -1
  41. lamindb/core/{schema.py → relations.py} +22 -19
  42. lamindb/core/storage/_anndata_accessor.py +1 -2
  43. lamindb/core/storage/_backed_access.py +2 -1
  44. lamindb/core/storage/_tiledbsoma.py +38 -13
  45. lamindb/core/storage/objects.py +1 -1
  46. lamindb/core/storage/paths.py +13 -8
  47. lamindb/core/subsettings/__init__.py +0 -2
  48. lamindb/core/types.py +2 -23
  49. lamindb/core/versioning.py +11 -7
  50. lamindb/{_curate.py → curators/__init__.py} +122 -23
  51. lamindb/curators/_spatial.py +528 -0
  52. lamindb/integrations/_vitessce.py +1 -3
  53. lamindb/migrations/0052_squashed.py +1261 -0
  54. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +57 -0
  55. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +35 -0
  56. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +61 -0
  57. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +22 -0
  58. lamindb/migrations/0057_link_models_latest_report_and_others.py +356 -0
  59. lamindb/migrations/0058_artifact__actions_collection__actions.py +22 -0
  60. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +31 -0
  61. lamindb/migrations/0060_alter_artifact__actions.py +22 -0
  62. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +45 -0
  63. lamindb/migrations/0062_add_is_latest_field.py +32 -0
  64. lamindb/migrations/0063_populate_latest_field.py +45 -0
  65. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +33 -0
  66. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +22 -0
  67. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +352 -0
  68. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +20 -0
  69. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +20 -0
  70. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +1294 -0
  71. lamindb/migrations/0069_squashed.py +1770 -0
  72. lamindb/migrations/0070_lamindbv1_migrate_data.py +78 -0
  73. lamindb/migrations/0071_lamindbv1_migrate_schema.py +741 -0
  74. lamindb/migrations/0072_remove_user__branch_code_remove_user_aux_and_more.py +148 -0
  75. lamindb/migrations/0073_merge_ourprojects.py +945 -0
  76. lamindb/migrations/0074_lamindbv1_part4.py +374 -0
  77. lamindb/migrations/0075_lamindbv1_part5.py +276 -0
  78. lamindb/migrations/0076_lamindbv1_part6.py +621 -0
  79. lamindb/migrations/0077_lamindbv1_part6b.py +228 -0
  80. lamindb/migrations/0078_lamindbv1_part6c.py +468 -0
  81. lamindb/migrations/0079_alter_rundata_value_json_and_more.py +36 -0
  82. lamindb/migrations/__init__.py +0 -0
  83. lamindb/models.py +4064 -0
  84. {lamindb-0.77.3.dist-info → lamindb-1.0.0.dist-info}/METADATA +13 -19
  85. lamindb-1.0.0.dist-info/RECORD +100 -0
  86. {lamindb-0.77.3.dist-info → lamindb-1.0.0.dist-info}/WHEEL +1 -1
  87. lamindb/core/subsettings/_transform_settings.py +0 -21
  88. lamindb-0.77.3.dist-info/RECORD +0 -63
  89. {lamindb-0.77.3.dist-info → lamindb-1.0.0.dist-info}/LICENSE +0 -0
@@ -14,34 +14,16 @@ from anndata import AnnData
14
14
  from django.contrib.postgres.aggregates import ArrayAgg
15
15
  from django.db import connections
16
16
  from django.db.models import Aggregate
17
- from lamin_utils import colors, logger
17
+ from lamin_utils import logger
18
18
  from lamindb_setup.core.hashing import hash_set
19
19
  from lamindb_setup.core.upath import create_path
20
- from lnschema_core.models import (
21
- Artifact,
22
- Collection,
23
- Feature,
24
- FeatureManager,
25
- FeatureValue,
26
- LinkORM,
27
- Param,
28
- ParamManager,
29
- ParamManagerArtifact,
30
- ParamManagerRun,
31
- ParamValue,
32
- Record,
33
- Run,
34
- ULabel,
35
- )
36
20
  from rich.table import Column, Table
37
21
  from rich.text import Text
38
22
 
39
23
  from lamindb._feature import (
40
- FEATURE_DTYPES,
41
24
  convert_pandas_dtype_to_lamin_dtype,
42
25
  suggest_categorical_for_str_iterable,
43
26
  )
44
- from lamindb._feature_set import DICT_KEYS_TYPE, FeatureSet
45
27
  from lamindb._from_values import _format_values
46
28
  from lamindb._record import (
47
29
  REGISTRY_UNIQUE_FIELD,
@@ -50,8 +32,25 @@ from lamindb._record import (
50
32
  transfer_to_default_db,
51
33
  )
52
34
  from lamindb._save import save
35
+ from lamindb._schema import DICT_KEYS_TYPE, Schema
53
36
  from lamindb.core.exceptions import DoesNotExist, ValidationError
54
37
  from lamindb.core.storage import LocalPathClasses
38
+ from lamindb.models import (
39
+ Artifact,
40
+ Collection,
41
+ Feature,
42
+ FeatureManager,
43
+ FeatureValue,
44
+ LinkORM,
45
+ Param,
46
+ ParamManager,
47
+ ParamManagerArtifact,
48
+ ParamManagerRun,
49
+ ParamValue,
50
+ Record,
51
+ Run,
52
+ ULabel,
53
+ )
55
54
 
56
55
  from ._describe import (
57
56
  NAME_WIDTH,
@@ -63,15 +62,15 @@ from ._describe import (
63
62
  from ._django import get_artifact_with_related
64
63
  from ._label_manager import _get_labels, describe_labels
65
64
  from ._settings import settings
66
- from .schema import (
65
+ from .relations import (
67
66
  dict_related_model_to_related_name,
68
67
  )
69
68
 
70
69
  if TYPE_CHECKING:
71
- from lnschema_core.types import FieldAttr
72
70
  from rich.tree import Tree
73
71
 
74
72
  from lamindb._query_set import QuerySet
73
+ from lamindb.base.types import FieldAttr
75
74
 
76
75
 
77
76
  def get_host_id_field(host: Artifact | Collection) -> str:
@@ -84,7 +83,7 @@ def get_host_id_field(host: Artifact | Collection) -> str:
84
83
 
85
84
  def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
86
85
  dictionary = {
87
- field.related_model.__get_name_with_schema__(): field.name
86
+ field.related_model.__get_name_with_module__(): field.name
88
87
  for field in host._meta.related_objects
89
88
  }
90
89
  dictionary["Feature"] = "features"
@@ -92,25 +91,25 @@ def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
92
91
  return dictionary
93
92
 
94
93
 
95
- def get_feature_set_by_slot_(host: Artifact | Collection) -> dict:
94
+ def get_schema_by_slot_(host: Artifact | Collection) -> dict:
96
95
  if isinstance(host, Collection):
97
96
  return {}
98
97
  # if the host is not yet saved
99
98
  if host._state.adding:
100
- if hasattr(host, "_feature_sets"):
101
- return host._feature_sets
99
+ if hasattr(host, "_staged__schemas_m2m"):
100
+ return host._staged__schemas_m2m
102
101
  else:
103
102
  return {}
104
103
  host_db = host._state.db
105
104
  host_id_field = get_host_id_field(host)
106
105
  kwargs = {host_id_field: host.id}
107
106
  # otherwise, we need a query
108
- links_feature_set = (
109
- host.feature_sets.through.objects.using(host_db)
107
+ links_schema = (
108
+ host._schemas_m2m.through.objects.using(host_db)
110
109
  .filter(**kwargs)
111
- .select_related("featureset")
110
+ .select_related("schema")
112
111
  )
113
- return {fsl.slot: fsl.featureset for fsl in links_feature_set}
112
+ return {fsl.slot: fsl.schema for fsl in links_schema}
114
113
 
115
114
 
116
115
  def get_label_links(
@@ -126,11 +125,11 @@ def get_label_links(
126
125
  return link_records
127
126
 
128
127
 
129
- def get_feature_set_links(host: Artifact | Collection) -> QuerySet:
128
+ def get_schema_links(host: Artifact | Collection) -> QuerySet:
130
129
  host_id_field = get_host_id_field(host)
131
130
  kwargs = {host_id_field: host.id}
132
- links_feature_set = host.feature_sets.through.objects.filter(**kwargs)
133
- return links_feature_set
131
+ links_schema = host._schemas_m2m.through.objects.filter(**kwargs)
132
+ return links_schema
134
133
 
135
134
 
136
135
  def get_link_attr(link: LinkORM | type[LinkORM], data: Artifact | Collection) -> str:
@@ -270,15 +269,15 @@ def _get_non_categoricals(
270
269
  return non_categoricals
271
270
 
272
271
 
273
- def _get_featuresets_postgres(
272
+ def _get_schemas_postgres(
274
273
  self: Artifact | Collection,
275
274
  related_data: dict | None = None,
276
275
  ) -> dict:
277
276
  if not related_data:
278
- artifact_meta = get_artifact_with_related(self, include_featureset=True)
277
+ artifact_meta = get_artifact_with_related(self, include_schema=True)
279
278
  related_data = artifact_meta.get("related_data", {})
280
279
 
281
- fs_data = related_data.get("featuresets", {}) if related_data else {}
280
+ fs_data = related_data.get("schemas", {}) if related_data else {}
282
281
  return fs_data
283
282
 
284
283
 
@@ -326,35 +325,35 @@ def describe_features(
326
325
  return dictionary if to_dict else tree
327
326
 
328
327
  # feature sets
329
- feature_set_data: dict[str, tuple[str, list[str]]] = {}
328
+ schema_data: dict[str, tuple[str, list[str]]] = {}
330
329
  feature_data: dict[str, tuple[str, list[str]]] = {}
331
330
  if not print_params and not to_dict:
332
331
  if self.id is not None and connections[self._state.db].vendor == "postgresql":
333
- fs_data = _get_featuresets_postgres(self, related_data=related_data)
332
+ fs_data = _get_schemas_postgres(self, related_data=related_data)
334
333
  for fs_id, (slot, data) in fs_data.items():
335
334
  for registry_str, feature_names in data.items():
336
- feature_set = FeatureSet.objects.using(self._state.db).get(id=fs_id)
337
- feature_set_data[slot] = (feature_set, feature_names)
335
+ schema = Schema.objects.using(self._state.db).get(id=fs_id)
336
+ schema_data[slot] = (schema, feature_names)
338
337
  for feature_name in feature_names:
339
338
  feature_data[feature_name] = (slot, registry_str)
340
339
  else:
341
- for slot, feature_set in get_feature_set_by_slot_(self).items():
342
- features = feature_set.members
340
+ for slot, schema in get_schema_by_slot_(self).items():
341
+ features = schema.members
343
342
  # features.first() is a lot slower than features[0] here
344
343
  name_field = get_name_field(features[0])
345
344
  feature_names = list(features.values_list(name_field, flat=True)[:20])
346
- feature_set_data[slot] = (feature_set, feature_names)
345
+ schema_data[slot] = (schema, feature_names)
347
346
  for feature_name in feature_names:
348
- feature_data[feature_name] = (slot, feature_set.registry)
347
+ feature_data[feature_name] = (slot, schema.itype)
349
348
 
350
349
  internal_feature_names: dict[str, str] = {}
351
350
  if isinstance(self, Artifact):
352
- feature_sets = self.feature_sets.filter(registry="Feature").all()
351
+ _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
353
352
  internal_feature_names = {}
354
- if len(feature_sets) > 0:
355
- for feature_set in feature_sets:
353
+ if len(_schemas_m2m) > 0:
354
+ for schema in _schemas_m2m:
356
355
  internal_feature_names.update(
357
- dict(feature_set.members.values_list("name", "dtype"))
356
+ dict(schema.members.values_list("name", "dtype"))
358
357
  )
359
358
 
360
359
  # categorical feature values
@@ -417,7 +416,7 @@ def describe_features(
417
416
  internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
418
417
 
419
418
  int_features_tree_children = []
420
- for slot, (feature_set, feature_names) in feature_set_data.items():
419
+ for slot, (schema, feature_names) in schema_data.items():
421
420
  if slot in internal_feature_labels_slot:
422
421
  # add internal Feature features with labels
423
422
  feature_rows = internal_feature_labels_slot[slot]
@@ -440,7 +439,7 @@ def describe_features(
440
439
  str(
441
440
  internal_feature_names.get(feature_name)
442
441
  if feature_name in internal_feature_names
443
- else feature_set.dtype
442
+ else schema.dtype
444
443
  ),
445
444
  style="dim",
446
445
  ),
@@ -454,9 +453,9 @@ def describe_features(
454
453
  Text.assemble(
455
454
  (slot, "violet"),
456
455
  (" • ", "dim"),
457
- (str(feature_set.n), "pink1"),
456
+ (str(schema.n), "pink1"),
458
457
  ),
459
- Text.assemble((f"[{feature_set.registry}]", "pink1")),
458
+ Text.assemble((f"[{schema.itype}]", "pink1")),
460
459
  feature_rows,
461
460
  show_header=True,
462
461
  )
@@ -467,7 +466,7 @@ def describe_features(
467
466
  Text.assemble(
468
467
  ("Dataset features", "bold bright_magenta"),
469
468
  ("/", "dim"),
470
- (".feature_sets", "dim bold"),
469
+ ("._schemas_m2m", "dim bold"),
471
470
  )
472
471
  )
473
472
  for child in int_features_tree_children:
@@ -501,7 +500,7 @@ def describe_features(
501
500
  return tree
502
501
 
503
502
 
504
- def parse_feature_sets_from_anndata(
503
+ def parse_staged__schemas_m2m_from_anndata(
505
504
  adata: AnnData,
506
505
  var_field: FieldAttr | None = None,
507
506
  obs_field: FieldAttr = Feature.name,
@@ -525,11 +524,11 @@ def parse_feature_sets_from_anndata(
525
524
  if adata.X is None
526
525
  else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
527
526
  )
528
- feature_sets = {}
527
+ _schemas_m2m = {}
529
528
  if var_field is not None:
530
529
  logger.info("parsing feature names of X stored in slot 'var'")
531
530
  logger.indent = " "
532
- feature_set_var = FeatureSet.from_values(
531
+ schema_var = Schema.from_values(
533
532
  data_parse.var.index,
534
533
  var_field,
535
534
  type=type,
@@ -537,28 +536,28 @@ def parse_feature_sets_from_anndata(
537
536
  organism=organism,
538
537
  raise_validation_error=False,
539
538
  )
540
- if feature_set_var is not None:
541
- feature_sets["var"] = feature_set_var
542
- logger.save(f"linked: {feature_set_var}")
539
+ if schema_var is not None:
540
+ _schemas_m2m["var"] = schema_var
541
+ logger.save(f"linked: {schema_var}")
543
542
  logger.indent = ""
544
- if feature_set_var is None:
543
+ if schema_var is None:
545
544
  logger.warning("skip linking features to artifact in slot 'var'")
546
545
  if len(data_parse.obs.columns) > 0:
547
546
  logger.info("parsing feature names of slot 'obs'")
548
547
  logger.indent = " "
549
- feature_set_obs = FeatureSet.from_df(
548
+ schema_obs = Schema.from_df(
550
549
  df=data_parse.obs,
551
550
  field=obs_field,
552
551
  mute=mute,
553
552
  organism=organism,
554
553
  )
555
- if feature_set_obs is not None:
556
- feature_sets["obs"] = feature_set_obs
557
- logger.save(f"linked: {feature_set_obs}")
554
+ if schema_obs is not None:
555
+ _schemas_m2m["obs"] = schema_obs
556
+ logger.save(f"linked: {schema_obs}")
558
557
  logger.indent = ""
559
- if feature_set_obs is None:
558
+ if schema_obs is None:
560
559
  logger.warning("skip linking features to artifact in slot 'obs'")
561
- return feature_sets
560
+ return _schemas_m2m
562
561
 
563
562
 
564
563
  def is_valid_datetime_str(date_string: str) -> bool | str:
@@ -621,12 +620,12 @@ def infer_feature_type_convert_json(
621
620
  return ("list[cat ? str]", value, message)
622
621
  elif first_element_type == Record:
623
622
  return (
624
- f"list[cat[{first_element_type.__get_name_with_schema__()}]]",
623
+ f"list[cat[{first_element_type.__get_name_with_module__()}]]",
625
624
  value,
626
625
  message,
627
626
  )
628
627
  elif isinstance(value, Record):
629
- return (f"cat[{value.__class__.__get_name_with_schema__()}]", value, message)
628
+ return (f"cat[{value.__class__.__get_name_with_module__()}]", value, message)
630
629
  if not mute:
631
630
  logger.warning(f"cannot infer feature type of: {value}, returning '?")
632
631
  return "?", value, message
@@ -634,7 +633,7 @@ def infer_feature_type_convert_json(
634
633
 
635
634
  def __init__(self, host: Artifact | Collection | Run):
636
635
  self._host = host
637
- self._feature_set_by_slot_ = None
636
+ self._schema_by_slot_ = None
638
637
  self._accessor_by_registry_ = None
639
638
 
640
639
 
@@ -651,15 +650,15 @@ def get_values(self) -> dict[str, Any]:
651
650
 
652
651
 
653
652
  def __getitem__(self, slot) -> QuerySet:
654
- if slot not in self._feature_set_by_slot:
653
+ if slot not in self._schema_by_slot:
655
654
  raise ValueError(
656
655
  f"No linked feature set for slot: {slot}\nDid you get validation"
657
656
  " warnings? Only features that match registered features get validated"
658
657
  " and linked."
659
658
  )
660
- feature_set = self._feature_set_by_slot[slot]
661
- orm_name = feature_set.registry
662
- return getattr(feature_set, self._accessor_by_registry[orm_name]).all()
659
+ schema = self._schema_by_slot[slot]
660
+ orm_name = schema.itype
661
+ return getattr(schema, self._accessor_by_registry[orm_name]).all()
663
662
 
664
663
 
665
664
  def filter_base(cls, **expression):
@@ -749,11 +748,11 @@ def get(cls, **expression) -> Record:
749
748
 
750
749
 
751
750
  @property # type: ignore
752
- def _feature_set_by_slot(self):
751
+ def _schema_by_slot(self):
753
752
  """Feature sets by slot."""
754
- if self._feature_set_by_slot_ is None:
755
- self._feature_set_by_slot_ = get_feature_set_by_slot_(self._host)
756
- return self._feature_set_by_slot_
753
+ if self._schema_by_slot_ is None:
754
+ self._schema_by_slot_ = get_schema_by_slot_(self._host)
755
+ return self._schema_by_slot_
757
756
 
758
757
 
759
758
  @property # type: ignore
@@ -833,11 +832,11 @@ def _add_values(
833
832
  model_name = "Param" if is_param else "Feature"
834
833
  if is_param:
835
834
  if self._host.__class__ == Artifact:
836
- if self._host.type != "model":
835
+ if self._host.kind != "model":
837
836
  raise ValidationError("Can only set params for model-like artifacts.")
838
837
  else:
839
838
  if self._host.__class__ == Artifact:
840
- if self._host.type != "dataset" and self._host.type is not None:
839
+ if self._host.kind != "dataset" and self._host.kind is not None:
841
840
  raise ValidationError(
842
841
  "Can only set features for dataset-like artifacts."
843
842
  )
@@ -913,7 +912,7 @@ def _add_values(
913
912
  raise ValidationError(
914
913
  f"Please save {record} before annotation."
915
914
  )
916
- features_labels[record.__class__.__get_name_with_schema__()].append(
915
+ features_labels[record.__class__.__get_name_with_module__()].append(
917
916
  (feature, record)
918
917
  )
919
918
  else:
@@ -957,7 +956,7 @@ def _add_values(
957
956
  links = [
958
957
  LinkORM(
959
958
  **{
960
- f"{self._host.__class__.__get_name_with_schema__().lower()}_id": self._host.id,
959
+ f"{self._host.__class__.__get_name_with_module__().lower()}_id": self._host.id,
961
960
  valuefield_id: feature_value.id,
962
961
  }
963
962
  )
@@ -1030,14 +1029,14 @@ def remove_values(
1030
1029
  link_models_on_models = {
1031
1030
  getattr(
1032
1031
  Artifact, obj.related_name
1033
- ).through.__get_name_with_schema__(): obj.related_model.__get_name_with_schema__()
1032
+ ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
1034
1033
  for obj in Artifact._meta.related_objects
1035
- if obj.related_model.__get_name_with_schema__() == feature_registry
1034
+ if obj.related_model.__get_name_with_module__() == feature_registry
1036
1035
  }
1037
1036
  link_attribute = {
1038
1037
  obj.related_name
1039
1038
  for obj in Artifact._meta.related_objects
1040
- if obj.related_model.__get_name_with_schema__() in link_models_on_models
1039
+ if obj.related_model.__get_name_with_module__() in link_models_on_models
1041
1040
  }.pop()
1042
1041
  getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
1043
1042
  else:
@@ -1050,36 +1049,37 @@ def remove_values(
1050
1049
  # we can clean the FeatureValue registry periodically if we want to
1051
1050
 
1052
1051
 
1053
- def add_feature_set(self, feature_set: FeatureSet, slot: str) -> None:
1054
- """Curate artifact with a feature set.
1052
+ def add_schema(self, schema: Schema, slot: str) -> None:
1053
+ """Annotate artifact with a schema.
1055
1054
 
1056
1055
  Args:
1057
- feature_set: `FeatureSet` A feature set record.
1058
- slot: `str` The slot that marks where the feature set is stored in
1056
+ schema: `Schema` A schema record.
1057
+ slot: `str` The slot that marks where the schema is stored in
1059
1058
  the artifact.
1060
1059
  """
1060
+ # TODO: deprecate as soon as we have the Schema-based curators
1061
1061
  if self._host._state.adding:
1062
1062
  raise ValueError(
1063
1063
  "Please save the artifact or collection before adding a feature set!"
1064
1064
  )
1065
1065
  host_db = self._host._state.db
1066
- feature_set.save(using=host_db)
1066
+ schema.save(using=host_db)
1067
1067
  host_id_field = get_host_id_field(self._host)
1068
1068
  kwargs = {
1069
1069
  host_id_field: self._host.id,
1070
- "featureset": feature_set,
1070
+ "schema": schema,
1071
1071
  "slot": slot,
1072
1072
  }
1073
1073
  link_record = (
1074
- self._host.feature_sets.through.objects.using(host_db)
1074
+ self._host._schemas_m2m.through.objects.using(host_db)
1075
1075
  .filter(**kwargs)
1076
1076
  .one_or_none()
1077
1077
  )
1078
1078
  if link_record is None:
1079
- self._host.feature_sets.through(**kwargs).save(using=host_db)
1080
- if slot in self._feature_set_by_slot:
1079
+ self._host._schemas_m2m.through(**kwargs).save(using=host_db)
1080
+ if slot in self._schema_by_slot:
1081
1081
  logger.debug(f"replaced existing {slot} feature set")
1082
- self._feature_set_by_slot_[slot] = feature_set # type: ignore
1082
+ self._schema_by_slot_[slot] = schema # type: ignore
1083
1083
 
1084
1084
 
1085
1085
  def _add_set_from_df(
@@ -1090,18 +1090,18 @@ def _add_set_from_df(
1090
1090
  ):
1091
1091
  """Add feature set corresponding to column names of DataFrame."""
1092
1092
  if isinstance(self._host, Artifact):
1093
- assert self._host._accessor == "DataFrame" # noqa: S101
1093
+ assert self._host.otype == "DataFrame" # noqa: S101
1094
1094
  else:
1095
1095
  # Collection
1096
- assert self._host.artifact._accessor == "DataFrame" # noqa: S101
1096
+ assert self._host.artifact.otype == "DataFrame" # noqa: S101
1097
1097
  df = self._host.load()
1098
- feature_set = FeatureSet.from_df(
1098
+ schema = Schema.from_df(
1099
1099
  df=df,
1100
1100
  field=field,
1101
1101
  mute=mute,
1102
1102
  organism=organism,
1103
1103
  )
1104
- self._host._feature_sets = {"columns": feature_set}
1104
+ self._host._staged__schemas_m2m = {"columns": schema}
1105
1105
  self._host.save()
1106
1106
 
1107
1107
 
@@ -1114,13 +1114,13 @@ def _add_set_from_anndata(
1114
1114
  ):
1115
1115
  """Add features from AnnData."""
1116
1116
  if isinstance(self._host, Artifact):
1117
- assert self._host._accessor == "AnnData" # noqa: S101
1117
+ assert self._host.otype == "AnnData" # noqa: S101
1118
1118
  else:
1119
1119
  raise NotImplementedError()
1120
1120
 
1121
1121
  # parse and register features
1122
1122
  adata = self._host.load()
1123
- feature_sets = parse_feature_sets_from_anndata(
1123
+ _schemas_m2m = parse_staged__schemas_m2m_from_anndata(
1124
1124
  adata,
1125
1125
  var_field=var_field,
1126
1126
  obs_field=obs_field,
@@ -1129,7 +1129,7 @@ def _add_set_from_anndata(
1129
1129
  )
1130
1130
 
1131
1131
  # link feature sets
1132
- self._host._feature_sets = feature_sets
1132
+ self._host._staged__schemas_m2m = _schemas_m2m
1133
1133
  self._host.save()
1134
1134
 
1135
1135
 
@@ -1144,18 +1144,18 @@ def _add_set_from_mudata(
1144
1144
  if obs_fields is None:
1145
1145
  obs_fields = {}
1146
1146
  if isinstance(self._host, Artifact):
1147
- assert self._host._accessor == "MuData" # noqa: S101
1147
+ assert self._host.otype == "MuData" # noqa: S101
1148
1148
  else:
1149
1149
  raise NotImplementedError()
1150
1150
 
1151
1151
  # parse and register features
1152
1152
  mdata = self._host.load()
1153
- feature_sets = {}
1153
+ _schemas_m2m = {}
1154
1154
  obs_features = Feature.from_values(mdata.obs.columns)
1155
1155
  if len(obs_features) > 0:
1156
- feature_sets["obs"] = FeatureSet(features=obs_features)
1156
+ _schemas_m2m["obs"] = Schema(features=obs_features)
1157
1157
  for modality, field in var_fields.items():
1158
- modality_fs = parse_feature_sets_from_anndata(
1158
+ modality_fs = parse_staged__schemas_m2m_from_anndata(
1159
1159
  mdata[modality],
1160
1160
  var_field=field,
1161
1161
  obs_field=obs_fields.get(modality, Feature.name),
@@ -1163,22 +1163,22 @@ def _add_set_from_mudata(
1163
1163
  organism=organism,
1164
1164
  )
1165
1165
  for k, v in modality_fs.items():
1166
- feature_sets[f"['{modality}'].{k}"] = v
1166
+ _schemas_m2m[f"['{modality}'].{k}"] = v
1167
1167
 
1168
- def unify_feature_sets_by_hash(feature_sets):
1168
+ def unify_staged__schemas_m2m_by_hash(_schemas_m2m):
1169
1169
  unique_values = {}
1170
1170
 
1171
- for key, value in feature_sets.items():
1171
+ for key, value in _schemas_m2m.items():
1172
1172
  value_hash = value.hash # Assuming each value has a .hash attribute
1173
1173
  if value_hash in unique_values:
1174
- feature_sets[key] = unique_values[value_hash]
1174
+ _schemas_m2m[key] = unique_values[value_hash]
1175
1175
  else:
1176
1176
  unique_values[value_hash] = value
1177
1177
 
1178
- return feature_sets
1178
+ return _schemas_m2m
1179
1179
 
1180
1180
  # link feature sets
1181
- self._host._feature_sets = unify_feature_sets_by_hash(feature_sets)
1181
+ self._host._staged__schemas_m2m = unify_staged__schemas_m2m_by_hash(_schemas_m2m)
1182
1182
  self._host.save()
1183
1183
 
1184
1184
 
@@ -1188,8 +1188,8 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
1188
1188
  if transfer_logs is None:
1189
1189
  transfer_logs = {"mapped": [], "transferred": [], "run": None}
1190
1190
  using_key = settings._using_key
1191
- for slot, feature_set in data.features._feature_set_by_slot.items():
1192
- members = feature_set.members
1191
+ for slot, schema in data.features._schema_by_slot.items():
1192
+ members = schema.members
1193
1193
  if len(members) == 0:
1194
1194
  continue
1195
1195
  registry = members[0].__class__
@@ -1225,20 +1225,18 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
1225
1225
  save(new_members)
1226
1226
 
1227
1227
  # create a new feature set from feature values using the same uid
1228
- feature_set_self = FeatureSet.from_values(
1229
- member_uids, field=getattr(registry, field)
1230
- )
1231
- if feature_set_self is None:
1228
+ schema_self = Schema.from_values(member_uids, field=getattr(registry, field))
1229
+ if schema_self is None:
1232
1230
  if hasattr(registry, "organism_id"):
1233
1231
  logger.warning(
1234
- f"FeatureSet is not transferred, check if organism is set correctly: {feature_set}"
1232
+ f"Schema is not transferred, check if organism is set correctly: {schema}"
1235
1233
  )
1236
1234
  continue
1237
- # make sure the uid matches if featureset is composed of same features
1238
- if feature_set_self.hash == feature_set.hash:
1239
- feature_set_self.uid = feature_set.uid
1240
- logger.info(f"saving {slot} featureset: {feature_set_self}")
1241
- self._host.features.add_feature_set(feature_set_self, slot)
1235
+ # make sure the uid matches if schema is composed of same features
1236
+ if schema_self.hash == schema.hash:
1237
+ schema_self.uid = schema.uid
1238
+ logger.info(f"saving {slot} schema: {schema_self}")
1239
+ self._host.features.add_schema(schema_self, slot)
1242
1240
 
1243
1241
 
1244
1242
  def make_external(self, feature: Feature) -> None:
@@ -1250,8 +1248,8 @@ def make_external(self, feature: Feature) -> None:
1250
1248
  """
1251
1249
  if not isinstance(feature, Feature):
1252
1250
  raise TypeError("feature must be a Feature record!")
1253
- feature_sets = FeatureSet.filter(features=feature).all()
1254
- for fs in feature_sets:
1251
+ _schemas_m2m = Schema.filter(features=feature).all()
1252
+ for fs in _schemas_m2m:
1255
1253
  f = Feature.filter(uid=feature.uid).all()
1256
1254
  features_updated = fs.members.difference(f)
1257
1255
  if len(features_updated) > 0:
@@ -1261,13 +1259,14 @@ def make_external(self, feature: Feature) -> None:
1261
1259
  fs.n = len(features_updated)
1262
1260
  fs.save()
1263
1261
  # delete the link between the feature and the feature set
1264
- FeatureSet.features.through.objects.filter(
1265
- feature_id=feature.id, featureset_id=fs.id
1262
+ Schema.features.through.objects.filter(
1263
+ feature_id=feature.id, schema_id=fs.id
1266
1264
  ).delete()
1267
- # if no members are left in the featureset, delete it
1265
+ # if no members are left in the schema, delete it
1268
1266
  if len(features_updated) == 0:
1269
1267
  logger.warning(f"deleting empty feature set: {fs}")
1270
1268
  fs.artifacts.set([])
1269
+ fs._artifacts_m2m.set([])
1271
1270
  fs.delete()
1272
1271
 
1273
1272
 
@@ -1277,10 +1276,12 @@ FeatureManager.__repr__ = __repr__
1277
1276
  ParamManager.__repr__ = __repr__
1278
1277
  FeatureManager.__getitem__ = __getitem__
1279
1278
  FeatureManager.get_values = get_values
1280
- FeatureManager._feature_set_by_slot = _feature_set_by_slot
1279
+ FeatureManager._schema_by_slot = _schema_by_slot
1280
+ FeatureManager._feature_set_by_slot = _schema_by_slot
1281
1281
  FeatureManager._accessor_by_registry = _accessor_by_registry
1282
1282
  FeatureManager.add_values = add_values_features
1283
- FeatureManager.add_feature_set = add_feature_set
1283
+ FeatureManager.add_schema = add_schema
1284
+ FeatureManager.add_feature_set = add_schema # backward compat, will raise warning soon
1284
1285
  FeatureManager._add_set_from_df = _add_set_from_df
1285
1286
  FeatureManager._add_set_from_anndata = _add_set_from_anndata
1286
1287
  FeatureManager._add_set_from_mudata = _add_set_from_mudata