lamindb 0.77.2__py3-none-any.whl → 1.0rc1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (89)
  1. lamindb/__init__.py +39 -32
  2. lamindb/_artifact.py +95 -64
  3. lamindb/_can_curate.py +19 -10
  4. lamindb/_collection.py +51 -49
  5. lamindb/_feature.py +9 -9
  6. lamindb/_finish.py +99 -86
  7. lamindb/_from_values.py +20 -17
  8. lamindb/_is_versioned.py +2 -1
  9. lamindb/_parents.py +23 -16
  10. lamindb/_query_manager.py +3 -3
  11. lamindb/_query_set.py +85 -18
  12. lamindb/_record.py +121 -46
  13. lamindb/_run.py +3 -3
  14. lamindb/_save.py +14 -8
  15. lamindb/{_feature_set.py → _schema.py} +34 -31
  16. lamindb/_storage.py +2 -1
  17. lamindb/_transform.py +51 -23
  18. lamindb/_ulabel.py +17 -8
  19. lamindb/_view.py +15 -14
  20. lamindb/base/__init__.py +24 -0
  21. lamindb/base/fields.py +281 -0
  22. lamindb/base/ids.py +103 -0
  23. lamindb/base/types.py +51 -0
  24. lamindb/base/users.py +30 -0
  25. lamindb/base/validation.py +67 -0
  26. lamindb/core/__init__.py +19 -14
  27. lamindb/core/_context.py +297 -228
  28. lamindb/core/_data.py +44 -49
  29. lamindb/core/_describe.py +41 -31
  30. lamindb/core/_django.py +59 -44
  31. lamindb/core/_feature_manager.py +192 -168
  32. lamindb/core/_label_manager.py +22 -22
  33. lamindb/core/_mapped_collection.py +17 -14
  34. lamindb/core/_settings.py +1 -12
  35. lamindb/core/_sync_git.py +56 -9
  36. lamindb/core/_track_environment.py +1 -1
  37. lamindb/core/datasets/_core.py +5 -6
  38. lamindb/core/exceptions.py +0 -7
  39. lamindb/core/fields.py +1 -1
  40. lamindb/core/loaders.py +18 -2
  41. lamindb/core/{schema.py → relations.py} +22 -19
  42. lamindb/core/storage/_anndata_accessor.py +1 -2
  43. lamindb/core/storage/_backed_access.py +2 -1
  44. lamindb/core/storage/_tiledbsoma.py +40 -13
  45. lamindb/core/storage/objects.py +1 -1
  46. lamindb/core/storage/paths.py +13 -8
  47. lamindb/core/subsettings/__init__.py +0 -2
  48. lamindb/core/types.py +2 -23
  49. lamindb/core/versioning.py +11 -7
  50. lamindb/{_curate.py → curators/__init__.py} +700 -57
  51. lamindb/curators/_spatial.py +528 -0
  52. lamindb/integrations/_vitessce.py +1 -3
  53. lamindb/migrations/0052_squashed.py +1261 -0
  54. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +57 -0
  55. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +35 -0
  56. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +61 -0
  57. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +22 -0
  58. lamindb/migrations/0057_link_models_latest_report_and_others.py +356 -0
  59. lamindb/migrations/0058_artifact__actions_collection__actions.py +22 -0
  60. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +31 -0
  61. lamindb/migrations/0060_alter_artifact__actions.py +22 -0
  62. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +45 -0
  63. lamindb/migrations/0062_add_is_latest_field.py +32 -0
  64. lamindb/migrations/0063_populate_latest_field.py +45 -0
  65. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +33 -0
  66. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +22 -0
  67. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +352 -0
  68. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +20 -0
  69. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +20 -0
  70. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +1294 -0
  71. lamindb/migrations/0069_squashed.py +1770 -0
  72. lamindb/migrations/0070_lamindbv1_migrate_data.py +78 -0
  73. lamindb/migrations/0071_lamindbv1_migrate_schema.py +741 -0
  74. lamindb/migrations/0072_remove_user__branch_code_remove_user_aux_and_more.py +148 -0
  75. lamindb/migrations/0073_merge_ourprojects.py +945 -0
  76. lamindb/migrations/0074_lamindbv1_part4.py +374 -0
  77. lamindb/migrations/0075_lamindbv1_part5.py +276 -0
  78. lamindb/migrations/0076_lamindbv1_part6.py +621 -0
  79. lamindb/migrations/0077_lamindbv1_part6b.py +228 -0
  80. lamindb/migrations/0078_lamindbv1_part6c.py +468 -0
  81. lamindb/migrations/0079_alter_rundata_value_json_and_more.py +36 -0
  82. lamindb/migrations/__init__.py +0 -0
  83. lamindb/models.py +4064 -0
  84. {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/METADATA +15 -20
  85. lamindb-1.0rc1.dist-info/RECORD +100 -0
  86. {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/WHEEL +1 -1
  87. lamindb/core/subsettings/_transform_settings.py +0 -21
  88. lamindb-0.77.2.dist-info/RECORD +0 -63
  89. {lamindb-0.77.2.dist-info → lamindb-1.0rc1.dist-info}/LICENSE +0 -0
@@ -14,35 +14,17 @@ from anndata import AnnData
  from django.contrib.postgres.aggregates import ArrayAgg
  from django.db import connections
  from django.db.models import Aggregate
- from lamin_utils import colors, logger
+ from lamin_utils import logger
  from lamindb_setup.core.hashing import hash_set
  from lamindb_setup.core.upath import create_path
- from lnschema_core.models import (
- Artifact,
- Collection,
- Feature,
- FeatureManager,
- FeatureValue,
- LinkORM,
- Param,
- ParamManager,
- ParamManagerArtifact,
- ParamManagerRun,
- ParamValue,
- Record,
- Run,
- ULabel,
- )
  from rich.table import Column, Table
  from rich.text import Text

  from lamindb._feature import (
- FEATURE_DTYPES,
  convert_pandas_dtype_to_lamin_dtype,
  suggest_categorical_for_str_iterable,
  )
- from lamindb._feature_set import DICT_KEYS_TYPE, FeatureSet
- from lamindb._from_values import _print_values
+ from lamindb._from_values import _format_values
  from lamindb._record import (
  REGISTRY_UNIQUE_FIELD,
  get_name_field,
@@ -50,8 +32,25 @@ from lamindb._record import (
  transfer_to_default_db,
  )
  from lamindb._save import save
+ from lamindb._schema import DICT_KEYS_TYPE, Schema
  from lamindb.core.exceptions import DoesNotExist, ValidationError
  from lamindb.core.storage import LocalPathClasses
+ from lamindb.models import (
+ Artifact,
+ Collection,
+ Feature,
+ FeatureManager,
+ FeatureValue,
+ LinkORM,
+ Param,
+ ParamManager,
+ ParamManagerArtifact,
+ ParamManagerRun,
+ ParamValue,
+ Record,
+ Run,
+ ULabel,
+ )

  from ._describe import (
  NAME_WIDTH,
@@ -63,15 +62,15 @@ from ._describe import (
  from ._django import get_artifact_with_related
  from ._label_manager import _get_labels, describe_labels
  from ._settings import settings
- from .schema import (
+ from .relations import (
  dict_related_model_to_related_name,
  )

  if TYPE_CHECKING:
- from lnschema_core.types import FieldAttr
  from rich.tree import Tree

  from lamindb._query_set import QuerySet
+ from lamindb.base.types import FieldAttr


  def get_host_id_field(host: Artifact | Collection) -> str:
@@ -84,7 +83,7 @@ def get_host_id_field(host: Artifact | Collection) -> str:

  def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
  dictionary = {
- field.related_model.__get_name_with_schema__(): field.name
+ field.related_model.__get_name_with_module__(): field.name
  for field in host._meta.related_objects
  }
  dictionary["Feature"] = "features"
@@ -92,25 +91,25 @@ def get_accessor_by_registry_(host: Artifact | Collection) -> dict:
  return dictionary


- def get_feature_set_by_slot_(host: Artifact | Collection) -> dict:
+ def get_schema_by_slot_(host: Artifact | Collection) -> dict:
  if isinstance(host, Collection):
  return {}
  # if the host is not yet saved
  if host._state.adding:
- if hasattr(host, "_feature_sets"):
- return host._feature_sets
+ if hasattr(host, "_staged__schemas_m2m"):
+ return host._staged__schemas_m2m
  else:
  return {}
  host_db = host._state.db
  host_id_field = get_host_id_field(host)
  kwargs = {host_id_field: host.id}
  # otherwise, we need a query
- links_feature_set = (
- host.feature_sets.through.objects.using(host_db)
+ links_schema = (
+ host._schemas_m2m.through.objects.using(host_db)
  .filter(**kwargs)
- .select_related("featureset")
+ .select_related("schema")
  )
- return {fsl.slot: fsl.featureset for fsl in links_feature_set}
+ return {fsl.slot: fsl.schema for fsl in links_schema}


  def get_label_links(
@@ -126,11 +125,11 @@ def get_label_links(
  return link_records


- def get_feature_set_links(host: Artifact | Collection) -> QuerySet:
+ def get_schema_links(host: Artifact | Collection) -> QuerySet:
  host_id_field = get_host_id_field(host)
  kwargs = {host_id_field: host.id}
- links_feature_set = host.feature_sets.through.objects.filter(**kwargs)
- return links_feature_set
+ links_schema = host._schemas_m2m.through.objects.filter(**kwargs)
+ return links_schema


  def get_link_attr(link: LinkORM | type[LinkORM], data: Artifact | Collection) -> str:
@@ -270,25 +269,27 @@ def _get_non_categoricals(
  return non_categoricals


- def _get_featuresets_postgres(
+ def _get_schemas_postgres(
  self: Artifact | Collection,
  related_data: dict | None = None,
  ) -> dict:
  if not related_data:
- artifact_meta = get_artifact_with_related(self, include_featureset=True)
+ artifact_meta = get_artifact_with_related(self, include_schema=True)
  related_data = artifact_meta.get("related_data", {})

- fs_data = related_data.get("featuresets", {}) if related_data else {}
+ fs_data = related_data.get("schemas", {}) if related_data else {}
  return fs_data


- def _create_feature_table(name: str, registry_str: str, data: list) -> Table:
+ def _create_feature_table(
+ name: str, registry_str: str, data: list, show_header: bool = False
+ ) -> Table:
  """Create a Rich table for a feature group."""
  table = Table(
  Column(name, style="", no_wrap=True, width=NAME_WIDTH),
  Column(registry_str, style="dim", no_wrap=True, width=TYPE_WIDTH),
  Column("", width=VALUES_WIDTH, no_wrap=True),
- show_header=True,
+ show_header=show_header,
  box=None,
  pad_edge=False,
  )
@@ -324,36 +325,36 @@ def describe_features(
  return dictionary if to_dict else tree

  # feature sets
- feature_set_data: dict[str, tuple[str, list[str]]] = {}
+ schema_data: dict[str, tuple[str, list[str]]] = {}
  feature_data: dict[str, tuple[str, list[str]]] = {}
  if not print_params and not to_dict:
  if self.id is not None and connections[self._state.db].vendor == "postgresql":
- fs_data = _get_featuresets_postgres(self, related_data=related_data)
+ fs_data = _get_schemas_postgres(self, related_data=related_data)
  for fs_id, (slot, data) in fs_data.items():
  for registry_str, feature_names in data.items():
- feature_set = FeatureSet.get(id=fs_id)
- feature_set_data[slot] = (feature_set, feature_names)
+ schema = Schema.objects.using(self._state.db).get(id=fs_id)
+ schema_data[slot] = (schema, feature_names)
  for feature_name in feature_names:
  feature_data[feature_name] = (slot, registry_str)
  else:
- for slot, feature_set in get_feature_set_by_slot_(self).items():
- features = feature_set.members
+ for slot, schema in get_schema_by_slot_(self).items():
+ features = schema.members
  # features.first() is a lot slower than features[0] here
  name_field = get_name_field(features[0])
  feature_names = list(features.values_list(name_field, flat=True)[:20])
- feature_set_data[slot] = (feature_set, feature_names)
+ schema_data[slot] = (schema, feature_names)
  for feature_name in feature_names:
- feature_data[feature_name] = (slot, feature_set.registry)
+ feature_data[feature_name] = (slot, schema.itype)

- internal_feature_names: set[str] = {} # type: ignore
+ internal_feature_names: dict[str, str] = {}
  if isinstance(self, Artifact):
- feature_sets = self.feature_sets.filter(registry="Feature").all()
- internal_feature_names = set() # type: ignore
- if len(feature_sets) > 0:
- for feature_set in feature_sets:
- internal_feature_names = internal_feature_names.union(
- set(feature_set.members.values_list("name", flat=True))
- ) # type: ignore
+ _schemas_m2m = self._schemas_m2m.filter(itype="Feature").all()
+ internal_feature_names = {}
+ if len(_schemas_m2m) > 0:
+ for schema in _schemas_m2m:
+ internal_feature_names.update(
+ dict(schema.members.values_list("name", "dtype"))
+ )

  # categorical feature values
  # Get the categorical data using the appropriate method
@@ -388,7 +389,7 @@ def describe_features(

  # Format message
  printed_values = (
- _print_values(sorted(values), n=10, quotes=False)
+ _format_values(sorted(values), n=10, quotes=False)
  if not is_list_type or not feature_dtype.startswith("list")
  else sorted(values)
  )
@@ -407,78 +408,99 @@ def describe_features(
  if to_dict:
  return dictionary

- # Dataset section
- internal_features_slot: dict[
- str, list
- ] = {} # internal features from the `Feature` registry that contain labels
+ # Dataset features section
+ # internal features that contain labels (only `Feature` features contain labels)
+ internal_feature_labels_slot: dict[str, list] = {}
  for feature_name, feature_row in internal_feature_labels.items():
  slot, _ = feature_data.get(feature_name)
- internal_features_slot.setdefault(slot, []).append(feature_row)
- dataset_tree_children = []
-
- for slot, (feature_set, feature_names) in feature_set_data.items():
- if slot in internal_features_slot:
- feature_rows = internal_features_slot[slot]
+ internal_feature_labels_slot.setdefault(slot, []).append(feature_row)
+
+ int_features_tree_children = []
+ for slot, (schema, feature_names) in schema_data.items():
+ if slot in internal_feature_labels_slot:
+ # add internal Feature features with labels
+ feature_rows = internal_feature_labels_slot[slot]
+ # add internal Feature features without labels
+ feature_rows += [
+ (
+ feature_name,
+ Text(str(internal_feature_names.get(feature_name)), style="dim"),
+ "",
+ )
+ for feature_name in feature_names
+ if feature_name and feature_name not in internal_feature_labels
+ ]
  else:
+ # add internal non-Feature features without labels
  feature_rows = [
- (feature_name, Text(str(feature_set.dtype), style="dim"), "")
+ (
+ feature_name,
+ Text(
+ str(
+ internal_feature_names.get(feature_name)
+ if feature_name in internal_feature_names
+ else schema.dtype
+ ),
+ style="dim",
+ ),
+ "",
+ )
  for feature_name in feature_names
  if feature_name
  ]
- dataset_tree_children.append(
+ int_features_tree_children.append(
  _create_feature_table(
  Text.assemble(
  (slot, "violet"),
  (" • ", "dim"),
- (str(feature_set.n), "pink1"),
+ (str(schema.n), "pink1"),
  ),
- Text.assemble((f"[{feature_set.registry}]", "pink1")),
+ Text.assemble((f"[{schema.itype}]", "pink1")),
  feature_rows,
+ show_header=True,
  )
  )
  ## internal features from the non-`Feature` registry
- if dataset_tree_children:
+ if int_features_tree_children:
  dataset_tree = tree.add(
  Text.assemble(
- ("Dataset", "bold bright_magenta"),
+ ("Dataset features", "bold bright_magenta"),
  ("/", "dim"),
- (".feature_sets", "dim bold"),
+ ("._schemas_m2m", "dim bold"),
  )
  )
- for child in dataset_tree_children:
+ for child in int_features_tree_children:
  dataset_tree.add(child)

- # Annotations section
- ## external features
- features_tree_children = []
+ # Linked features
+ ext_features_tree_children = []
  if external_data:
- features_tree_children.append(
+ ext_features_tree_children.append(
  _create_feature_table(
- Text.assemble(
- ("Params" if print_params else "Features", "green_yellow")
- ),
+ "",
  "",
  external_data,
  )
  )
- annotations_tree = None
- if features_tree_children:
- annotations_tree = tree.add(Text("Annotations", style="bold dark_orange"))
- for child in features_tree_children:
- annotations_tree.add(child)
+ # ext_features_tree = None
+ ext_features_header = Text(
+ "Params" if print_params else "Linked features", style="bold dark_orange"
+ )
+ if ext_features_tree_children:
+ ext_features_tree = tree.add(ext_features_header)
+ for child in ext_features_tree_children:
+ ext_features_tree.add(child)
  if with_labels:
- labels_tree = describe_labels(self, as_subtree=True)
+ # avoid querying the db if the labels were queried already
+ labels_data = related_data.get("m2m") if related_data is not None else None
+ labels_tree = describe_labels(self, labels_data=labels_data, as_subtree=True)
  if labels_tree:
- if annotations_tree is None:
- annotations_tree = tree.add(
- Text("Annotations", style="bold dark_orange")
- )
- annotations_tree.add(labels_tree)
+ tree.add(labels_tree)

  return tree


- def parse_feature_sets_from_anndata(
+ def parse_staged__schemas_m2m_from_anndata(
  adata: AnnData,
  var_field: FieldAttr | None = None,
  obs_field: FieldAttr = Feature.name,
@@ -502,11 +524,11 @@ def parse_feature_sets_from_anndata(
  if adata.X is None
  else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
  )
- feature_sets = {}
+ _schemas_m2m = {}
  if var_field is not None:
  logger.info("parsing feature names of X stored in slot 'var'")
  logger.indent = " "
- feature_set_var = FeatureSet.from_values(
+ schema_var = Schema.from_values(
  data_parse.var.index,
  var_field,
  type=type,
@@ -514,28 +536,28 @@
  organism=organism,
  raise_validation_error=False,
  )
- if feature_set_var is not None:
- feature_sets["var"] = feature_set_var
- logger.save(f"linked: {feature_set_var}")
+ if schema_var is not None:
+ _schemas_m2m["var"] = schema_var
+ logger.save(f"linked: {schema_var}")
  logger.indent = ""
- if feature_set_var is None:
+ if schema_var is None:
  logger.warning("skip linking features to artifact in slot 'var'")
  if len(data_parse.obs.columns) > 0:
  logger.info("parsing feature names of slot 'obs'")
  logger.indent = " "
- feature_set_obs = FeatureSet.from_df(
+ schema_obs = Schema.from_df(
  df=data_parse.obs,
  field=obs_field,
  mute=mute,
  organism=organism,
  )
- if feature_set_obs is not None:
- feature_sets["obs"] = feature_set_obs
- logger.save(f"linked: {feature_set_obs}")
+ if schema_obs is not None:
+ _schemas_m2m["obs"] = schema_obs
+ logger.save(f"linked: {schema_obs}")
  logger.indent = ""
- if feature_set_obs is None:
+ if schema_obs is None:
  logger.warning("skip linking features to artifact in slot 'obs'")
- return feature_sets
+ return _schemas_m2m


  def is_valid_datetime_str(date_string: str) -> bool | str:
@@ -598,12 +620,12 @@ def infer_feature_type_convert_json(
  return ("list[cat ? str]", value, message)
  elif first_element_type == Record:
  return (
- f"list[cat[{first_element_type.__get_name_with_schema__()}]]",
+ f"list[cat[{first_element_type.__get_name_with_module__()}]]",
  value,
  message,
  )
  elif isinstance(value, Record):
- return (f"cat[{value.__class__.__get_name_with_schema__()}]", value, message)
+ return (f"cat[{value.__class__.__get_name_with_module__()}]", value, message)
  if not mute:
  logger.warning(f"cannot infer feature type of: {value}, returning '?")
  return "?", value, message
@@ -611,7 +633,7 @@

  def __init__(self, host: Artifact | Collection | Run):
  self._host = host
- self._feature_set_by_slot_ = None
+ self._schema_by_slot_ = None
  self._accessor_by_registry_ = None


@@ -628,15 +650,15 @@ def get_values(self) -> dict[str, Any]:


  def __getitem__(self, slot) -> QuerySet:
- if slot not in self._feature_set_by_slot:
+ if slot not in self._schema_by_slot:
  raise ValueError(
  f"No linked feature set for slot: {slot}\nDid you get validation"
  " warnings? Only features that match registered features get validated"
  " and linked."
  )
- feature_set = self._feature_set_by_slot[slot]
- orm_name = feature_set.registry
- return getattr(feature_set, self._accessor_by_registry[orm_name]).all()
+ schema = self._schema_by_slot[slot]
+ orm_name = schema.itype
+ return getattr(schema, self._accessor_by_registry[orm_name]).all()


  def filter_base(cls, **expression):
@@ -726,11 +748,11 @@ def get(cls, **expression) -> Record:


  @property # type: ignore
- def _feature_set_by_slot(self):
+ def _schema_by_slot(self):
  """Feature sets by slot."""
- if self._feature_set_by_slot_ is None:
- self._feature_set_by_slot_ = get_feature_set_by_slot_(self._host)
- return self._feature_set_by_slot_
+ if self._schema_by_slot_ is None:
+ self._schema_by_slot_ = get_schema_by_slot_(self._host)
+ return self._schema_by_slot_


  @property # type: ignore
@@ -810,11 +832,11 @@ def _add_values(
  model_name = "Param" if is_param else "Feature"
  if is_param:
  if self._host.__class__ == Artifact:
- if self._host.type != "model":
+ if self._host.kind != "model":
  raise ValidationError("Can only set params for model-like artifacts.")
  else:
  if self._host.__class__ == Artifact:
- if self._host.type != "dataset" and self._host.type is not None:
+ if self._host.kind != "dataset" and self._host.kind is not None:
  raise ValidationError(
  "Can only set features for dataset-like artifacts."
  )
@@ -890,7 +912,7 @@ def _add_values(
  raise ValidationError(
  f"Please save {record} before annotation."
  )
- features_labels[record.__class__.__get_name_with_schema__()].append(
+ features_labels[record.__class__.__get_name_with_module__()].append(
  (feature, record)
  )
  else:
@@ -934,7 +956,7 @@ def _add_values(
  links = [
  LinkORM(
  **{
- f"{self._host.__class__.__get_name_with_schema__().lower()}_id": self._host.id,
+ f"{self._host.__class__.__get_name_with_module__().lower()}_id": self._host.id,
  valuefield_id: feature_value.id,
  }
  )
@@ -1007,14 +1029,14 @@ def remove_values(
  link_models_on_models = {
  getattr(
  Artifact, obj.related_name
- ).through.__get_name_with_schema__(): obj.related_model.__get_name_with_schema__()
+ ).through.__get_name_with_module__(): obj.related_model.__get_name_with_module__()
  for obj in Artifact._meta.related_objects
- if obj.related_model.__get_name_with_schema__() == feature_registry
+ if obj.related_model.__get_name_with_module__() == feature_registry
  }
  link_attribute = {
  obj.related_name
  for obj in Artifact._meta.related_objects
- if obj.related_model.__get_name_with_schema__() in link_models_on_models
+ if obj.related_model.__get_name_with_module__() in link_models_on_models
  }.pop()
  getattr(self._host, link_attribute).filter(**filter_kwargs).all().delete()
  else:
@@ -1027,36 +1049,37 @@
  # we can clean the FeatureValue registry periodically if we want to


- def add_feature_set(self, feature_set: FeatureSet, slot: str) -> None:
- """Curate artifact with a feature set.
+ def add_schema(self, schema: Schema, slot: str) -> None:
+ """Annotate artifact with a schema.

  Args:
- feature_set: `FeatureSet` A feature set record.
- slot: `str` The slot that marks where the feature set is stored in
+ schema: `Schema` A schema record.
+ slot: `str` The slot that marks where the schema is stored in
  the artifact.
  """
+ # TODO: deprecate as soon as we have the Schema-based curators
  if self._host._state.adding:
  raise ValueError(
  "Please save the artifact or collection before adding a feature set!"
  )
  host_db = self._host._state.db
- feature_set.save(using=host_db)
+ schema.save(using=host_db)
  host_id_field = get_host_id_field(self._host)
  kwargs = {
  host_id_field: self._host.id,
- "featureset": feature_set,
+ "schema": schema,
  "slot": slot,
  }
  link_record = (
- self._host.feature_sets.through.objects.using(host_db)
+ self._host._schemas_m2m.through.objects.using(host_db)
  .filter(**kwargs)
  .one_or_none()
  )
  if link_record is None:
- self._host.feature_sets.through(**kwargs).save(using=host_db)
- if slot in self._feature_set_by_slot:
+ self._host._schemas_m2m.through(**kwargs).save(using=host_db)
+ if slot in self._schema_by_slot:
  logger.debug(f"replaced existing {slot} feature set")
- self._feature_set_by_slot_[slot] = feature_set # type: ignore
+ self._schema_by_slot_[slot] = schema # type: ignore


  def _add_set_from_df(
@@ -1067,18 +1090,18 @@ def _add_set_from_df(
  ):
  """Add feature set corresponding to column names of DataFrame."""
  if isinstance(self._host, Artifact):
- assert self._host._accessor == "DataFrame" # noqa: S101
+ assert self._host.otype == "DataFrame" # noqa: S101
  else:
  # Collection
- assert self._host.artifact._accessor == "DataFrame" # noqa: S101
+ assert self._host.artifact.otype == "DataFrame" # noqa: S101
  df = self._host.load()
- feature_set = FeatureSet.from_df(
+ schema = Schema.from_df(
  df=df,
  field=field,
  mute=mute,
  organism=organism,
  )
- self._host._feature_sets = {"columns": feature_set}
+ self._host._staged__schemas_m2m = {"columns": schema}
  self._host.save()


@@ -1091,13 +1114,13 @@ def _add_set_from_anndata(
  ):
  """Add features from AnnData."""
  if isinstance(self._host, Artifact):
- assert self._host._accessor == "AnnData" # noqa: S101
+ assert self._host.otype == "AnnData" # noqa: S101
  else:
  raise NotImplementedError()

  # parse and register features
  adata = self._host.load()
- feature_sets = parse_feature_sets_from_anndata(
+ _schemas_m2m = parse_staged__schemas_m2m_from_anndata(
  adata,
  var_field=var_field,
  obs_field=obs_field,
@@ -1106,7 +1129,7 @@ def _add_set_from_anndata(
  )

  # link feature sets
- self._host._feature_sets = feature_sets
+ self._host._staged__schemas_m2m = _schemas_m2m
  self._host.save()


@@ -1121,18 +1144,18 @@ def _add_set_from_mudata(
  if obs_fields is None:
  obs_fields = {}
  if isinstance(self._host, Artifact):
- assert self._host._accessor == "MuData" # noqa: S101
+ assert self._host.otype == "MuData" # noqa: S101
  else:
  raise NotImplementedError()

  # parse and register features
  mdata = self._host.load()
- feature_sets = {}
+ _schemas_m2m = {}
  obs_features = Feature.from_values(mdata.obs.columns)
  if len(obs_features) > 0:
- feature_sets["obs"] = FeatureSet(features=obs_features)
+ _schemas_m2m["obs"] = Schema(features=obs_features)
  for modality, field in var_fields.items():
- modality_fs = parse_feature_sets_from_anndata(
+ modality_fs = parse_staged__schemas_m2m_from_anndata(
  mdata[modality],
  var_field=field,
  obs_field=obs_fields.get(modality, Feature.name),
@@ -1140,22 +1163,22 @@ def _add_set_from_mudata(
  organism=organism,
  )
  for k, v in modality_fs.items():
- feature_sets[f"['{modality}'].{k}"] = v
+ _schemas_m2m[f"['{modality}'].{k}"] = v

- def unify_feature_sets_by_hash(feature_sets):
+ def unify_staged__schemas_m2m_by_hash(_schemas_m2m):
  unique_values = {}

- for key, value in feature_sets.items():
+ for key, value in _schemas_m2m.items():
  value_hash = value.hash # Assuming each value has a .hash attribute
  if value_hash in unique_values:
- feature_sets[key] = unique_values[value_hash]
+ _schemas_m2m[key] = unique_values[value_hash]
  else:
  unique_values[value_hash] = value

- return feature_sets
+ return _schemas_m2m

  # link feature sets
- self._host._feature_sets = unify_feature_sets_by_hash(feature_sets)
+ self._host._staged__schemas_m2m = unify_staged__schemas_m2m_by_hash(_schemas_m2m)
  self._host.save()


@@ -1165,8 +1188,8 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
  if transfer_logs is None:
  transfer_logs = {"mapped": [], "transferred": [], "run": None}
  using_key = settings._using_key
- for slot, feature_set in data.features._feature_set_by_slot.items():
- members = feature_set.members
+ for slot, schema in data.features._schema_by_slot.items():
+ members = schema.members
  if len(members) == 0:
  continue
  registry = members[0].__class__
@@ -1202,20 +1225,18 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
  save(new_members)

  # create a new feature set from feature values using the same uid
- feature_set_self = FeatureSet.from_values(
- member_uids, field=getattr(registry, field)
- )
- if feature_set_self is None:
+ schema_self = Schema.from_values(member_uids, field=getattr(registry, field))
+ if schema_self is None:
  if hasattr(registry, "organism_id"):
  logger.warning(
- f"FeatureSet is not transferred, check if organism is set correctly: {feature_set}"
+ f"Schema is not transferred, check if organism is set correctly: {schema}"
  )
  continue
- # make sure the uid matches if featureset is composed of same features
- if feature_set_self.hash == feature_set.hash:
- feature_set_self.uid = feature_set.uid
- logger.info(f"saving {slot} featureset: {feature_set_self}")
- self._host.features.add_feature_set(feature_set_self, slot)
+ # make sure the uid matches if schema is composed of same features
+ if schema_self.hash == schema.hash:
+ schema_self.uid = schema.uid
+ logger.info(f"saving {slot} schema: {schema_self}")
+ self._host.features.add_schema(schema_self, slot)


  def make_external(self, feature: Feature) -> None:
@@ -1227,8 +1248,8 @@ def make_external(self, feature: Feature) -> None:
  """
  if not isinstance(feature, Feature):
  raise TypeError("feature must be a Feature record!")
- feature_sets = FeatureSet.filter(features=feature).all()
- for fs in feature_sets:
+ _schemas_m2m = Schema.filter(features=feature).all()
+ for fs in _schemas_m2m:
  f = Feature.filter(uid=feature.uid).all()
  features_updated = fs.members.difference(f)
  if len(features_updated) > 0:
@@ -1238,13 +1259,14 @@ def make_external(self, feature: Feature) -> None:
  fs.n = len(features_updated)
  fs.save()
  # delete the link between the feature and the feature set
- FeatureSet.features.through.objects.filter(
- feature_id=feature.id, featureset_id=fs.id
+ Schema.features.through.objects.filter(
+ feature_id=feature.id, schema_id=fs.id
  ).delete()
- # if no members are left in the featureset, delete it
+ # if no members are left in the schema, delete it
  if len(features_updated) == 0:
  logger.warning(f"deleting empty feature set: {fs}")
  fs.artifacts.set([])
+ fs._artifacts_m2m.set([])
  fs.delete()


@@ -1254,10 +1276,12 @@ FeatureManager.__repr__ = __repr__
  ParamManager.__repr__ = __repr__
  FeatureManager.__getitem__ = __getitem__
  FeatureManager.get_values = get_values
- FeatureManager._feature_set_by_slot = _feature_set_by_slot
+ FeatureManager._schema_by_slot = _schema_by_slot
+ FeatureManager._feature_set_by_slot = _schema_by_slot
  FeatureManager._accessor_by_registry = _accessor_by_registry
  FeatureManager.add_values = add_values_features
- FeatureManager.add_feature_set = add_feature_set
+ FeatureManager.add_schema = add_schema
+ FeatureManager.add_feature_set = add_schema # backward compat, will raise warning soon
  FeatureManager._add_set_from_df = _add_set_from_df
  FeatureManager._add_set_from_anndata = _add_set_from_anndata
  FeatureManager._add_set_from_mudata = _add_set_from_mudata
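
Taken together, the hunks in this file track the lamindb 1.0 renames: the lnschema_core models now live in lamindb.models, FeatureSet becomes Schema (lamindb/_feature_set.py → lamindb/_schema.py), the feature_sets accessor and the registry attribute become _schemas_m2m and itype, Artifact.type and ._accessor become kind and otype, and FeatureManager.add_feature_set survives only as a backward-compatible alias for the new add_schema. Below is a minimal migration sketch based on those renames; the example DataFrame, the Artifact.from_df call, and its description argument are illustrative assumptions rather than code taken from this diff.

import pandas as pd

from lamindb._schema import Schema  # 0.77.x: from lamindb._feature_set import FeatureSet
from lamindb.models import Artifact, Feature  # 0.77.x: from lnschema_core.models import Artifact, Feature

# hypothetical table standing in for a real dataset
df = pd.DataFrame({"cell_type": ["T cell", "B cell"], "n_genes": [1200, 980]})

# 0.77.x: FeatureSet.from_df(df=df, field=Feature.name)
# may return None if no column names validate against the Feature registry
schema = Schema.from_df(df=df, field=Feature.name)

artifact = Artifact.from_df(df, description="example table")  # assumed unchanged in 1.0rc1
artifact.save()  # add_schema() raises if the artifact is not yet saved

# 0.77.x: artifact.features.add_feature_set(schema, slot="columns")
artifact.features.add_schema(schema, slot="columns")
# the old spelling still resolves for now: FeatureManager.add_feature_set = add_schema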