lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +216 -133
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/objects.py +6 -0
  20. lamindb/core/subsettings/__init__.py +2 -0
  21. lamindb/core/subsettings/_annotation_settings.py +11 -0
  22. lamindb/curators/__init__.py +7 -3559
  23. lamindb/curators/_legacy.py +2056 -0
  24. lamindb/curators/core.py +1546 -0
  25. lamindb/errors.py +11 -0
  26. lamindb/examples/__init__.py +27 -0
  27. lamindb/examples/schemas/__init__.py +12 -0
  28. lamindb/examples/schemas/_anndata.py +25 -0
  29. lamindb/examples/schemas/_simple.py +19 -0
  30. lamindb/integrations/_vitessce.py +8 -5
  31. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  32. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  33. lamindb/models/__init__.py +12 -2
  34. lamindb/models/_describe.py +21 -4
  35. lamindb/models/_feature_manager.py +384 -301
  36. lamindb/models/_from_values.py +1 -1
  37. lamindb/models/_is_versioned.py +5 -15
  38. lamindb/models/_label_manager.py +8 -2
  39. lamindb/models/artifact.py +354 -177
  40. lamindb/models/artifact_set.py +122 -0
  41. lamindb/models/can_curate.py +4 -1
  42. lamindb/models/collection.py +79 -56
  43. lamindb/models/core.py +1 -1
  44. lamindb/models/feature.py +78 -47
  45. lamindb/models/has_parents.py +24 -9
  46. lamindb/models/project.py +3 -3
  47. lamindb/models/query_manager.py +221 -22
  48. lamindb/models/query_set.py +251 -206
  49. lamindb/models/record.py +211 -344
  50. lamindb/models/run.py +59 -5
  51. lamindb/models/save.py +9 -5
  52. lamindb/models/schema.py +673 -196
  53. lamindb/models/transform.py +5 -14
  54. lamindb/models/ulabel.py +8 -5
  55. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
  56. lamindb-1.5.0.dist-info/RECORD +108 -0
  57. lamindb-1.3.2.dist-info/RECORD +0 -95
  58. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
  59. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
lamindb/models/schema.py CHANGED
@@ -1,12 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING, Any, overload
3
+ from typing import TYPE_CHECKING, Any, Type, overload
4
4
 
5
5
  import numpy as np
6
6
  from django.db import models
7
7
  from django.db.models import CASCADE, PROTECT, ManyToManyField
8
8
  from lamin_utils import logger
9
- from lamindb_setup.core.hashing import HASH_LENGTH, hash_set
9
+ from lamindb_setup.core.hashing import HASH_LENGTH, hash_string
10
+ from rich.table import Table
11
+ from rich.text import Text
12
+ from rich.tree import Tree
10
13
 
11
14
  from lamindb.base import ids
12
15
  from lamindb.base.fields import (
@@ -17,10 +20,11 @@ from lamindb.base.fields import (
17
20
  JSONField,
18
21
  )
19
22
  from lamindb.base.types import FieldAttr, ListLike
20
- from lamindb.errors import InvalidArgument
23
+ from lamindb.errors import FieldValidationError, InvalidArgument
24
+ from lamindb.models.feature import parse_cat_dtype
21
25
 
22
- from ..base import deprecated
23
26
  from ..errors import ValidationError
27
+ from ._describe import format_rich_tree, highlight_time
24
28
  from ._relations import (
25
29
  dict_related_model_to_related_name,
26
30
  get_related_name,
@@ -36,14 +40,13 @@ from .record import (
36
40
  LinkORM,
37
41
  Record,
38
42
  Registry,
43
+ _get_record_kwargs,
39
44
  init_self_from_db,
40
45
  update_attributes,
41
46
  )
42
47
  from .run import Param, TracksRun, TracksUpdates
43
48
 
44
49
  if TYPE_CHECKING:
45
- from collections.abc import Iterable
46
-
47
50
  import pandas as pd
48
51
  from django.db.models.query_utils import DeferredAttribute
49
52
 
@@ -80,80 +83,271 @@ def validate_features(features: list[Record]) -> Record:
80
83
  return next(iter(feature_types)) # return value in set of cardinality 1
81
84
 
82
85
 
83
- class Schema(Record, CanCurate, TracksRun):
84
- """Schemas.
86
+ def get_features_config(
87
+ features: list[Record] | tuple[Record, dict],
88
+ ) -> tuple[list[Record], list[tuple[Record, dict]]]:
89
+ """Get features and their config from the return of feature.with_config()."""
90
+ features_list = []
91
+ configs = []
92
+ try:
93
+ for feature in features:
94
+ if isinstance(feature, tuple):
95
+ features_list.append(feature[0])
96
+ configs.append(feature) # store the tuple in configs
97
+ else:
98
+ features_list.append(feature)
99
+ return features_list, configs # type: ignore
100
+ except TypeError:
101
+ return features, configs # type: ignore
85
102
 
86
- The simplest schema is a feature set such as the set of columns of a `DataFrame`.
87
103
 
88
- A composite schema has multiple components, e.g., for an `AnnData`, one schema for `obs` and another one for `var`.
104
+ def describe_schema(self: Schema) -> Tree:
105
+ """Create a rich tree visualization of a Schema with its features."""
106
+ otype = self.otype if hasattr(self, "otype") and self.otype else ""
107
+ tree = Tree(
108
+ Text.assemble((self.__class__.__name__, "bold"), (f" {otype}", "bold dim")),
109
+ guide_style="dim", # dim the connecting lines
110
+ )
89
111
 
90
- Args:
91
- features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
92
- records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
93
- a set upon instantiation. If you'd like to pass values, use
94
- :meth:`~lamindb.Schema.from_values` or
95
- :meth:`~lamindb.Schema.from_df`.
96
- components: `dict[str, Schema] | None = None` A dictionary mapping component names to
97
- their corresponding :class:`~lamindb.Schema` objects for composite schemas.
98
- name: `str | None = None` A name.
99
- description: `str | None = None` A description.
100
- dtype: `str | None = None` The simple type. Defaults to
101
- `None` for sets of :class:`~lamindb.Feature` records.
102
- Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
103
- itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
104
- type: `Schema | None = None` A type.
105
- is_type: `bool = False` Distinguish types from instances of the type.
106
- otype: `str | None = None` An object type to define the structure of a composite schema.
107
- minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
108
- ordered_set: `bool = False` Whether features are required to be ordered.
109
- maximal_set: `bool = False` If `True`, no additional features are allowed.
110
- slot: `str | None = None` The slot name when this schema is used as a component in a
111
- composite schema.
112
- coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
113
- during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
112
+ tree.add(f".uid = '{self.uid}'")
113
+ tree.add(f".name = '{self.name}'")
114
+ if self.description:
115
+ tree.add(f".description = '{self.description}'")
116
+ if self.itype:
117
+ tree.add(f".itype = '{self.itype}'")
118
+ if self.type:
119
+ tree.add(f".type = '{self.type}'")
120
+ tree.add(f".ordered_set = {self.ordered_set}")
121
+ tree.add(f".maximal_set = {self.maximal_set}")
122
+ if hasattr(self, "created_by") and self.created_by:
123
+ tree.add(
124
+ Text.assemble(
125
+ ".created_by = ",
126
+ (
127
+ self.created_by.handle
128
+ if self.created_by.name is None
129
+ else f"{self.created_by.handle} ({self.created_by.name})"
130
+ ),
131
+ )
132
+ )
133
+ if hasattr(self, "created_at") and self.created_at:
134
+ tree.add(Text.assemble(".created_at = ", highlight_time(str(self.created_at))))
135
+
136
+ members = self.members
137
+
138
+ # Add features section
139
+ features = tree.add(
140
+ Text.assemble(
141
+ (self.itype, "violet"),
142
+ (" • ", "dim"),
143
+ (str(members.count()), "pink1"),
144
+ )
145
+ )
146
+
147
+ if hasattr(self, "members") and self.members.count() > 0:
148
+ # create a table for the features
149
+ feature_table = Table(
150
+ show_header=True, header_style="dim", box=None, pad_edge=False
151
+ )
152
+
153
+ # Add columns
154
+ feature_table.add_column("name", style="", no_wrap=True)
155
+ feature_table.add_column("dtype", style="", no_wrap=True)
156
+ feature_table.add_column("optional", style="", no_wrap=True)
157
+ feature_table.add_column("nullable", style="", no_wrap=True)
158
+ feature_table.add_column("coerce_dtype", style="", no_wrap=True)
159
+ feature_table.add_column("default_value", style="", no_wrap=True)
160
+
161
+ # Add rows for each member
162
+ optionals = self.optionals.get()
163
+ for member in self.members:
164
+ feature_table.add_row(
165
+ member.name,
166
+ Text(
167
+ str(member.dtype)
168
+ ), # needs to be wrapped in Text to display correctly
169
+ "✓" if optionals.filter(uid=member.uid).exists() else "✗",
170
+ "✓" if member.nullable else "✗",
171
+ "✓" if member.coerce_dtype else "✗",
172
+ str(member.default_value) if member.default_value else "unset",
173
+ )
114
174
 
115
- .. dropdown:: Why does LaminDB model schemas, not just features?
175
+ # Add the table to the features branch
176
+ features.add(feature_table)
116
177
 
117
- 1. Performance: Imagine you measure the same panel of 20k transcripts in
118
- 1M samples. By modeling the panel as a feature set, you can link all
119
- your artifacts against one feature set and only need to store 1M
120
- instead of 1M x 20k = 20B links.
121
- 2. Interpretation: Model protein panels, gene panels, etc.
122
- 3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.
178
+ return tree
123
179
 
124
- These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
125
180
 
126
- Note:
181
+ class SchemaOptionals:
182
+ """Manage and access optional features in a schema."""
127
183
 
128
- A feature set can be identified by the `hash` of its feature uids.
129
- It's stored in the `.hash` field.
184
+ def __init__(self, schema) -> None:
185
+ self.schema = schema
130
186
 
131
- A `slot` provides a string key to access feature sets. For instance, for the schema of an
132
- `AnnData` object, it would be `'obs'` for `adata.obs`.
187
+ def get_uids(self) -> list[str]:
188
+ """Get the uids of the optional features.
189
+
190
+ Does **not** need an additional query to the database, while `get()` does.
191
+ """
192
+ if (
193
+ self.schema._aux is not None
194
+ and "af" in self.schema._aux
195
+ and "1" in self.schema._aux["af"]
196
+ ):
197
+ return self.schema._aux["af"]["1"]
198
+ else:
199
+ return []
200
+
201
+ def get(self) -> QuerySet:
202
+ """Get the optional features."""
203
+ uids = self.get_uids()
204
+ if uids:
205
+ return Feature.objects.filter(uid__in=uids).order_by("links_schema__id")
206
+ else:
207
+ return Feature.objects.none() # empty QuerySet
208
+
209
+ def set(self, features: list[Feature]) -> None:
210
+ """Set the optional features (overwrites whichever features are currently optional)."""
211
+ if not isinstance(features, list) or not all(
212
+ isinstance(f, Feature) for f in features
213
+ ):
214
+ raise TypeError("features must be a list of Feature records!")
215
+ self.schema._aux = self.schema._aux or {}
216
+ if len(features) > 0:
217
+ self.schema._aux.setdefault("af", {})["1"] = [f.uid for f in features]
218
+
219
+ def remove(self, features: Feature | list[Feature]) -> None:
220
+ """Make one or multiple features required by removing them from the set of optional features."""
221
+ if not isinstance(features, list):
222
+ features = [features]
223
+ if not all(isinstance(f, Feature) for f in features):
224
+ raise TypeError("features must be a list of Feature records!")
225
+ if len(features) > 0:
226
+ self.schema._aux = self.schema._aux or {}
227
+ if "1" in self.schema._aux.get("af", {}):
228
+ for feature in features:
229
+ self.schema._aux["af"]["1"].remove(feature.uid)
230
+
231
+ def add(self, features: Feature | list[Feature]) -> None:
232
+ """Make one or multiple features optional by adding them to the set of optional features."""
233
+ self.schema._aux = self.schema._aux or {}
234
+ if not isinstance(features, list):
235
+ features = [features]
236
+ if not all(isinstance(f, Feature) for f in features):
237
+ raise TypeError("features must be a list of Feature records!")
238
+ if len(features) > 0:
239
+ if "1" not in self.schema._aux.setdefault("af", {}):
240
+ self.set(features)
241
+ else:
242
+ self.schema._aux.setdefault("af", {})["1"].extend(
243
+ [f.uid for f in features]
244
+ )
245
+
246
+
247
+ KNOWN_SCHEMAS = {
248
+ "kMi7B_N88uu-YnbTLDU-DA": "0000000000000000", # valid_features
249
+ "1gocc_TJ1RU2bMwDRK-WUA": "0000000000000001", # valid_ensembl_gene_ids
250
+ "GTxxM36n9tocphLfdbNt9g": "0000000000000002", # anndata_ensembl_gene_ids_and_valid_features_in_obs
251
+ }
252
+
253
+
254
+ class Schema(Record, CanCurate, TracksRun):
255
+ """Schemas of a dataset such as the set of columns of a `DataFrame`.
256
+
257
+ Composite schemas can have multiple slots, e.g., for an `AnnData`, one schema for slot `obs` and another one for `var`.
258
+
259
+ Args:
260
+ features: `list[Record] | list[tuple[Feature, dict]] | None = None` Feature
261
+ records, e.g., `[Feature(...), Feature(...)]` or Features with their config, e.g., `[Feature(...).with_config(optional=True)]`.
262
+ index: `Feature | None = None` A :class:`~lamindb.Feature` record to validate an index of a `DataFrame` and therefore also, e.g., `AnnData` obs and var indices.
263
+ slots: `dict[str, Schema] | None = None` A dictionary mapping slot names to :class:`~lamindb.Schema` objects.
264
+ name: `str | None = None` Name of the Schema.
265
+ description: `str | None = None` Description of the Schema.
266
+ flexible: `bool | None = None` Whether to include any feature of the same `itype` in validation
267
+ and annotation. If no Features are passed, defaults to `True`, otherwise to `False`.
268
+ This means that if you explicitly pass Features, any additional Features will be disregarded during validation & annotation.
269
+ type: `Schema | None = None` Type of Schema to group measurements by.
270
+ Define types like `ln.Schema(name="ProteinPanel", is_type=True)`.
271
+ is_type: `bool = False` Whether the Schema is a Type.
272
+ itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
273
+ otype: `str | None = None` An object type to define the structure of a composite schema (e.g., DataFrame, AnnData).
274
+ dtype: `str | None = None` The simple type (e.g., "num", "float", "int").
275
+ Defaults to `None` for sets of :class:`~lamindb.Feature` records and to `"num"` (e.g., for sets of :class:`~bionty.Gene`) otherwise.
276
+ minimal_set: `bool = True` Whether all passed Features are required by default.
277
+ See :attr:`~lamindb.Schema.optionals` for more fine-grained control.
278
+ maximal_set: `bool = False` Whether additional Features are allowed.
279
+ ordered_set: `bool = False` Whether Features are required to be ordered.
280
+ coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
281
+ during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
133
282
 
134
283
  See Also:
135
- :meth:`~lamindb.Schema.from_values`
136
- Create from values.
137
- :meth:`~lamindb.Schema.from_df`
138
- Create from dataframe columns.
284
+ :meth:`~lamindb.Artifact.from_df`
285
+ Validate & annotate a `DataFrame` with a schema.
286
+ :meth:`~lamindb.Artifact.from_anndata`
287
+ Validate & annotate an `AnnData` with a schema.
288
+ :meth:`~lamindb.Artifact.from_mudata`
289
+ Validate & annotate a `MuData` with a schema.
290
+ :meth:`~lamindb.Artifact.from_spatialdata`
291
+ Validate & annotate a `SpatialData` with a schema.
139
292
 
140
293
  Examples:
141
294
 
142
- Create a schema (feature set) from df with types:
295
+ The typical way to create a schema::
143
296
 
144
- >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
145
- >>> schema = ln.Schema.from_df(df)
297
+ import lamindb as ln
298
+ import bionty as bt
299
+ import pandas as pd
146
300
 
147
- Create a schema (feature set) from features:
301
+ # a schema with a single required feature
302
+ schema = ln.Schema(
303
+ features=[
304
+ ln.Feature(name="required_feature", dtype=str).save(),
305
+ ],
306
+ ).save()
148
307
 
149
- >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
150
- >>> schema = ln.Schema(features)
308
+ # a schema that constrains feature identifiers to be valid Ensembl gene ids or feature names
309
+ schema = ln.Schema(itype=bt.Gene.ensembl_gene_id)
310
+ schema = ln.Schema(itype=ln.Feature) # is equivalent to itype=ln.Feature.name
311
+
312
+ # a schema that requires a single feature but also validates & annotates any additional features with valid feature names
313
+ schema = ln.Schema(
314
+ features=[
315
+ ln.Feature(name="required_feature", dtype=str).save(),
316
+ ],
317
+ itype=ln.Schema(itype=ln.Feature),
318
+ flexible=True,
319
+ ).save()
151
320
 
152
- Create a schema (feature set) from identifier values:
321
+ Passing options to the `Schema` constructor::
153
322
 
154
- >>> import bionty as bt
155
- >>> schema = ln.Schema.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()
323
+ # also validate the index
324
+ schema = ln.Schema(
325
+ features=[
326
+ ln.Feature(name="required_feature", dtype=str).save(),
327
+ ],
328
+ index=ln.Feature(name="sample", dtype=ln.ULabel).save(),
329
+ ).save()
330
+
331
+ # mark a single feature as optional and ignore other features of the same identifier type
332
+ schema = ln.Schema(
333
+ features=[
334
+ ln.Feature(name="required_feature", dtype=str).save(),
335
+ ln.Feature(name="feature2", dtype=int).save().with_config(optional=True),
336
+ ],
337
+ ).save()
338
+
339
+ Alternative constructors (:meth:`~lamindb.Schema.from_values`, :meth:`~lamindb.Schema.from_df`)::
340
+
341
+ # parse & validate identifier values
342
+ schema = ln.Schema.from_values(
343
+ adata.var["ensemble_id"],
344
+ field=bt.Gene.ensembl_gene_id,
345
+ organism="mouse",
346
+ ).save()
156
347
 
348
+ # from a dataframe
349
+ df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
350
+ schema = ln.Schema.from_df(df)
157
351
  """
158
352
 
159
353
  class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
@@ -162,34 +356,24 @@ class Schema(Record, CanCurate, TracksRun):
162
356
  _name_field: str = "name"
163
357
  _aux_fields: dict[str, tuple[str, type]] = {
164
358
  "0": ("coerce_dtype", bool),
165
- "1": ("_index_feature_uid", str),
359
+ "1": ("optionals", list[str]),
360
+ "2": ("flexible", bool),
361
+ "3": ("index_feature_uid", str),
166
362
  }
167
363
 
168
364
  id: int = models.AutoField(primary_key=True)
169
365
  """Internal id, valid only in one DB instance."""
170
366
  uid: str = CharField(editable=False, unique=True, db_index=True, max_length=20)
171
- """A universal id (hash of the set of feature values)."""
367
+ """A universal id.
368
+
369
+ Before lamindb 1.5, it was 20 characters long. Since lamindb 1.5, it is 16 characters long.
370
+ """
172
371
  name: str | None = CharField(max_length=150, null=True, db_index=True)
173
372
  """A name."""
174
373
  description: str | None = CharField(null=True, db_index=True)
175
374
  """A description."""
176
- n = IntegerField()
177
- """Number of features in the set."""
178
- dtype: str | None = CharField(max_length=64, null=True, editable=False)
179
- """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
180
-
181
- For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
182
- """
183
- itype: str | None = CharField(
184
- max_length=120, db_index=True, null=True, editable=False
185
- )
186
- """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
187
-
188
- Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
189
-
190
- .. versionchanged:: 1.0.0
191
- Was called `registry` before.
192
- """
375
+ n: int = IntegerField()
376
+ """Number of features in the schema."""
193
377
  type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
194
378
  """Type of schema.
195
379
 
@@ -203,8 +387,20 @@ class Schema(Record, CanCurate, TracksRun):
203
387
  """Records of this type."""
204
388
  is_type: bool = BooleanField(default=False, db_index=True, null=True)
205
389
  """Distinguish types from instances of the type."""
390
+ itype: str | None = CharField(
391
+ max_length=120, db_index=True, null=True, editable=False
392
+ )
393
+ """A registry that stores feature identifier types used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
394
+
395
+ Depending on `itype`, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
396
+ """
206
397
  otype: str | None = CharField(max_length=64, db_index=True, null=True)
207
398
  """Default Python object type, e.g., DataFrame, AnnData."""
399
+ dtype: str | None = CharField(max_length=64, null=True, editable=False)
400
+ """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
401
+
402
+ For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
403
+ """
208
404
  hash: str | None = CharField(
209
405
  max_length=HASH_LENGTH, db_index=True, null=True, editable=False
210
406
  )
@@ -213,18 +409,19 @@ class Schema(Record, CanCurate, TracksRun):
213
409
  For a composite schema, the hash of hashes.
214
410
  """
215
411
  minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
216
- """Whether the schema contains a minimal set of linked features (default `True`).
412
+ """Whether all passed features are to be considered required by default (default `True`).
217
413
 
218
- If `False`, no features are linked to this schema.
219
-
220
- If `True`, features are linked and considered as a minimally required set in validation.
414
+ Note that features that are explicitly marked as `optional` via `feature.with_config(optional=True)`
415
+ are **not** required even if this `minimal_set` is true.
221
416
  """
222
417
  ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
223
418
  """Whether features are required to be ordered (default `False`)."""
224
419
  maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
225
- """If `False`, additional features are allowed (default `False`).
420
+ """Whether all features present in the dataset must be in the schema (default `False`).
421
+
422
+ If `False`, additional features are allowed to be present in the dataset.
226
423
 
227
- If `True`, the the minimal set is a maximal set and no additional features are allowed.
424
+ If `True`, no additional features are allowed to be present in the dataset.
228
425
  """
229
426
  components: Schema = ManyToManyField(
230
427
  "self", through="SchemaComponent", symmetrical=False, related_name="composites"
@@ -271,20 +468,22 @@ class Schema(Record, CanCurate, TracksRun):
271
468
  @overload
272
469
  def __init__(
273
470
  self,
274
- features: Iterable[Record] | None = None,
275
- components: dict[str, Schema] | None = None,
471
+ features: list[Record] | list[tuple[Feature, dict]] | None = None,
472
+ index: Feature | None = None,
473
+ slots: dict[str, Schema] | None = None,
276
474
  name: str | None = None,
277
475
  description: str | None = None,
278
- dtype: str | None = None,
279
476
  itype: str | Registry | FieldAttr | None = None,
477
+ flexible: bool | None = None,
280
478
  type: Schema | None = None,
281
479
  is_type: bool = False,
282
480
  otype: str | None = None,
283
- minimal_set: bool = True,
481
+ dtype: str | Type[int | float | str] | None = None, # noqa
284
482
  ordered_set: bool = False,
483
+ minimal_set: bool = True,
285
484
  maximal_set: bool = False,
286
- slot: str | None = None,
287
485
  coerce_dtype: bool = False,
486
+ n: int | None = None,
288
487
  ): ...
289
488
 
290
489
  @overload
@@ -304,50 +503,152 @@ class Schema(Record, CanCurate, TracksRun):
304
503
  if len(args) > 1:
305
504
  raise ValueError("Only one non-keyword arg allowed: features")
306
505
 
307
- features: Iterable[Record] | None = (
308
- args[0] if args else kwargs.pop("features", [])
309
- )
310
- # typing here anticipates transitioning to a ManyToMany
311
- # between composites and components similar to feature_sets
312
- # in lamindb v2
313
- components: dict[str, Schema] = kwargs.pop("components", {})
506
+ features: list[Record] | None = args[0] if args else kwargs.pop("features", [])
507
+ index: Feature | None = kwargs.pop("index", None)
508
+ slots: dict[str, Schema] = kwargs.pop("slots", {})
314
509
  name: str | None = kwargs.pop("name", None)
315
510
  description: str | None = kwargs.pop("description", None)
316
- dtype: str | None = kwargs.pop("dtype", None)
317
511
  itype: str | Record | DeferredAttribute | None = kwargs.pop("itype", None)
512
+ flexible: bool | None = kwargs.pop("flexible", None)
318
513
  type: Feature | None = kwargs.pop("type", None)
319
514
  is_type: bool = kwargs.pop("is_type", False)
320
515
  otype: str | None = kwargs.pop("otype", None)
516
+ dtype: str | None = kwargs.pop("dtype", None)
321
517
  minimal_set: bool = kwargs.pop("minimal_set", True)
322
518
  ordered_set: bool = kwargs.pop("ordered_set", False)
323
519
  maximal_set: bool = kwargs.pop("maximal_set", False)
324
- slot: str | None = kwargs.pop("slot", None)
325
- coerce_dtype: bool | None = kwargs.pop("coerce_dtype", None)
326
-
520
+ coerce_dtype: bool | None = kwargs.pop("coerce_dtype", False)
521
+ using: bool | None = kwargs.pop("using", None)
522
+ n_features: int | None = kwargs.pop("n", None)
523
+ # backward compat
524
+ if not slots:
525
+ if "components" in kwargs:
526
+ logger.warning(
527
+ "`components` as a keyword argument is deprecated, please use `slots` instead"
528
+ )
529
+ slots = kwargs.pop("components")
327
530
  if kwargs:
328
- raise ValueError(
329
- f"Unexpected keyword arguments: {', '.join(kwargs.keys())}\n"
330
- "Valid arguments are: features, description, dtype, itype, type, "
331
- "is_type, otype, minimal_set, ordered_set, maximal_set, "
332
- "slot, validated_by, coerce_dtype"
531
+ valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Schema)])
532
+ raise FieldValidationError(
533
+ f"Only {valid_keywords} are valid keyword arguments"
333
534
  )
535
+ (
536
+ features,
537
+ validated_kwargs,
538
+ optional_features,
539
+ features_registry,
540
+ flexible,
541
+ list_for_hashing,
542
+ ) = self._validate_kwargs_calculate_hash(
543
+ features=features,
544
+ index=index,
545
+ slots=slots,
546
+ name=name,
547
+ description=description,
548
+ itype=itype,
549
+ flexible=flexible,
550
+ type=type,
551
+ is_type=is_type,
552
+ otype=otype,
553
+ dtype=dtype,
554
+ minimal_set=minimal_set,
555
+ ordered_set=ordered_set,
556
+ maximal_set=maximal_set,
557
+ coerce_dtype=coerce_dtype,
558
+ n_features=n_features,
559
+ )
560
+ schema = (
561
+ Schema.objects.using(using)
562
+ .filter(hash=validated_kwargs["hash"])
563
+ .one_or_none()
564
+ )
565
+ self._list_for_hashing = list_for_hashing
566
+ if schema is not None:
567
+ logger.important(f"returning existing schema with same hash: {schema}")
568
+ init_self_from_db(self, schema)
569
+ update_attributes(self, validated_kwargs)
570
+ self.optionals.set(optional_features)
571
+ return None
572
+ self._slots: dict[str, Schema] = {}
573
+ if features:
574
+ self._features = (get_related_name(features_registry), features) # type: ignore
575
+ elif slots:
576
+ for slot_key, component in slots.items():
577
+ if component._state.adding:
578
+ raise InvalidArgument(
579
+ f"schema for {slot_key} {component} must be saved before use"
580
+ )
581
+ self._slots = slots
582
+ if validated_kwargs["hash"] in KNOWN_SCHEMAS:
583
+ validated_kwargs["uid"] = KNOWN_SCHEMAS[validated_kwargs["hash"]]
584
+ else:
585
+ validated_kwargs["uid"] = ids.base62_16()
586
+ super().__init__(**validated_kwargs)
587
+ # manipulating aux fields is easier after calling super().__init__()
588
+ self.optionals.set(optional_features)
589
+ self.flexible = flexible
590
+ if index is not None:
591
+ self._index_feature_uid = index.uid
334
592
 
593
+ def _validate_kwargs_calculate_hash(
594
+ self,
595
+ features: list[Record],
596
+ index: Feature | None,
597
+ slots: dict[str, Schema],
598
+ name: str | None,
599
+ description: str | None,
600
+ itype: str | Record | DeferredAttribute | None,
601
+ flexible: bool | None,
602
+ type: Feature | None,
603
+ is_type: bool,
604
+ otype: str | None,
605
+ dtype: str | None,
606
+ minimal_set: bool,
607
+ ordered_set: bool,
608
+ maximal_set: bool,
609
+ coerce_dtype: bool,
610
+ n_features: int | None,
611
+ optional_features_manual: list[Feature] | None = None,
612
+ ) -> tuple[list[Feature], dict[str, Any], list[Feature], Registry, bool, list[str]]:
613
+ optional_features = []
614
+ features_registry: Registry = None
615
+ if itype is not None:
616
+ if itype != "Composite":
617
+ itype = serialize_dtype(itype, is_itype=True)
618
+ if index is not None:
619
+ if not isinstance(index, Feature):
620
+ raise TypeError("index must be a Feature")
621
+ features.insert(0, index)
335
622
  if features:
623
+ features, configs = get_features_config(features)
336
624
  features_registry = validate_features(features)
337
625
  itype_compare = features_registry.__get_name_with_module__()
338
626
  if itype is not None:
339
- assert itype == itype_compare, str(itype_compare) # noqa: S101
627
+ assert itype.startswith(itype_compare), str(itype_compare) # noqa: S101
340
628
  else:
341
629
  itype = itype_compare
630
+ if n_features is not None:
631
+ if n_features != len(features):
632
+ logger.important(f"updating to n {len(features)} features")
342
633
  n_features = len(features)
343
- else:
634
+ if features_registry == Feature:
635
+ optional_features = [
636
+ config[0] for config in configs if config[1].get("optional")
637
+ ]
638
+ if optional_features:
639
+ assert optional_features_manual is None # noqa: S101
640
+ if not optional_features and optional_features_manual is not None:
641
+ optional_features = optional_features_manual
642
+ elif n_features is None:
344
643
  n_features = -1
345
644
  if dtype is None:
346
645
  dtype = None if itype is not None and itype == "Feature" else NUMBER_TYPE
347
646
  else:
348
647
  dtype = get_type_str(dtype)
349
- components: dict[str, Schema]
350
- if components:
648
+ flexible_default = n_features < 0
649
+ if flexible is None:
650
+ flexible = flexible_default
651
+ if slots:
351
652
  itype = "Composite"
352
653
  if otype is None:
353
654
  raise InvalidArgument("Please pass otype != None for composite schemas")
@@ -359,8 +660,8 @@ class Schema(Record, CanCurate, TracksRun):
359
660
  "name": name,
360
661
  "description": description,
361
662
  "type": type,
362
- "dtype": dtype,
363
663
  "is_type": is_type,
664
+ "dtype": dtype,
364
665
  "otype": otype,
365
666
  "n": n_features,
366
667
  "itype": itype_str,
@@ -368,35 +669,68 @@ class Schema(Record, CanCurate, TracksRun):
368
669
  "ordered_set": ordered_set,
369
670
  "maximal_set": maximal_set,
370
671
  }
672
+ n_features_default = -1
673
+ coerce_dtype_default = False
371
674
  if coerce_dtype:
372
675
  validated_kwargs["_aux"] = {"af": {"0": coerce_dtype}}
373
- if features:
374
- hash = hash_set({feature.uid for feature in features})
375
- elif components:
376
- hash = hash_set({component.hash for component in components.values()})
676
+ if slots:
677
+ list_for_hashing = [component.hash for component in slots.values()]
377
678
  else:
378
- hash = hash_set({str(value) for value in validated_kwargs.values()})
379
- validated_kwargs["hash"] = hash
380
- validated_kwargs["slot"] = slot
381
- schema = Schema.filter(hash=hash).one_or_none()
382
- if schema is not None:
383
- logger.important(f"returning existing schema with same hash: {schema}")
384
- init_self_from_db(self, schema)
385
- update_attributes(self, validated_kwargs)
386
- return None
387
- self._components: dict[str, Schema] = {}
388
- if features:
389
- self._features = (get_related_name(features_registry), features) # type: ignore
390
- elif components:
391
- for slot, component in components.items():
392
- if component._state.adding:
393
- raise InvalidArgument(
394
- f"component {slot} {component} must be saved before use"
679
+ HASH_CODE = {
680
+ "dtype": "a",
681
+ "itype": "b",
682
+ "minimal_set": "c",
683
+ "ordered_set": "d",
684
+ "maximal_set": "e",
685
+ "flexible": "f",
686
+ "coerce_dtype": "g",
687
+ "n": "h",
688
+ "optional": "i",
689
+ "features_hash": "j",
690
+ }
691
+ # we do not want purely informational annotations like name, type, is_type, otype to be part of the hash
692
+ hash_args = ["dtype", "itype", "minimal_set", "ordered_set", "maximal_set"]
693
+ list_for_hashing = [
694
+ f"{HASH_CODE[arg]}={validated_kwargs[arg]}"
695
+ for arg in hash_args
696
+ if validated_kwargs[arg] is not None
697
+ ]
698
+ # only include in hash if not default so that it's backward compatible with records for which flexible was never set
699
+ if flexible != flexible_default:
700
+ list_for_hashing.append(f"{HASH_CODE['flexible']}={flexible}")
701
+ if coerce_dtype != coerce_dtype_default:
702
+ list_for_hashing.append(f"{HASH_CODE['coerce_dtype']}={coerce_dtype}")
703
+ if n_features != n_features_default:
704
+ list_for_hashing.append(f"{HASH_CODE['n']}={n_features}")
705
+ if features:
706
+ if optional_features:
707
+ feature_list_for_hashing = [
708
+ feature.uid
709
+ if feature not in set(optional_features)
710
+ else f"{feature.uid}({HASH_CODE['optional']})"
711
+ for feature in features
712
+ ]
713
+ else:
714
+ feature_list_for_hashing = [feature.uid for feature in features]
715
+ # order matters if ordered_set is True
716
+ if ordered_set:
717
+ features_hash = hash_string(":".join(feature_list_for_hashing))
718
+ else:
719
+ features_hash = hash_string(
720
+ ":".join(sorted(feature_list_for_hashing))
395
721
  )
396
- self._components = components
397
- self._slots = components
398
- validated_kwargs["uid"] = ids.base62_20()
399
- super().__init__(**validated_kwargs)
722
+ list_for_hashing.append(f"{HASH_CODE['features_hash']}={features_hash}")
723
+ self._list_for_hashing = sorted(list_for_hashing)
724
+ schema_hash = hash_string(":".join(self._list_for_hashing))
725
+ validated_kwargs["hash"] = schema_hash
726
+ return (
727
+ features,
728
+ validated_kwargs,
729
+ optional_features,
730
+ features_registry,
731
+ flexible,
732
+ list_for_hashing,
733
+ )
400
734
 
401
735
  @classmethod
402
736
  def from_values( # type: ignore
@@ -426,13 +760,18 @@ class Schema(Record, CanCurate, TracksRun):
426
760
  Raises:
427
761
  ValidationError: If some values are not valid.
428
762
 
429
- Examples:
763
+ Example:
764
+
765
+ ::
430
766
 
431
- >>> features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
432
- >>> schema = ln.Schema.from_values(features)
767
+ import lamindb as ln
768
+ import bionty as bt
433
769
 
434
- >>> genes = ["ENSG00000139618", "ENSG00000198786"]
435
- >>> schema = ln.Schema.from_values(features, bt.Gene.ensembl_gene_id, "float")
770
+ features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
771
+ schema = ln.Schema.from_values(features)
772
+
773
+ genes = ["ENSG00000139618", "ENSG00000198786"]
774
+ schema = ln.Schema.from_values(genes, bt.Gene.ensembl_gene_id, "float")
436
775
  """
437
776
  if not isinstance(field, FieldAttr):
438
777
  raise TypeError(
@@ -496,7 +835,7 @@ class Schema(Record, CanCurate, TracksRun):
496
835
  df.columns, field=field, organism=organism
497
836
  )
498
837
  schema = Schema(
499
- validated_features, name=name, dtype=None, otype="DataFrame"
838
+ list(validated_features), name=name, dtype=None, otype="DataFrame"
500
839
  )
501
840
  else:
502
841
  dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
@@ -510,10 +849,9 @@ class Schema(Record, CanCurate, TracksRun):
510
849
  source=source,
511
850
  )
512
851
  schema = Schema(
513
- features=validated_features,
852
+ features=list(validated_features),
514
853
  name=name,
515
854
  dtype=get_type_str(dtype),
516
- otype="DataFrame",
517
855
  )
518
856
  return schema
519
857
 
@@ -521,12 +859,50 @@ class Schema(Record, CanCurate, TracksRun):
521
859
  """Save."""
522
860
  from .save import bulk_create
523
861
 
862
+ if not self._state.adding:
863
+ features = (
864
+ self._features[1]
865
+ if hasattr(self, "_features")
866
+ else (self.members.list() if self.members.exists() else [])
867
+ )
868
+ _, validated_kwargs, _, _, _, list_for_hashing = (
869
+ self._validate_kwargs_calculate_hash(
870
+ features=features, # type: ignore
871
+ index=None, # need to pass None here as otherwise the index feature would be counted twice
872
+ slots=self._slots if hasattr(self, "_slots") else self.slots,
873
+ name=self.name,
874
+ description=self.description,
875
+ itype=self.itype,
876
+ flexible=self.flexible,
877
+ type=self.type,
878
+ is_type=self.is_type,
879
+ otype=self.otype,
880
+ dtype=self.dtype,
881
+ minimal_set=self.minimal_set,
882
+ ordered_set=self.ordered_set,
883
+ maximal_set=self.maximal_set,
884
+ coerce_dtype=self.coerce_dtype,
885
+ n_features=self.n,
886
+ optional_features_manual=self.optionals.get(),
887
+ )
888
+ )
889
+ if validated_kwargs["hash"] != self.hash:
890
+ from .artifact import Artifact
891
+
892
+ datasets = Artifact.filter(schema=self).all()
893
+ if datasets.exists():
894
+ logger.warning(
895
+ f"you updated the schema hash and might invalidate datasets that were previously validated with this schema: {datasets.list('uid')}"
896
+ )
897
+ self.hash = validated_kwargs["hash"]
898
+ self.n = validated_kwargs["n"]
899
+ self._list_for_hashing = list_for_hashing
524
900
  super().save(*args, **kwargs)
525
- if hasattr(self, "_components"):
901
+ if hasattr(self, "_slots"):
526
902
  # analogous to save_schema_links in core._data.py
527
903
  # which is called to save feature sets in artifact.save()
528
904
  links = []
529
- for slot, component in self._components.items():
905
+ for slot, component in self._slots.items():
530
906
  kwargs = {
531
907
  "composite_id": self.id,
532
908
  "component_id": component.id,
@@ -536,12 +912,15 @@ class Schema(Record, CanCurate, TracksRun):
536
912
  bulk_create(links, ignore_conflicts=True)
537
913
  if hasattr(self, "_features"):
538
914
  assert self.n > 0 # noqa: S101
915
+ using: bool | None = kwargs.pop("using", None)
539
916
  related_name, records = self._features
540
917
  # only the following method preserves the order
541
918
  # .set() does not preserve the order but orders by
542
919
  # the feature primary key
543
920
  through_model = getattr(self, related_name).through
544
- related_model_split = self.itype.split(".")
921
+ related_model_split = parse_cat_dtype(self.itype, is_itype=True)[
922
+ "registry_str"
923
+ ].split(".")
545
924
  if len(related_model_split) == 1:
546
925
  related_field = related_model_split[0].lower()
547
926
  else:
@@ -551,16 +930,23 @@ class Schema(Record, CanCurate, TracksRun):
551
930
  through_model(**{"schema_id": self.id, related_field_id: record.id})
552
931
  for record in records
553
932
  ]
554
- through_model.objects.bulk_create(links, ignore_conflicts=True)
933
+ through_model.objects.using(using).bulk_create(links, ignore_conflicts=True)
934
+ delattr(self, "_features")
555
935
  return self
556
936
 
557
937
  @property
558
938
  def members(self) -> QuerySet:
559
- """A queryset for the individual records of the set."""
939
+ """A queryset for the individual records in the feature set underlying the schema.
940
+
941
+ Unlike `schema.features`, `schema.genes`, `schema.proteins`, etc., this queryset is ordered and
942
+ doesn't require knowledge of the entity.
943
+ """
560
944
  if self._state.adding:
561
945
  # this should return a queryset and not a list...
562
946
  # need to fix this
563
947
  return self._features[1]
948
+ if self.itype == "Composite":
949
+ return Feature.objects.none()
564
950
  related_name = self._get_related_name()
565
951
  if related_name is None:
566
952
  related_name = "features"
@@ -579,62 +965,108 @@ class Schema(Record, CanCurate, TracksRun):
579
965
 
580
966
  @coerce_dtype.setter
581
967
  def coerce_dtype(self, value: bool) -> None:
582
- if self._aux is None: # type: ignore
583
- self._aux = {} # type: ignore
584
- if "af" not in self._aux:
585
- self._aux["af"] = {}
586
- self._aux["af"]["0"] = value
587
-
588
- # @property
589
- # def index_feature(self) -> None | Feature:
590
- # # index_feature: `Record | None = None` A :class:`~lamindb.Feature` to validate the index of a `DataFrame`.
591
- # """The uid of the index feature, if `index_feature` was set."""
592
- # if self._index_feature_uid is None:
593
- # return None
594
- # else:
595
- # return self.features.get(uid=self._index_feature_uid)
596
-
597
- # @property
598
- # def _index_feature_uid(self) -> None | str:
599
- # """The uid of the index feature, if `index_feature` was set."""
600
- # if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
601
- # return self._aux["af"]["1"]
602
- # else:
603
- # return None
604
-
605
- # @_index_feature_uid.setter
606
- # def _index_feature_uid(self, value: str) -> None:
607
- # if self._aux is None:
608
- # self._aux = {}
609
- # if "af" not in self._aux:
610
- # self._aux["af"] = {}
611
- # self._aux["af"]["1"] = value
968
+ self._aux = self._aux or {}
969
+ self._aux.setdefault("af", {})["0"] = value
970
+
971
+ @property
972
+ def flexible(self) -> bool:
973
+ """Indicates how to handle validation and annotation in case features are not defined.
974
+
975
+ Examples:
976
+
977
+ Make a rigid schema flexible::
978
+
979
+ schema = ln.Schema.get(name="my_schema")
980
+ schema.flexible = True
981
+ schema.save()
982
+
983
+ During schema creation::
984
+
985
+ # if you're not passing features but just defining the itype, defaults to flexible = True
986
+ schema = ln.Schema(itype=ln.Feature).save()
987
+ assert schema.flexible
988
+
989
+ # if you're passing features, defaults to flexible = False
990
+ schema = ln.Schema(
991
+ features=[ln.Feature(name="my_required_feature", dtype=int).save()],
992
+ )
993
+ assert not schema.flexible
994
+
995
+ # you can also validate & annotate features in addition to those that you're explicitly defining:
996
+ schema = ln.Schema(
997
+ features=[ln.Feature(name="my_required_feature", dtype=int).save()],
998
+ flexible=True,
999
+ )
1000
+ assert schema.flexible
1001
+
1002
+ """
1003
+ if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]: # type: ignore
1004
+ return self._aux["af"]["2"] # type: ignore
1005
+ else:
1006
+ return (
1007
+ self.n < 0
1008
+ ) # is the flexible default, needed for backward compat if flexible was never set
1009
+
1010
+ @flexible.setter
1011
+ def flexible(self, value: bool) -> None:
1012
+ self._aux = self._aux or {}
1013
+ self._aux.setdefault("af", {})["2"] = value
612
1014
 
613
1015
  @property
614
- @deprecated("itype")
615
- def registry(self) -> str:
616
- return self.itype
1016
+ def index(self) -> None | Feature:
1017
+ """The feature configured to act as index.
617
1018
 
618
- @registry.setter
619
- def registry(self, value) -> None:
620
- self.itype = value
1019
+ To unset it, set `schema.index` to `None`.
1020
+ """
1021
+ if self._index_feature_uid is None:
1022
+ return None
1023
+ else:
1024
+ return self.features.get(uid=self._index_feature_uid)
1025
+
1026
+ @index.setter
1027
+ def index(self, value: None | Feature) -> None:
1028
+ if value is None:
1029
+ current_index = self.index
1030
+ self.features.remove(current_index)
1031
+ self._index_feature_uid = value
1032
+ else:
1033
+ self.features.add(value)
1034
+ self._index_feature_uid = value.uid
1035
+
1036
+ @property
1037
+ def _index_feature_uid(self) -> None | str:
1038
+ """The uid of the index feature."""
1039
+ if self._aux is not None and "af" in self._aux and "3" in self._aux["af"]:
1040
+ return self._aux["af"]["3"]
1041
+ else:
1042
+ return None
1043
+
1044
+ @_index_feature_uid.setter
1045
+ def _index_feature_uid(self, value: str | None) -> None:
1046
+ self._aux = self._aux or {}
1047
+ if value is None:
1048
+ self._aux.get("af", {}).pop("3")
1049
+ else:
1050
+ self._aux.setdefault("af", {})["3"] = value
621
1051
 
622
1052
  @property
623
1053
  def slots(self) -> dict[str, Schema]:
624
1054
  """Slots.
625
1055
 
626
- Examples::
1056
+ Examples:
627
1057
 
628
- # define composite schema
629
- anndata_schema = ln.Schema(
630
- name="small_dataset1_anndata_schema",
631
- otype="AnnData",
632
- components={"obs": obs_schema, "var": var_schema},
633
- ).save()
1058
+ ::
1059
+
1060
+ # define composite schema
1061
+ anndata_schema = ln.Schema(
1062
+ name="small_dataset1_anndata_schema",
1063
+ otype="AnnData",
1064
+ slots={"obs": obs_schema, "var": var_schema},
1065
+ ).save()
634
1066
 
635
- # access slots
636
- anndata_schema.slots
637
- # {'obs': <Schema: obs_schema>, 'var': <Schema: var_schema>}
1067
+ # access slots
1068
+ anndata_schema.slots
1069
+ # {'obs': <Schema: obs_schema>, 'var': <Schema: var_schema>}
638
1070
  """
639
1071
  if hasattr(self, "_slots"):
640
1072
  return self._slots
@@ -646,6 +1078,44 @@ class Schema(Record, CanCurate, TracksRun):
646
1078
  return self._slots
647
1079
  return {}
648
1080
 
1081
+ @property
1082
+ def optionals(self) -> SchemaOptionals:
1083
+ """Manage optional features.
1084
+
1085
+ Example:
1086
+
1087
+ ::
1088
+
1089
+ # a schema with optional "sample_name"
1090
+ schema_optional_sample_name = ln.Schema(
1091
+ features=[
1092
+ ln.Feature(name="sample_id", dtype=str).save(), # required
1093
+ ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True), # optional
1094
+ ],
1095
+ ).save()
1096
+
1097
+ # raises ValidationError because the required column `sample_id` is missing
1098
+ ln.curators.DataFrameCurator(
1099
+ pd.DataFrame(
1100
+ {
1101
+ "sample_name": ["Sample 1", "Sample 2"],
1102
+ }
1103
+ ),
1104
+ schema=schema_optional_sample_name,
1105
+ ).validate()
1106
+
1107
+ # passes because an optional column is missing
1108
+ ln.curators.DataFrameCurator(
1109
+ pd.DataFrame(
1110
+ {
1111
+ "sample_id": ["sample1", "sample2"],
1112
+ }
1113
+ ),
1114
+ schema=schema_optional_sample_name,
1115
+ ).validate()
1116
+ """
1117
+ return SchemaOptionals(self)
1118
+
649
1119
  def describe(self, return_str=False) -> None | str:
650
1120
  """Describe schema."""
651
1121
  message = str(self)
@@ -654,6 +1124,11 @@ class Schema(Record, CanCurate, TracksRun):
654
1124
  message + "\nslots:"
655
1125
  for slot, schema in self.slots.items():
656
1126
  message += f"\n {slot}: " + str(schema)
1127
+ else:
1128
+ tree = describe_schema(self)
1129
+ return format_rich_tree(
1130
+ tree, fallback="no linked features", return_str=return_str
1131
+ )
657
1132
  if return_str:
658
1133
  return message
659
1134
  else:
@@ -671,7 +1146,9 @@ def get_type_str(dtype: str | None) -> str | None:
671
1146
 
672
1147
  def _get_related_name(self: Schema) -> str:
673
1148
  related_models = dict_related_model_to_related_name(self, instance=self._state.db)
674
- related_name = related_models.get(self.itype)
1149
+ related_name = related_models.get(
1150
+ parse_cat_dtype(self.itype, is_itype=True)["registry_str"]
1151
+ )
675
1152
  return related_name
676
1153
 
677
1154