lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +203 -102
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/_tiledbsoma.py +29 -13
  20. lamindb/core/storage/objects.py +6 -0
  21. lamindb/core/subsettings/__init__.py +2 -0
  22. lamindb/core/subsettings/_annotation_settings.py +11 -0
  23. lamindb/curators/__init__.py +7 -3349
  24. lamindb/curators/_legacy.py +2056 -0
  25. lamindb/curators/core.py +1534 -0
  26. lamindb/errors.py +11 -0
  27. lamindb/examples/__init__.py +27 -0
  28. lamindb/examples/schemas/__init__.py +12 -0
  29. lamindb/examples/schemas/_anndata.py +25 -0
  30. lamindb/examples/schemas/_simple.py +19 -0
  31. lamindb/integrations/_vitessce.py +8 -5
  32. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  33. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  34. lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
  35. lamindb/models/__init__.py +4 -1
  36. lamindb/models/_describe.py +21 -4
  37. lamindb/models/_feature_manager.py +382 -287
  38. lamindb/models/_label_manager.py +8 -2
  39. lamindb/models/artifact.py +177 -106
  40. lamindb/models/artifact_set.py +122 -0
  41. lamindb/models/collection.py +73 -52
  42. lamindb/models/core.py +1 -1
  43. lamindb/models/feature.py +51 -17
  44. lamindb/models/has_parents.py +69 -14
  45. lamindb/models/project.py +1 -1
  46. lamindb/models/query_manager.py +221 -22
  47. lamindb/models/query_set.py +247 -172
  48. lamindb/models/record.py +65 -247
  49. lamindb/models/run.py +4 -4
  50. lamindb/models/save.py +8 -2
  51. lamindb/models/schema.py +456 -184
  52. lamindb/models/transform.py +2 -2
  53. lamindb/models/ulabel.py +8 -5
  54. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
  55. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
  56. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
  57. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
lamindb/models/schema.py CHANGED
@@ -1,12 +1,15 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING, Any, overload
3
+ from typing import TYPE_CHECKING, Any, Type, overload
4
4
 
5
5
  import numpy as np
6
6
  from django.db import models
7
7
  from django.db.models import CASCADE, PROTECT, ManyToManyField
8
8
  from lamin_utils import logger
9
- from lamindb_setup.core.hashing import HASH_LENGTH, hash_set
9
+ from lamindb_setup.core.hashing import HASH_LENGTH, hash_string
10
+ from rich.table import Table
11
+ from rich.text import Text
12
+ from rich.tree import Tree
10
13
 
11
14
  from lamindb.base import ids
12
15
  from lamindb.base.fields import (
@@ -17,10 +20,11 @@ from lamindb.base.fields import (
17
20
  JSONField,
18
21
  )
19
22
  from lamindb.base.types import FieldAttr, ListLike
20
- from lamindb.errors import InvalidArgument
23
+ from lamindb.errors import FieldValidationError, InvalidArgument
24
+ from lamindb.models.feature import parse_cat_dtype
21
25
 
22
- from ..base import deprecated
23
26
  from ..errors import ValidationError
27
+ from ._describe import format_rich_tree, highlight_time
24
28
  from ._relations import (
25
29
  dict_related_model_to_related_name,
26
30
  get_related_name,
@@ -36,14 +40,13 @@ from .record import (
36
40
  LinkORM,
37
41
  Record,
38
42
  Registry,
43
+ _get_record_kwargs,
39
44
  init_self_from_db,
40
45
  update_attributes,
41
46
  )
42
47
  from .run import Param, TracksRun, TracksUpdates
43
48
 
44
49
  if TYPE_CHECKING:
45
- from collections.abc import Iterable
46
-
47
50
  import pandas as pd
48
51
  from django.db.models.query_utils import DeferredAttribute
49
52
 
@@ -83,14 +86,14 @@ def validate_features(features: list[Record]) -> Record:
83
86
  def get_features_config(
84
87
  features: list[Record] | tuple[Record, dict],
85
88
  ) -> tuple[list[Record], list[tuple[Record, dict]]]:
86
- """Get features and their config from the return of feature.with_config."""
89
+ """Get features and their config from the return of feature.with_config()."""
87
90
  features_list = []
88
91
  configs = []
89
92
  try:
90
93
  for feature in features:
91
94
  if isinstance(feature, tuple):
92
95
  features_list.append(feature[0])
93
- configs.append(feature)
96
+ configs.append(feature) # store the tuple in configs
94
97
  else:
95
98
  features_list.append(feature)
96
99
  return features_list, configs # type: ignore
@@ -98,6 +101,83 @@ def get_features_config(
98
101
  return features, configs # type: ignore
99
102
 
100
103
 
104
+ def describe_schema(self: Schema) -> Tree:
105
+ """Create a rich tree visualization of a Schema with its features."""
106
+ otype = self.otype if hasattr(self, "otype") and self.otype else ""
107
+ tree = Tree(
108
+ Text.assemble((self.__class__.__name__, "bold"), (f" {otype}", "bold dim")),
109
+ guide_style="dim", # dim the connecting lines
110
+ )
111
+
112
+ tree.add(f".uid = '{self.uid}'")
113
+ tree.add(f".name = '{self.name}'")
114
+ if self.description:
115
+ tree.add(f".description = '{self.description}'")
116
+ if self.itype:
117
+ tree.add(f".itype = '{self.itype}'")
118
+ if self.type:
119
+ tree.add(f".type = '{self.type}'")
120
+ tree.add(f".ordered_set = {self.ordered_set}")
121
+ tree.add(f".maximal_set = {self.maximal_set}")
122
+ if hasattr(self, "created_by") and self.created_by:
123
+ tree.add(
124
+ Text.assemble(
125
+ ".created_by = ",
126
+ (
127
+ self.created_by.handle
128
+ if self.created_by.name is None
129
+ else f"{self.created_by.handle} ({self.created_by.name})"
130
+ ),
131
+ )
132
+ )
133
+ if hasattr(self, "created_at") and self.created_at:
134
+ tree.add(Text.assemble(".created_at = ", highlight_time(str(self.created_at))))
135
+
136
+ members = self.members
137
+
138
+ # Add features section
139
+ features = tree.add(
140
+ Text.assemble(
141
+ (self.itype, "violet"),
142
+ (" • ", "dim"),
143
+ (str(members.count()), "pink1"),
144
+ )
145
+ )
146
+
147
+ if hasattr(self, "members") and self.members.count() > 0:
148
+ # create a table for the features
149
+ feature_table = Table(
150
+ show_header=True, header_style="dim", box=None, pad_edge=False
151
+ )
152
+
153
+ # Add columns
154
+ feature_table.add_column("name", style="", no_wrap=True)
155
+ feature_table.add_column("dtype", style="", no_wrap=True)
156
+ feature_table.add_column("optional", style="", no_wrap=True)
157
+ feature_table.add_column("nullable", style="", no_wrap=True)
158
+ feature_table.add_column("coerce_dtype", style="", no_wrap=True)
159
+ feature_table.add_column("default_value", style="", no_wrap=True)
160
+
161
+ # Add rows for each member
162
+ optionals = self.optionals.get()
163
+ for member in self.members:
164
+ feature_table.add_row(
165
+ member.name,
166
+ Text(
167
+ str(member.dtype)
168
+ ), # needs to be wrapped in Text to display correctly
169
+ "✓" if optionals.filter(uid=member.uid).exists() else "✗",
170
+ "✓" if member.nullable else "✗",
171
+ "✓" if member.coerce_dtype else "✗",
172
+ str(member.default_value) if member.default_value else "unset",
173
+ )
174
+
175
+ # Add the table to the features branch
176
+ features.add(feature_table)
177
+
178
+ return tree
179
+
180
+
101
181
  class SchemaOptionals:
102
182
  """Manage and access optional features in a schema."""
103
183
 
@@ -127,7 +207,7 @@ class SchemaOptionals:
127
207
  return Feature.objects.none() # empty QuerySet
128
208
 
129
209
  def set(self, features: list[Feature]) -> None:
130
- """Set the optional features."""
210
+ """Set the optional features (overwrites whichever schemas are currently optional)."""
131
211
  if not isinstance(features, list) or not all(
132
212
  isinstance(f, Feature) for f in features
133
213
  ):
@@ -136,8 +216,20 @@ class SchemaOptionals:
136
216
  if len(features) > 0:
137
217
  self.schema._aux.setdefault("af", {})["1"] = [f.uid for f in features]
138
218
 
219
+ def remove(self, features: Feature | list[Feature]) -> None:
220
+ """Make one or multiple features required by removing them from the set of optional features."""
221
+ if not isinstance(features, list):
222
+ features = [features]
223
+ if not all(isinstance(f, Feature) for f in features):
224
+ raise TypeError("features must be a list of Feature records!")
225
+ if len(features) > 0:
226
+ self.schema._aux = self.schema._aux or {}
227
+ if "1" in self.schema._aux.get("af", {}):
228
+ for feature in features:
229
+ self.schema._aux["af"]["1"].remove(feature.uid)
230
+
139
231
  def add(self, features: Feature | list[Feature]) -> None:
140
- """Add feature to the optional features."""
232
+ """Make one or multiple features optional by adding them to the set of optional features."""
141
233
  self.schema._aux = self.schema._aux or {}
142
234
  if not isinstance(features, list):
143
235
  features = [features]
@@ -146,108 +238,116 @@ class SchemaOptionals:
146
238
  if len(features) > 0:
147
239
  if "1" not in self.schema._aux.setdefault("af", {}):
148
240
  self.set(features)
149
- self.schema._aux.setdefault("af", {})["1"].extend([f.uid for f in features])
241
+ else:
242
+ self.schema._aux.setdefault("af", {})["1"].extend(
243
+ [f.uid for f in features]
244
+ )
150
245
 
151
246
 
152
- class Schema(Record, CanCurate, TracksRun):
153
- """Schemas.
247
+ KNOWN_SCHEMAS = {
248
+ "kMi7B_N88uu-YnbTLDU-DA": "0000000000000000", # valid_features
249
+ "1gocc_TJ1RU2bMwDRK-WUA": "0000000000000001", # valid_ensembl_gene_ids
250
+ "GTxxM36n9tocphLfdbNt9g": "0000000000000002", # anndata_ensembl_gene_ids_and_valid_features_in_obs
251
+ }
154
252
 
155
- A simple schema is a feature set such as the set of columns of a `DataFrame`.
156
253
 
157
- A composite schema has multiple components, e.g., for an `AnnData`, one schema for `obs` and another one for `var`.
254
+ class Schema(Record, CanCurate, TracksRun):
255
+ """Schemas of a dataset such as the set of columns of a `DataFrame`.
158
256
 
159
- A schema can also merely define abstract constraints or instructions for dataset validation & annotation.
257
+ Composite schemas can have multiple slots, e.g., for an `AnnData`, one schema for slot `obs` and another one for `var`.
160
258
 
161
259
  Args:
162
- features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
163
- records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
164
- a set upon instantiation. If you'd like to pass values, use
165
- :meth:`~lamindb.Schema.from_values` or
166
- :meth:`~lamindb.Schema.from_df`.
167
- components: `dict[str, Schema] | None = None` A dictionary mapping slot names to
168
- components. A component is itself a :class:`~lamindb.Schema` object.
169
- name: `str | None = None` A name.
170
- description: `str | None = None` A description.
171
- itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
260
+ features: `list[Record] | list[tuple[Feature, dict]] | None = None` Feature
261
+ records, e.g., `[Feature(...), Feature(...)]` or Features with their config, e.g., `[Feature(...).with_config(optional=True)]`.
262
+ index: `Feature | None = None` A :class:`~lamindb.Feature` record to validate an index of a `DataFrame` and therefore also, e.g., `AnnData` obs and var indices.
263
+ slots: `dict[str, Schema] | None = None` A dictionary mapping slot names to :class:`~lamindb.Schema` objects.
264
+ name: `str | None = None` Name of the Schema.
265
+ description: `str | None = None` Description of the Schema.
172
266
  flexible: `bool | None = None` Whether to include any feature of the same `itype` in validation
173
- and annotation. If no features are passed, defaults to `True`, otherwise to `False`.
174
- type: `Schema | None = None` A type.
175
- is_type: `bool = False` Distinguish types from instances of the type.
176
- otype: `str | None = None` An object type to define the structure of a composite schema.
177
- dtype: `str | None = None` The simple type. Defaults to
178
- `None` for sets of :class:`~lamindb.Feature` records.
179
- Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
180
- minimal_set: `bool = True` Whether all passed features are to be considered required by default.
267
+ and annotation. If no Features are passed, defaults to `True`, otherwise to `False`.
268
+ This means that if you explicitly pass Features, any additional Features will be disregarded during validation & annotation.
269
+ type: `Schema | None = None` Type of Schema to group measurements by.
270
+ Define types like `ln.Schema(name="ProteinPanel", is_type=True)`.
271
+ is_type: `bool = False` Whether the Schema is a Type.
272
+ itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
273
+ otype: `str | None = None` An object type to define the structure of a composite schema (e.g., DataFrame, AnnData).
274
+ dtype: `str | None = None` The simple type (e.g., "num", "float", "int").
275
+ Defaults to `None` for sets of :class:`~lamindb.Feature` records and to `"num"` (e.g., for sets of :class:`~bionty.Gene`) otherwise.
276
+ minimal_set: `bool = True` Whether all passed Features are required by default.
181
277
  See :attr:`~lamindb.Schema.optionals` for more-fine-grained control.
182
- ordered_set: `bool = False` Whether features are required to be ordered.
183
- maximal_set: `bool = False` If `True`, no additional features are allowed.
278
+ maximal_set: `bool = False` Whether additional Features are allowed.
279
+ ordered_set: `bool = False` Whether Features are required to be ordered.
184
280
  coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
185
281
  during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
186
282
 
187
- .. dropdown:: Why does LaminDB model schemas, not just features?
188
-
189
- 1. Performance: Imagine you measure the same panel of 20k transcripts in
190
- 1M samples. By modeling the panel as a schema, you can link all
191
- your artifacts against one schema and only need to store 1M
192
- instead of 1M x 20k = 20B links.
193
- 2. Interpretation: Model protein panels, gene panels, etc.
194
- 3. Data integration: Schemas provide the information that determines whether two datasets can be meaningfully concatenated.
195
-
196
- Note:
197
-
198
- A `slot` provides a string key to access schema components. For instance, for the schema of an
199
- `AnnData` object, it would be `'obs'` for `adata.obs`.
200
-
201
283
  See Also:
202
- :meth:`~lamindb.Schema.from_values`
203
- Create from values.
204
- :meth:`~lamindb.Schema.from_df`
205
- Create from dataframe columns.
284
+ :meth:`~lamindb.Artifact.from_df`
285
+ Validate & annotate a `DataFrame` with a schema.
286
+ :meth:`~lamindb.Artifact.from_anndata`
287
+ Validate & annotate an `AnnData` with a schema.
288
+ :meth:`~lamindb.Artifact.from_mudata`
289
+ Validate & annotate an `MuData` with a schema.
290
+ :meth:`~lamindb.Artifact.from_spatialdata`
291
+ Validate & annotate a `SpatialData` with a schema.
206
292
 
207
293
  Examples:
208
294
 
209
- Create schemas::
295
+ The typical way to create a schema::
210
296
 
211
297
  import lamindb as ln
212
298
  import bionty as bt
213
299
  import pandas as pd
214
300
 
215
- # From a dataframe
216
- df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
217
- schema = ln.Schema.from_df(df)
218
-
219
- # From explicitly defined features
301
+ # a schema with a single required feature
220
302
  schema = ln.Schema(
221
303
  features=[
222
304
  ln.Feature(name="required_feature", dtype=str).save(),
223
305
  ],
224
306
  ).save()
225
307
 
226
- # By merely constraining an identifier type
308
+ # a schema that constrains feature identifiers to be a valid ensembl gene ids or feature names
227
309
  schema = ln.Schema(itype=bt.Gene.ensembl_gene_id)
310
+ schema = ln.Schema(itype=ln.Feature) # is equivalent to itype=ln.Feature.name
228
311
 
229
- # A combination of the above
312
+ # a schema that requires a single feature but also validates & annotates any additional features with valid feature names
230
313
  schema = ln.Schema(
231
314
  features=[
232
315
  ln.Feature(name="required_feature", dtype=str).save(),
233
316
  ],
317
+ itype=ln.Schema(itype=ln.Feature),
234
318
  flexible=True,
235
319
  ).save()
236
320
 
237
- # By parsing & validating identifier values
238
- schema = ln.Schema.from_values(
239
- adata.var["ensemble_id"],
240
- field=bt.Gene.ensembl_gene_id,
241
- organism="mouse",
321
+ Passing options to the `Schema` constructor::
322
+
323
+ # also validate the index
324
+ schema = ln.Schema(
325
+ features=[
326
+ ln.Feature(name="required_feature", dtype=str).save(),
327
+ ],
328
+ index=ln.Feature(name="sample", dtype=ln.ULabel).save(),
242
329
  ).save()
243
330
 
244
- # Mark a single feature as optional and ignore other features of the same identifier type
331
+ # mark a single feature as optional and ignore other features of the same identifier type
245
332
  schema = ln.Schema(
246
333
  features=[
247
334
  ln.Feature(name="required_feature", dtype=str).save(),
248
335
  ln.Feature(name="feature2", dtype=int).save().with_config(optional=True),
249
336
  ],
250
337
  ).save()
338
+
339
+ Alternative constructors (:meth:`~lamindb.Schema.from_values`, :meth:`~lamindb.Schema.from_df`)::
340
+
341
+ # parse & validate identifier values
342
+ schema = ln.Schema.from_values(
343
+ adata.var["ensemble_id"],
344
+ field=bt.Gene.ensembl_gene_id,
345
+ organism="mouse",
346
+ ).save()
347
+
348
+ # from a dataframe
349
+ df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
350
+ schema = ln.Schema.from_df(df)
251
351
  """
252
352
 
253
353
  class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
@@ -258,28 +358,22 @@ class Schema(Record, CanCurate, TracksRun):
258
358
  "0": ("coerce_dtype", bool),
259
359
  "1": ("optionals", list[str]),
260
360
  "2": ("flexible", bool),
361
+ "3": ("index_feature_uid", str),
261
362
  }
262
363
 
263
364
  id: int = models.AutoField(primary_key=True)
264
365
  """Internal id, valid only in one DB instance."""
265
366
  uid: str = CharField(editable=False, unique=True, db_index=True, max_length=20)
266
- """A universal id (hash of the set of feature values)."""
367
+ """A universal id.
368
+
369
+ Before lamindb 1.5, it was 20 char long. Since lamindb 1.5, it is 16 char long.
370
+ """
267
371
  name: str | None = CharField(max_length=150, null=True, db_index=True)
268
372
  """A name."""
269
373
  description: str | None = CharField(null=True, db_index=True)
270
374
  """A description."""
271
- n = IntegerField()
272
- """Number of features in the set."""
273
- itype: str | None = CharField(
274
- max_length=120, db_index=True, null=True, editable=False
275
- )
276
- """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
277
-
278
- Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
279
-
280
- .. versionchanged:: 1.0.0
281
- Was called `registry` before.
282
- """
375
+ n: int = IntegerField()
376
+ """Number of features in the schema."""
283
377
  type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
284
378
  """Type of schema.
285
379
 
@@ -293,6 +387,13 @@ class Schema(Record, CanCurate, TracksRun):
293
387
  """Records of this type."""
294
388
  is_type: bool = BooleanField(default=False, db_index=True, null=True)
295
389
  """Distinguish types from instances of the type."""
390
+ itype: str | None = CharField(
391
+ max_length=120, db_index=True, null=True, editable=False
392
+ )
393
+ """A registry that stores feature identifier types used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
394
+
395
+ Depending on `itype`, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
396
+ """
296
397
  otype: str | None = CharField(max_length=64, db_index=True, null=True)
297
398
  """Default Python object type, e.g., DataFrame, AnnData."""
298
399
  dtype: str | None = CharField(max_length=64, null=True, editable=False)
@@ -338,7 +439,7 @@ class Schema(Record, CanCurate, TracksRun):
338
439
  artifacts: Artifact
339
440
  """The artifacts that measure a feature set that matches this schema."""
340
441
  validated_artifacts: Artifact
341
- """The artifacts that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
442
+ """The artifacts that were validated against this schema with a :class:`~lamindb.curators.core.Curator`."""
342
443
  projects: Project
343
444
  """Linked projects."""
344
445
  _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
@@ -356,7 +457,7 @@ class Schema(Record, CanCurate, TracksRun):
356
457
  # For instance, the set of measured features might be a superset of the minimally required set of features.
357
458
  # """
358
459
  # validated_schemas: Schema
359
- # """The schemas that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
460
+ # """The schemas that were validated against this schema with a :class:`~lamindb.curators.core.Curator`."""
360
461
  composite: Schema | None = ForeignKey(
361
462
  "self", PROTECT, related_name="+", default=None, null=True
362
463
  )
@@ -367,18 +468,22 @@ class Schema(Record, CanCurate, TracksRun):
367
468
  @overload
368
469
  def __init__(
369
470
  self,
370
- features: Iterable[Record] | None = None,
371
- components: dict[str, Schema] | None = None,
471
+ features: list[Record] | list[tuple[Feature, dict]] | None = None,
472
+ index: Feature | None = None,
473
+ slots: dict[str, Schema] | None = None,
372
474
  name: str | None = None,
373
475
  description: str | None = None,
374
- dtype: str | None = None,
375
476
  itype: str | Registry | FieldAttr | None = None,
477
+ flexible: bool | None = None,
376
478
  type: Schema | None = None,
377
479
  is_type: bool = False,
378
480
  otype: str | None = None,
481
+ dtype: str | Type[int | float | str] | None = None, # noqa
379
482
  ordered_set: bool = False,
483
+ minimal_set: bool = True,
380
484
  maximal_set: bool = False,
381
485
  coerce_dtype: bool = False,
486
+ n: int | None = None,
382
487
  ): ...
383
488
 
384
489
  @overload
@@ -398,13 +503,9 @@ class Schema(Record, CanCurate, TracksRun):
398
503
  if len(args) > 1:
399
504
  raise ValueError("Only one non-keyword arg allowed: features")
400
505
 
401
- features: Iterable[Record] | None = (
402
- args[0] if args else kwargs.pop("features", [])
403
- )
404
- # typing here anticipates transitioning to a ManyToMany
405
- # between composites and components similar to feature_sets
406
- # in lamindb v2
407
- components: dict[str, Schema] = kwargs.pop("components", {})
506
+ features: list[Record] | None = args[0] if args else kwargs.pop("features", [])
507
+ index: Feature | None = kwargs.pop("index", None)
508
+ slots: dict[str, Schema] = kwargs.pop("slots", {})
408
509
  name: str | None = kwargs.pop("name", None)
409
510
  description: str | None = kwargs.pop("description", None)
410
511
  itype: str | Record | DeferredAttribute | None = kwargs.pop("itype", None)
@@ -416,42 +517,136 @@ class Schema(Record, CanCurate, TracksRun):
416
517
  minimal_set: bool = kwargs.pop("minimal_set", True)
417
518
  ordered_set: bool = kwargs.pop("ordered_set", False)
418
519
  maximal_set: bool = kwargs.pop("maximal_set", False)
419
- coerce_dtype: bool | None = kwargs.pop("coerce_dtype", None)
420
- optional_features = []
421
-
520
+ coerce_dtype: bool | None = kwargs.pop("coerce_dtype", False)
521
+ using: bool | None = kwargs.pop("using", None)
522
+ n_features: int | None = kwargs.pop("n", None)
523
+ # backward compat
524
+ if not slots:
525
+ if "components" in kwargs:
526
+ logger.warning(
527
+ "`components` as a keyword argument is deprecated, please use `slots` instead"
528
+ )
529
+ slots = kwargs.pop("components")
422
530
  if kwargs:
423
- raise ValueError(
424
- f"Unexpected keyword arguments: {', '.join(kwargs.keys())}\n"
425
- "Valid arguments are: features, description, dtype, itype, type, "
426
- "is_type, otype, minimal_set, ordered_set, maximal_set, "
427
- "coerce_dtype"
531
+ valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Schema)])
532
+ raise FieldValidationError(
533
+ f"Only {valid_keywords} are valid keyword arguments"
428
534
  )
535
+ (
536
+ features,
537
+ validated_kwargs,
538
+ optional_features,
539
+ features_registry,
540
+ flexible,
541
+ ) = self._validate_kwargs_calculate_hash(
542
+ features=features,
543
+ index=index,
544
+ slots=slots,
545
+ name=name,
546
+ description=description,
547
+ itype=itype,
548
+ flexible=flexible,
549
+ type=type,
550
+ is_type=is_type,
551
+ otype=otype,
552
+ dtype=dtype,
553
+ minimal_set=minimal_set,
554
+ ordered_set=ordered_set,
555
+ maximal_set=maximal_set,
556
+ coerce_dtype=coerce_dtype,
557
+ n_features=n_features,
558
+ )
559
+ schema = (
560
+ Schema.objects.using(using)
561
+ .filter(hash=validated_kwargs["hash"])
562
+ .one_or_none()
563
+ )
564
+ if schema is not None:
565
+ logger.important(f"returning existing schema with same hash: {schema}")
566
+ init_self_from_db(self, schema)
567
+ update_attributes(self, validated_kwargs)
568
+ self.optionals.set(optional_features)
569
+ return None
570
+ self._slots: dict[str, Schema] = {}
571
+ if features:
572
+ self._features = (get_related_name(features_registry), features) # type: ignore
573
+ elif slots:
574
+ for slot_key, component in slots.items():
575
+ if component._state.adding:
576
+ raise InvalidArgument(
577
+ f"schema for {slot_key} {component} must be saved before use"
578
+ )
579
+ self._slots = slots
580
+ if validated_kwargs["hash"] in KNOWN_SCHEMAS:
581
+ validated_kwargs["uid"] = KNOWN_SCHEMAS[validated_kwargs["hash"]]
582
+ else:
583
+ validated_kwargs["uid"] = ids.base62_16()
584
+ super().__init__(**validated_kwargs)
585
+ # manipulating aux fields is easier after calling super().__init__()
586
+ self.optionals.set(optional_features)
587
+ self.flexible = flexible
588
+ if index is not None:
589
+ self._index_feature_uid = index.uid
590
+
591
+ def _validate_kwargs_calculate_hash(
592
+ self,
593
+ features: list[Record],
594
+ index: Feature | None,
595
+ slots: dict[str, Schema],
596
+ name: str | None,
597
+ description: str | None,
598
+ itype: str | Record | DeferredAttribute | None,
599
+ flexible: bool | None,
600
+ type: Feature | None,
601
+ is_type: bool,
602
+ otype: str | None,
603
+ dtype: str | None,
604
+ minimal_set: bool,
605
+ ordered_set: bool,
606
+ maximal_set: bool,
607
+ coerce_dtype: bool,
608
+ n_features: int | None,
609
+ optional_features_manual: list[Feature] | None = None,
610
+ ) -> tuple[list[Feature], dict[str, Any], list[Feature], Registry, bool]:
429
611
  optional_features = []
612
+ features_registry: Registry = None
430
613
  if itype is not None:
431
- itype = serialize_dtype(itype, is_itype=True)
614
+ if itype != "Composite":
615
+ itype = serialize_dtype(itype, is_itype=True)
616
+ if index is not None:
617
+ if not isinstance(index, Feature):
618
+ raise TypeError("index must be a Feature")
619
+ features.insert(0, index)
432
620
  if features:
433
621
  features, configs = get_features_config(features)
434
622
  features_registry = validate_features(features)
435
623
  itype_compare = features_registry.__get_name_with_module__()
436
624
  if itype is not None:
437
- assert itype == itype_compare, str(itype_compare) # noqa: S101
625
+ assert itype.startswith(itype_compare), str(itype_compare) # noqa: S101
438
626
  else:
439
627
  itype = itype_compare
628
+ if n_features is not None:
629
+ if n_features != len(features):
630
+ logger.important(f"updating to n {len(features)} features")
440
631
  n_features = len(features)
441
632
  if features_registry == Feature:
442
633
  optional_features = [
443
634
  config[0] for config in configs if config[1].get("optional")
444
635
  ]
445
- else:
636
+ if optional_features:
637
+ assert optional_features_manual is None # noqa: S101
638
+ if not optional_features and optional_features_manual is not None:
639
+ optional_features = optional_features_manual
640
+ elif n_features is None:
446
641
  n_features = -1
447
642
  if dtype is None:
448
643
  dtype = None if itype is not None and itype == "Feature" else NUMBER_TYPE
449
644
  else:
450
645
  dtype = get_type_str(dtype)
646
+ flexible_default = n_features < 0
451
647
  if flexible is None:
452
- flexible = n_features < 0
453
- components: dict[str, Schema]
454
- if components:
648
+ flexible = flexible_default
649
+ if slots:
455
650
  itype = "Composite"
456
651
  if otype is None:
457
652
  raise InvalidArgument("Please pass otype != None for composite schemas")
@@ -472,50 +667,67 @@ class Schema(Record, CanCurate, TracksRun):
472
667
  "ordered_set": ordered_set,
473
668
  "maximal_set": maximal_set,
474
669
  }
670
+ n_features_default = -1
671
+ coerce_dtype_default = False
475
672
  if coerce_dtype:
476
673
  validated_kwargs["_aux"] = {"af": {"0": coerce_dtype}}
477
- if components:
478
- hash = hash_set({component.hash for component in components.values()})
674
+ if slots:
675
+ list_for_hashing = [component.hash for component in slots.values()]
479
676
  else:
677
+ HASH_CODE = {
678
+ "dtype": "a",
679
+ "itype": "b",
680
+ "minimal_set": "c",
681
+ "ordered_set": "d",
682
+ "maximal_set": "e",
683
+ "flexible": "f",
684
+ "coerce_dtype": "g",
685
+ "n": "h",
686
+ "optional": "i",
687
+ "features_hash": "j",
688
+ }
480
689
  # we do not want pure informational annotations like otype, name, type, is_type, otype to be part of the hash
481
690
  hash_args = ["dtype", "itype", "minimal_set", "ordered_set", "maximal_set"]
482
- union_set = {
483
- str(validated_kwargs[arg])
691
+ list_for_hashing = [
692
+ f"{HASH_CODE[arg]}={validated_kwargs[arg]}"
484
693
  for arg in hash_args
485
694
  if validated_kwargs[arg] is not None
486
- }
487
- if flexible != n_features < 0:
488
- union_set.add(f"flexible:{flexible}")
695
+ ]
696
+ # only include in hash if not default so that it's backward compatible with records for which flexible was never set
697
+ if flexible != flexible_default:
698
+ list_for_hashing.append(f"{HASH_CODE['flexible']}={flexible}")
699
+ if coerce_dtype != coerce_dtype_default:
700
+ list_for_hashing.append(f"{HASH_CODE['coerce_dtype']}={coerce_dtype}")
701
+ if n_features != n_features_default:
702
+ list_for_hashing.append(f"{HASH_CODE['n']}={n_features}")
489
703
  if features:
490
- union_set = union_set.union({feature.uid for feature in features})
491
- if optional_features:
492
- union_set = union_set.union(
493
- {f"optional:{feature.uid}" for feature in optional_features}
494
- )
495
- hash = hash_set(union_set)
496
- validated_kwargs["hash"] = hash
497
- schema = Schema.filter(hash=hash).one_or_none()
498
- if schema is not None:
499
- logger.important(f"returning existing schema with same hash: {schema}")
500
- init_self_from_db(self, schema)
501
- update_attributes(self, validated_kwargs)
502
- self.optionals.set(optional_features)
503
- return None
504
- self._components: dict[str, Schema] = {}
505
- if features:
506
- self._features = (get_related_name(features_registry), features) # type: ignore
507
- elif components:
508
- for slot, component in components.items():
509
- if component._state.adding:
510
- raise InvalidArgument(
511
- f"component {slot} {component} must be saved before use"
704
+ if optional_features:
705
+ feature_list_for_hashing = [
706
+ feature.uid
707
+ if feature not in set(optional_features)
708
+ else f"{feature.uid}({HASH_CODE['optional']})"
709
+ for feature in features
710
+ ]
711
+ else:
712
+ feature_list_for_hashing = [feature.uid for feature in features]
713
+ # order matters if ordered_set is True
714
+ if ordered_set:
715
+ features_hash = hash_string(":".join(feature_list_for_hashing))
716
+ else:
717
+ features_hash = hash_string(
718
+ ":".join(sorted(feature_list_for_hashing))
512
719
  )
513
- self._components = components
514
- self._slots = components
515
- validated_kwargs["uid"] = ids.base62_20()
516
- super().__init__(**validated_kwargs)
517
- self.optionals.set(optional_features)
518
- self.flexible = flexible
720
+ list_for_hashing.append(f"{HASH_CODE['features_hash']}={features_hash}")
721
+ self._list_for_hashing = sorted(list_for_hashing)
722
+ schema_hash = hash_string(":".join(self._list_for_hashing))
723
+ validated_kwargs["hash"] = schema_hash
724
+ return (
725
+ features,
726
+ validated_kwargs,
727
+ optional_features,
728
+ features_registry,
729
+ flexible,
730
+ )
519
731
 
520
732
  @classmethod
521
733
  def from_values( # type: ignore
@@ -620,7 +832,7 @@ class Schema(Record, CanCurate, TracksRun):
620
832
  df.columns, field=field, organism=organism
621
833
  )
622
834
  schema = Schema(
623
- validated_features, name=name, dtype=None, otype="DataFrame"
835
+ list(validated_features), name=name, dtype=None, otype="DataFrame"
624
836
  )
625
837
  else:
626
838
  dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
@@ -634,7 +846,7 @@ class Schema(Record, CanCurate, TracksRun):
634
846
  source=source,
635
847
  )
636
848
  schema = Schema(
637
- features=validated_features,
849
+ features=list(validated_features),
638
850
  name=name,
639
851
  dtype=get_type_str(dtype),
640
852
  )
@@ -644,12 +856,47 @@ class Schema(Record, CanCurate, TracksRun):
644
856
  """Save."""
645
857
  from .save import bulk_create
646
858
 
859
+ if not self._state.adding:
860
+ features = (
861
+ self._features[1]
862
+ if hasattr(self, "_features")
863
+ else (self.members.list() if self.members.exists() else [])
864
+ )
865
+ _, validated_kwargs, _, _, _ = self._validate_kwargs_calculate_hash(
866
+ features=features, # type: ignore
867
+ index=None, # need to pass None here as otherwise counting double
868
+ slots=self.slots,
869
+ name=self.name,
870
+ description=self.description,
871
+ itype=self.itype,
872
+ flexible=self.flexible,
873
+ type=self.type,
874
+ is_type=self.is_type,
875
+ otype=self.otype,
876
+ dtype=self.dtype,
877
+ minimal_set=self.minimal_set,
878
+ ordered_set=self.ordered_set,
879
+ maximal_set=self.maximal_set,
880
+ coerce_dtype=self.coerce_dtype,
881
+ n_features=self.n,
882
+ optional_features_manual=self.optionals.get(),
883
+ )
884
+ if validated_kwargs["hash"] != self.hash:
885
+ from .artifact import Artifact
886
+
887
+ datasets = Artifact.filter(schema=self).all()
888
+ if datasets.exists():
889
+ logger.warning(
890
+ f"you updated the schema hash and might invalidate datasets that were previously validated with this schema: {datasets.list('uid')}"
891
+ )
892
+ self.hash = validated_kwargs["hash"]
893
+ self.n = validated_kwargs["n"]
647
894
  super().save(*args, **kwargs)
648
- if hasattr(self, "_components"):
895
+ if hasattr(self, "_slots"):
649
896
  # analogous to save_schema_links in core._data.py
650
897
  # which is called to save feature sets in artifact.save()
651
898
  links = []
652
- for slot, component in self._components.items():
899
+ for slot, component in self._slots.items():
653
900
  kwargs = {
654
901
  "composite_id": self.id,
655
902
  "component_id": component.id,
@@ -657,14 +904,18 @@ class Schema(Record, CanCurate, TracksRun):
657
904
  }
658
905
  links.append(Schema.components.through(**kwargs))
659
906
  bulk_create(links, ignore_conflicts=True)
907
+ delattr(self, "_slots")
660
908
  if hasattr(self, "_features"):
661
909
  assert self.n > 0 # noqa: S101
910
+ using: bool | None = kwargs.pop("using", None)
662
911
  related_name, records = self._features
663
912
  # only the following method preserves the order
664
913
  # .set() does not preserve the order but orders by
665
914
  # the feature primary key
666
915
  through_model = getattr(self, related_name).through
667
- related_model_split = self.itype.split(".")
916
+ related_model_split = parse_cat_dtype(self.itype, is_itype=True)[
917
+ "registry_str"
918
+ ].split(".")
668
919
  if len(related_model_split) == 1:
669
920
  related_field = related_model_split[0].lower()
670
921
  else:
@@ -674,16 +925,23 @@ class Schema(Record, CanCurate, TracksRun):
674
925
  through_model(**{"schema_id": self.id, related_field_id: record.id})
675
926
  for record in records
676
927
  ]
677
- through_model.objects.bulk_create(links, ignore_conflicts=True)
928
+ through_model.objects.using(using).bulk_create(links, ignore_conflicts=True)
929
+ delattr(self, "_features")
678
930
  return self
679
931
 
680
932
  @property
681
933
  def members(self) -> QuerySet:
682
- """A queryset for the individual records of the set."""
934
+ """A queryset for the individual records in the feature set underlying the schema.
935
+
936
+ Unlike `schema.features`, `schema.genes`, `schema.proteins`, etc., this queryset is ordered and
937
+ doesn't require knowledge of the entity.
938
+ """
683
939
  if self._state.adding:
684
940
  # this should return a queryset and not a list...
685
941
  # need to fix this
686
942
  return self._features[1]
943
+ if self.itype == "Composite":
944
+ return Feature.objects.none()
687
945
  related_name = self._get_related_name()
688
946
  if related_name is None:
689
947
  related_name = "features"
@@ -740,44 +998,51 @@ class Schema(Record, CanCurate, TracksRun):
740
998
  if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]: # type: ignore
741
999
  return self._aux["af"]["2"] # type: ignore
742
1000
  else:
743
- return self.n < 0
1001
+ return (
1002
+ self.n < 0
1003
+ ) # is the flexible default, needed for backward compat if flexible was never set
744
1004
 
745
1005
  @flexible.setter
746
1006
  def flexible(self, value: bool) -> None:
747
- if value != (self.n < 0):
748
- self._aux = self._aux or {}
749
- self._aux.setdefault("af", {})["2"] = value
750
-
751
- # @property
752
- # def index_feature(self) -> None | Feature:
753
- # # index_feature: `Record | None = None` A :class:`~lamindb.Feature` to validate the index of a `DataFrame`.
754
- # """The uid of the index feature, if `index_feature` was set."""
755
- # if self._index_feature_uid is None:
756
- # return None
757
- # else:
758
- # return self.features.get(uid=self._index_feature_uid)
759
-
760
- # @property
761
- # def _index_feature_uid(self) -> None | str:
762
- # """The uid of the index feature, if `index_feature` was set."""
763
- # if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
764
- # return self._aux["af"]["1"]
765
- # else:
766
- # return None
767
-
768
- # @_index_feature_uid.setter
769
- # def _index_feature_uid(self, value: str) -> None:
770
- # self._aux = self._aux or {}
771
- # self._aux.setdefault("af", {})["0"] = value
1007
+ self._aux = self._aux or {}
1008
+ self._aux.setdefault("af", {})["2"] = value
1009
+
1010
+ @property
1011
+ def index(self) -> None | Feature:
1012
+ """The feature configured to act as index.
1013
+
1014
+ To unset it, set `schema.index` to `None`.
1015
+ """
1016
+ if self._index_feature_uid is None:
1017
+ return None
1018
+ else:
1019
+ return self.features.get(uid=self._index_feature_uid)
1020
+
1021
+ @index.setter
1022
+ def index(self, value: None | Feature) -> None:
1023
+ if value is None:
1024
+ current_index = self.index
1025
+ self.features.remove(current_index)
1026
+ self._index_feature_uid = value
1027
+ else:
1028
+ self.features.add(value)
1029
+ self._index_feature_uid = value.uid
772
1030
 
773
1031
  @property
774
- @deprecated("itype")
775
- def registry(self) -> str:
776
- return self.itype
1032
+ def _index_feature_uid(self) -> None | str:
1033
+ """The uid of the index feature."""
1034
+ if self._aux is not None and "af" in self._aux and "3" in self._aux["af"]:
1035
+ return self._aux["af"]["3"]
1036
+ else:
1037
+ return None
777
1038
 
778
- @registry.setter
779
- def registry(self, value) -> None:
780
- self.itype = value
1039
+ @_index_feature_uid.setter
1040
+ def _index_feature_uid(self, value: str | None) -> None:
1041
+ self._aux = self._aux or {}
1042
+ if value is None:
1043
+ self._aux.get("af", {}).pop("3")
1044
+ else:
1045
+ self._aux.setdefault("af", {})["3"] = value
781
1046
 
782
1047
  @property
783
1048
  def slots(self) -> dict[str, Schema]:
@@ -791,7 +1056,7 @@ class Schema(Record, CanCurate, TracksRun):
791
1056
  anndata_schema = ln.Schema(
792
1057
  name="small_dataset1_anndata_schema",
793
1058
  otype="AnnData",
794
- components={"obs": obs_schema, "var": var_schema},
1059
+ slots={"obs": obs_schema, "var": var_schema},
795
1060
  ).save()
796
1061
 
797
1062
  # access slots
@@ -854,6 +1119,11 @@ class Schema(Record, CanCurate, TracksRun):
854
1119
  message + "\nslots:"
855
1120
  for slot, schema in self.slots.items():
856
1121
  message += f"\n {slot}: " + str(schema)
1122
+ else:
1123
+ tree = describe_schema(self)
1124
+ return format_rich_tree(
1125
+ tree, fallback="no linked features", return_str=return_str
1126
+ )
857
1127
  if return_str:
858
1128
  return message
859
1129
  else:
@@ -871,7 +1141,9 @@ def get_type_str(dtype: str | None) -> str | None:
871
1141
 
872
1142
  def _get_related_name(self: Schema) -> str:
873
1143
  related_models = dict_related_model_to_related_name(self, instance=self._state.db)
874
- related_name = related_models.get(self.itype)
1144
+ related_name = related_models.get(
1145
+ parse_cat_dtype(self.itype, is_itype=True)["registry_str"]
1146
+ )
875
1147
  return related_name
876
1148
 
877
1149
 
@@ -911,7 +1183,7 @@ class SchemaComponent(BasicRecord, LinkORM, TracksRun):
911
1183
  slot: str | None = CharField(null=True)
912
1184
 
913
1185
  class Meta:
914
- unique_together = (("composite", "component"), ("composite", "slot"))
1186
+ unique_together = (("composite", "slot", "component"), ("composite", "slot"))
915
1187
 
916
1188
 
917
1189
  Schema._get_related_name = _get_related_name