lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +216 -133
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3559
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1546 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/models/__init__.py +12 -2
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +384 -301
- lamindb/models/_from_values.py +1 -1
- lamindb/models/_is_versioned.py +5 -15
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +354 -177
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/can_curate.py +4 -1
- lamindb/models/collection.py +79 -56
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +78 -47
- lamindb/models/has_parents.py +24 -9
- lamindb/models/project.py +3 -3
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +251 -206
- lamindb/models/record.py +211 -344
- lamindb/models/run.py +59 -5
- lamindb/models/save.py +9 -5
- lamindb/models/schema.py +673 -196
- lamindb/models/transform.py +5 -14
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
- lamindb-1.5.0.dist-info/RECORD +108 -0
- lamindb-1.3.2.dist-info/RECORD +0 -95
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
lamindb/models/schema.py
CHANGED
@@ -1,12 +1,15 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from typing import TYPE_CHECKING, Any, overload
|
3
|
+
from typing import TYPE_CHECKING, Any, Type, overload
|
4
4
|
|
5
5
|
import numpy as np
|
6
6
|
from django.db import models
|
7
7
|
from django.db.models import CASCADE, PROTECT, ManyToManyField
|
8
8
|
from lamin_utils import logger
|
9
|
-
from lamindb_setup.core.hashing import HASH_LENGTH,
|
9
|
+
from lamindb_setup.core.hashing import HASH_LENGTH, hash_string
|
10
|
+
from rich.table import Table
|
11
|
+
from rich.text import Text
|
12
|
+
from rich.tree import Tree
|
10
13
|
|
11
14
|
from lamindb.base import ids
|
12
15
|
from lamindb.base.fields import (
|
@@ -17,10 +20,11 @@ from lamindb.base.fields import (
|
|
17
20
|
JSONField,
|
18
21
|
)
|
19
22
|
from lamindb.base.types import FieldAttr, ListLike
|
20
|
-
from lamindb.errors import InvalidArgument
|
23
|
+
from lamindb.errors import FieldValidationError, InvalidArgument
|
24
|
+
from lamindb.models.feature import parse_cat_dtype
|
21
25
|
|
22
|
-
from ..base import deprecated
|
23
26
|
from ..errors import ValidationError
|
27
|
+
from ._describe import format_rich_tree, highlight_time
|
24
28
|
from ._relations import (
|
25
29
|
dict_related_model_to_related_name,
|
26
30
|
get_related_name,
|
@@ -36,14 +40,13 @@ from .record import (
|
|
36
40
|
LinkORM,
|
37
41
|
Record,
|
38
42
|
Registry,
|
43
|
+
_get_record_kwargs,
|
39
44
|
init_self_from_db,
|
40
45
|
update_attributes,
|
41
46
|
)
|
42
47
|
from .run import Param, TracksRun, TracksUpdates
|
43
48
|
|
44
49
|
if TYPE_CHECKING:
|
45
|
-
from collections.abc import Iterable
|
46
|
-
|
47
50
|
import pandas as pd
|
48
51
|
from django.db.models.query_utils import DeferredAttribute
|
49
52
|
|
@@ -80,80 +83,271 @@ def validate_features(features: list[Record]) -> Record:
|
|
80
83
|
return next(iter(feature_types)) # return value in set of cardinality 1
|
81
84
|
|
82
85
|
|
83
|
-
|
84
|
-
|
86
|
+
def get_features_config(
    features: list[Record] | tuple[Record, dict],
) -> tuple[list[Record], list[tuple[Record, dict]]]:
    """Split the return of ``feature.with_config()`` into features and configs.

    Args:
        features: The items to split; each item is either a plain record or a
            ``(record, config)`` tuple.

    Returns:
        A pair ``(records, configured)``. ``records`` holds every record in
        input order, with tuples unwrapped to their first element.
        ``configured`` holds the tuples exactly as they were passed. If a
        ``TypeError`` is raised while iterating, ``features`` itself is
        returned as the first element instead.
    """
    records = []
    configured = []
    try:
        for entry in features:
            has_config = isinstance(entry, tuple)
            records.append(entry[0] if has_config else entry)
            if has_config:
                configured.append(entry)  # keep the whole tuple, not just the dict
    except TypeError:
        # e.g. `features` is not iterable: hand it back untouched
        return features, configured  # type: ignore
    return records, configured  # type: ignore
|
85
102
|
|
86
|
-
The simplest schema is a feature set such as the set of columns of a `DataFrame`.
|
87
103
|
|
88
|
-
|
104
|
+
def describe_schema(self: Schema) -> Tree:
    """Create a rich tree visualization of a Schema with its features.

    Args:
        self: The schema to describe.

    Returns:
        A ``Tree`` whose first branches list the schema's attributes (uid,
        name, optional description/itype/type, set flags, creator and creation
        time), followed by a branch showing ``itype`` and the member count.
        If the schema has members, that branch contains a table with one row
        per member.
    """
    otype = getattr(self, "otype", None) or ""
    tree = Tree(
        Text.assemble((self.__class__.__name__, "bold"), (f" {otype}", "bold dim")),
        guide_style="dim",  # dim the connecting lines
    )

    tree.add(f".uid = '{self.uid}'")
    tree.add(f".name = '{self.name}'")
    if self.description:
        tree.add(f".description = '{self.description}'")
    if self.itype:
        tree.add(f".itype = '{self.itype}'")
    if self.type:
        tree.add(f".type = '{self.type}'")
    tree.add(f".ordered_set = {self.ordered_set}")
    tree.add(f".maximal_set = {self.maximal_set}")

    created_by = getattr(self, "created_by", None)
    if created_by:
        creator = (
            created_by.handle
            if created_by.name is None
            else f"{created_by.handle} ({created_by.name})"
        )
        tree.add(Text.assemble(".created_by = ", creator))
    created_at = getattr(self, "created_at", None)
    if created_at:
        tree.add(Text.assemble(".created_at = ", highlight_time(str(created_at))))

    # read the members and their count once and reuse both below
    members = self.members
    n_members = members.count()

    # Add features section
    features = tree.add(
        Text.assemble(
            (self.itype, "violet"),
            (" • ", "dim"),
            (str(n_members), "pink1"),
        )
    )

    if n_members > 0:
        # create a table for the features
        feature_table = Table(
            show_header=True, header_style="dim", box=None, pad_edge=False
        )
        for column in (
            "name",
            "dtype",
            "optional",
            "nullable",
            "coerce_dtype",
            "default_value",
        ):
            feature_table.add_column(column, style="", no_wrap=True)

        # Look the optional uids up once instead of issuing one `exists()`
        # query per member; `get_uids()` reads them from the schema itself.
        optional_uids = set(self.optionals.get_uids())
        for member in members:
            default_value = member.default_value
            feature_table.add_row(
                member.name,
                Text(
                    str(member.dtype)
                ),  # needs to be wrapped in Text to display correctly
                "✓" if member.uid in optional_uids else "✗",
                "✓" if member.nullable else "✗",
                "✓" if member.coerce_dtype else "✗",
                # compare against None so that falsy defaults such as 0 or
                # False are displayed; assumes an unset default is reported
                # as None -- TODO confirm against Feature.default_value
                "unset" if default_value is None else str(default_value),
            )

        # Add the table to the features branch
        features.add(feature_table)

    return tree
|
123
179
|
|
124
|
-
These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
|
125
180
|
|
126
|
-
|
181
|
+
class SchemaOptionals:
    """Manage and access optional features in a schema.

    The uids of the optional features are stored on the schema under
    ``schema._aux["af"]["1"]``.
    """

    def __init__(self, schema) -> None:
        self.schema = schema

    @staticmethod
    def _as_feature_list(features: Feature | list[Feature]) -> list[Feature]:
        """Wrap a single feature in a list and check that all items are features.

        Raises:
            TypeError: If any item is not a ``Feature`` record.
        """
        if not isinstance(features, list):
            features = [features]
        if not all(isinstance(f, Feature) for f in features):
            raise TypeError("features must be a list of Feature records!")
        return features

    def get_uids(self) -> list[str]:
        """Get the uids of the optional features.

        Does **not** need an additional query to the database, while `get()` does.
        """
        aux = self.schema._aux
        if aux is not None and "af" in aux and "1" in aux["af"]:
            return aux["af"]["1"]
        return []

    def get(self) -> QuerySet:
        """Get the optional features."""
        uids = self.get_uids()
        if uids:
            return Feature.objects.filter(uid__in=uids).order_by("links_schema__id")
        return Feature.objects.none()  # empty QuerySet

    def set(self, features: list[Feature]) -> None:
        """Set the optional features (overwrites whichever features are currently optional).

        Passing an empty list clears the set of optional features.

        Raises:
            TypeError: If `features` is not a list of ``Feature`` records.
        """
        if not isinstance(features, list) or not all(
            isinstance(f, Feature) for f in features
        ):
            raise TypeError("features must be a list of Feature records!")
        self.schema._aux = self.schema._aux or {}
        if features:
            self.schema._aux.setdefault("af", {})["1"] = [f.uid for f in features]
        else:
            # "overwrite" with nothing: drop the stored uids so that
            # `get_uids()` reports an empty list again
            self.schema._aux.get("af", {}).pop("1", None)

    def remove(self, features: Feature | list[Feature]) -> None:
        """Make one or multiple features required by removing them from the set of optional features.

        Features that are not currently optional are ignored.

        Raises:
            TypeError: If any item is not a ``Feature`` record.
        """
        features = self._as_feature_list(features)
        if not features:
            return
        self.schema._aux = self.schema._aux or {}
        optional_uids = self.schema._aux.get("af", {}).get("1")
        if optional_uids is None:
            return
        for feature in features:
            # guard the removal: `list.remove` raises ValueError on a missing item
            if feature.uid in optional_uids:
                optional_uids.remove(feature.uid)

    def add(self, features: Feature | list[Feature]) -> None:
        """Make one or multiple features optional by adding them to the set of optional features.

        Features that are already optional are not added a second time.

        Raises:
            TypeError: If any item is not a ``Feature`` record.
        """
        self.schema._aux = self.schema._aux or {}
        features = self._as_feature_list(features)
        if not features:
            return
        optional_uids = self.schema._aux.setdefault("af", {}).setdefault("1", [])
        for feature in features:
            if feature.uid not in optional_uids:
                optional_uids.append(feature.uid)
|
245
|
+
|
246
|
+
|
247
|
+
# Schema hashes that are assigned a fixed, predefined uid
# (key: schema hash, value: uid).
KNOWN_SCHEMAS: dict[str, str] = {
    # valid_features
    "kMi7B_N88uu-YnbTLDU-DA": "0000000000000000",
    # valid_ensembl_gene_ids
    "1gocc_TJ1RU2bMwDRK-WUA": "0000000000000001",
    # anndata_ensembl_gene_ids_and_valid_features_in_obs
    "GTxxM36n9tocphLfdbNt9g": "0000000000000002",
}
|
252
|
+
|
253
|
+
|
254
|
+
class Schema(Record, CanCurate, TracksRun):
|
255
|
+
"""Schemas of a dataset such as the set of columns of a `DataFrame`.
|
256
|
+
|
257
|
+
Composite schemas can have multiple slots, e.g., for an `AnnData`, one schema for slot `obs` and another one for `var`.
|
258
|
+
|
259
|
+
Args:
|
260
|
+
features: `list[Record] | list[tuple[Feature, dict]] | None = None` Feature
|
261
|
+
records, e.g., `[Feature(...), Feature(...)]` or Features with their config, e.g., `[Feature(...).with_config(optional=True)]`.
|
262
|
+
index: `Feature | None = None` A :class:`~lamindb.Feature` record to validate an index of a `DataFrame` and therefore also, e.g., `AnnData` obs and var indices.
|
263
|
+
slots: `dict[str, Schema] | None = None` A dictionary mapping slot names to :class:`~lamindb.Schema` objects.
|
264
|
+
name: `str | None = None` Name of the Schema.
|
265
|
+
description: `str | None = None` Description of the Schema.
|
266
|
+
flexible: `bool | None = None` Whether to include any feature of the same `itype` in validation
|
267
|
+
and annotation. If no Features are passed, defaults to `True`, otherwise to `False`.
|
268
|
+
This means that if you explicitly pass Features, any additional Features will be disregarded during validation & annotation.
|
269
|
+
type: `Schema | None = None` Type of Schema to group measurements by.
|
270
|
+
Define types like `ln.Schema(name="ProteinPanel", is_type=True)`.
|
271
|
+
is_type: `bool = False` Whether the Schema is a Type.
|
272
|
+
itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
|
273
|
+
otype: `str | None = None` An object type to define the structure of a composite schema (e.g., DataFrame, AnnData).
|
274
|
+
dtype: `str | None = None` The simple type (e.g., "num", "float", "int").
|
275
|
+
Defaults to `None` for sets of :class:`~lamindb.Feature` records and to `"num"` (e.g., for sets of :class:`~bionty.Gene`) otherwise.
|
276
|
+
minimal_set: `bool = True` Whether all passed Features are required by default.
|
277
|
+
See :attr:`~lamindb.Schema.optionals` for more fine-grained control.
|
278
|
+
maximal_set: `bool = False` Whether additional Features are allowed.
|
279
|
+
ordered_set: `bool = False` Whether Features are required to be ordered.
|
280
|
+
coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
|
281
|
+
during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
|
133
282
|
|
134
283
|
See Also:
|
135
|
-
:meth:`~lamindb.
|
136
|
-
|
137
|
-
:meth:`~lamindb.
|
138
|
-
|
284
|
+
:meth:`~lamindb.Artifact.from_df`
|
285
|
+
Validate & annotate a `DataFrame` with a schema.
|
286
|
+
:meth:`~lamindb.Artifact.from_anndata`
|
287
|
+
Validate & annotate an `AnnData` with a schema.
|
288
|
+
:meth:`~lamindb.Artifact.from_mudata`
|
289
|
+
Validate & annotate a `MuData` with a schema.
|
290
|
+
:meth:`~lamindb.Artifact.from_spatialdata`
|
291
|
+
Validate & annotate a `SpatialData` with a schema.
|
139
292
|
|
140
293
|
Examples:
|
141
294
|
|
142
|
-
|
295
|
+
The typical way to create a schema::
|
143
296
|
|
144
|
-
|
145
|
-
|
297
|
+
import lamindb as ln
|
298
|
+
import bionty as bt
|
299
|
+
import pandas as pd
|
146
300
|
|
147
|
-
|
301
|
+
# a schema with a single required feature
|
302
|
+
schema = ln.Schema(
|
303
|
+
features=[
|
304
|
+
ln.Feature(name="required_feature", dtype=str).save(),
|
305
|
+
],
|
306
|
+
).save()
|
148
307
|
|
149
|
-
|
150
|
-
|
308
|
+
# a schema that constrains feature identifiers to be valid Ensembl gene ids or valid feature names
|
309
|
+
schema = ln.Schema(itype=bt.Gene.ensembl_gene_id)
|
310
|
+
schema = ln.Schema(itype=ln.Feature) # is equivalent to itype=ln.Feature.name
|
311
|
+
|
312
|
+
# a schema that requires a single feature but also validates & annotates any additional features with valid feature names
|
313
|
+
schema = ln.Schema(
|
314
|
+
features=[
|
315
|
+
ln.Feature(name="required_feature", dtype=str).save(),
|
316
|
+
],
|
317
|
+
itype=ln.Feature,
|
318
|
+
flexible=True,
|
319
|
+
).save()
|
151
320
|
|
152
|
-
|
321
|
+
Passing options to the `Schema` constructor::
|
153
322
|
|
154
|
-
|
155
|
-
|
323
|
+
# also validate the index
|
324
|
+
schema = ln.Schema(
|
325
|
+
features=[
|
326
|
+
ln.Feature(name="required_feature", dtype=str).save(),
|
327
|
+
],
|
328
|
+
index=ln.Feature(name="sample", dtype=ln.ULabel).save(),
|
329
|
+
).save()
|
330
|
+
|
331
|
+
# mark a single feature as optional and ignore other features of the same identifier type
|
332
|
+
schema = ln.Schema(
|
333
|
+
features=[
|
334
|
+
ln.Feature(name="required_feature", dtype=str).save(),
|
335
|
+
ln.Feature(name="feature2", dtype=int).save().with_config(optional=True),
|
336
|
+
],
|
337
|
+
).save()
|
338
|
+
|
339
|
+
Alternative constructors (:meth:`~lamindb.Schema.from_values`, :meth:`~lamindb.Schema.from_df`)::
|
340
|
+
|
341
|
+
# parse & validate identifier values
|
342
|
+
schema = ln.Schema.from_values(
|
343
|
+
adata.var["ensemble_id"],
|
344
|
+
field=bt.Gene.ensembl_gene_id,
|
345
|
+
organism="mouse",
|
346
|
+
).save()
|
156
347
|
|
348
|
+
# from a dataframe
|
349
|
+
df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
|
350
|
+
schema = ln.Schema.from_df(df)
|
157
351
|
"""
|
158
352
|
|
159
353
|
class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
@@ -162,34 +356,24 @@ class Schema(Record, CanCurate, TracksRun):
|
|
162
356
|
_name_field: str = "name"
|
163
357
|
_aux_fields: dict[str, tuple[str, type]] = {
|
164
358
|
"0": ("coerce_dtype", bool),
|
165
|
-
"1": ("
|
359
|
+
"1": ("optionals", list[str]),
|
360
|
+
"2": ("flexible", bool),
|
361
|
+
"3": ("index_feature_uid", str),
|
166
362
|
}
|
167
363
|
|
168
364
|
id: int = models.AutoField(primary_key=True)
|
169
365
|
"""Internal id, valid only in one DB instance."""
|
170
366
|
uid: str = CharField(editable=False, unique=True, db_index=True, max_length=20)
|
171
|
-
"""A universal id
|
367
|
+
"""A universal id.
|
368
|
+
|
369
|
+
Before lamindb 1.5, it was 20 characters long. Since lamindb 1.5, it is 16 characters long.
|
370
|
+
"""
|
172
371
|
name: str | None = CharField(max_length=150, null=True, db_index=True)
|
173
372
|
"""A name."""
|
174
373
|
description: str | None = CharField(null=True, db_index=True)
|
175
374
|
"""A description."""
|
176
|
-
n = IntegerField()
|
177
|
-
"""Number of features in the
|
178
|
-
dtype: str | None = CharField(max_length=64, null=True, editable=False)
|
179
|
-
"""Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
|
180
|
-
|
181
|
-
For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
|
182
|
-
"""
|
183
|
-
itype: str | None = CharField(
|
184
|
-
max_length=120, db_index=True, null=True, editable=False
|
185
|
-
)
|
186
|
-
"""A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
|
187
|
-
|
188
|
-
Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
|
189
|
-
|
190
|
-
.. versionchanged:: 1.0.0
|
191
|
-
Was called `registry` before.
|
192
|
-
"""
|
375
|
+
n: int = IntegerField()
|
376
|
+
"""Number of features in the schema."""
|
193
377
|
type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
|
194
378
|
"""Type of schema.
|
195
379
|
|
@@ -203,8 +387,20 @@ class Schema(Record, CanCurate, TracksRun):
|
|
203
387
|
"""Records of this type."""
|
204
388
|
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
205
389
|
"""Distinguish types from instances of the type."""
|
390
|
+
itype: str | None = CharField(
|
391
|
+
max_length=120, db_index=True, null=True, editable=False
|
392
|
+
)
|
393
|
+
"""A registry that stores feature identifier types used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
|
394
|
+
|
395
|
+
Depending on `itype`, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
|
396
|
+
"""
|
206
397
|
otype: str | None = CharField(max_length=64, db_index=True, null=True)
|
207
398
|
"""Default Python object type, e.g., DataFrame, AnnData."""
|
399
|
+
dtype: str | None = CharField(max_length=64, null=True, editable=False)
|
400
|
+
"""Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
|
401
|
+
|
402
|
+
For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
|
403
|
+
"""
|
208
404
|
hash: str | None = CharField(
|
209
405
|
max_length=HASH_LENGTH, db_index=True, null=True, editable=False
|
210
406
|
)
|
@@ -213,18 +409,19 @@ class Schema(Record, CanCurate, TracksRun):
|
|
213
409
|
For a composite schema, the hash of hashes.
|
214
410
|
"""
|
215
411
|
minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
|
216
|
-
"""Whether
|
412
|
+
"""Whether all passed features are to be considered required by default (default `True`).
|
217
413
|
|
218
|
-
|
219
|
-
|
220
|
-
If `True`, features are linked and considered as a minimally required set in validation.
|
414
|
+
Note that features that are explicitly marked as `optional` via `feature.with_config(optional=True)`
|
415
|
+
are **not** required even if this `minimal_set` is true.
|
221
416
|
"""
|
222
417
|
ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
|
223
418
|
"""Whether features are required to be ordered (default `False`)."""
|
224
419
|
maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
|
225
|
-
"""
|
420
|
+
"""Whether all features present in the dataset must be in the schema (default `False`).
|
421
|
+
|
422
|
+
If `False`, additional features are allowed to be present in the dataset.
|
226
423
|
|
227
|
-
If `True`,
|
424
|
+
If `True`, no additional features are allowed to be present in the dataset.
|
228
425
|
"""
|
229
426
|
components: Schema = ManyToManyField(
|
230
427
|
"self", through="SchemaComponent", symmetrical=False, related_name="composites"
|
@@ -271,20 +468,22 @@ class Schema(Record, CanCurate, TracksRun):
|
|
271
468
|
@overload
|
272
469
|
def __init__(
|
273
470
|
self,
|
274
|
-
features:
|
275
|
-
|
471
|
+
features: list[Record] | list[tuple[Feature, dict]] | None = None,
|
472
|
+
index: Feature | None = None,
|
473
|
+
slots: dict[str, Schema] | None = None,
|
276
474
|
name: str | None = None,
|
277
475
|
description: str | None = None,
|
278
|
-
dtype: str | None = None,
|
279
476
|
itype: str | Registry | FieldAttr | None = None,
|
477
|
+
flexible: bool | None = None,
|
280
478
|
type: Schema | None = None,
|
281
479
|
is_type: bool = False,
|
282
480
|
otype: str | None = None,
|
283
|
-
|
481
|
+
dtype: str | Type[int | float | str] | None = None, # noqa
|
284
482
|
ordered_set: bool = False,
|
483
|
+
minimal_set: bool = True,
|
285
484
|
maximal_set: bool = False,
|
286
|
-
slot: str | None = None,
|
287
485
|
coerce_dtype: bool = False,
|
486
|
+
n: int | None = None,
|
288
487
|
): ...
|
289
488
|
|
290
489
|
@overload
|
@@ -304,50 +503,152 @@ class Schema(Record, CanCurate, TracksRun):
|
|
304
503
|
if len(args) > 1:
|
305
504
|
raise ValueError("Only one non-keyword arg allowed: features")
|
306
505
|
|
307
|
-
features:
|
308
|
-
|
309
|
-
)
|
310
|
-
# typing here anticipates transitioning to a ManyToMany
|
311
|
-
# between composites and components similar to feature_sets
|
312
|
-
# in lamindb v2
|
313
|
-
components: dict[str, Schema] = kwargs.pop("components", {})
|
506
|
+
features: list[Record] | None = args[0] if args else kwargs.pop("features", [])
|
507
|
+
index: Feature | None = kwargs.pop("index", None)
|
508
|
+
slots: dict[str, Schema] = kwargs.pop("slots", {})
|
314
509
|
name: str | None = kwargs.pop("name", None)
|
315
510
|
description: str | None = kwargs.pop("description", None)
|
316
|
-
dtype: str | None = kwargs.pop("dtype", None)
|
317
511
|
itype: str | Record | DeferredAttribute | None = kwargs.pop("itype", None)
|
512
|
+
flexible: bool | None = kwargs.pop("flexible", None)
|
318
513
|
type: Feature | None = kwargs.pop("type", None)
|
319
514
|
is_type: bool = kwargs.pop("is_type", False)
|
320
515
|
otype: str | None = kwargs.pop("otype", None)
|
516
|
+
dtype: str | None = kwargs.pop("dtype", None)
|
321
517
|
minimal_set: bool = kwargs.pop("minimal_set", True)
|
322
518
|
ordered_set: bool = kwargs.pop("ordered_set", False)
|
323
519
|
maximal_set: bool = kwargs.pop("maximal_set", False)
|
324
|
-
|
325
|
-
|
326
|
-
|
520
|
+
coerce_dtype: bool | None = kwargs.pop("coerce_dtype", False)
|
521
|
+
using: bool | None = kwargs.pop("using", None)
|
522
|
+
n_features: int | None = kwargs.pop("n", None)
|
523
|
+
# backward compat
|
524
|
+
if not slots:
|
525
|
+
if "components" in kwargs:
|
526
|
+
logger.warning(
|
527
|
+
"`components` as a keyword argument is deprecated, please use `slots` instead"
|
528
|
+
)
|
529
|
+
slots = kwargs.pop("components")
|
327
530
|
if kwargs:
|
328
|
-
|
329
|
-
|
330
|
-
"
|
331
|
-
"is_type, otype, minimal_set, ordered_set, maximal_set, "
|
332
|
-
"slot, validated_by, coerce_dtype"
|
531
|
+
valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Schema)])
|
532
|
+
raise FieldValidationError(
|
533
|
+
f"Only {valid_keywords} are valid keyword arguments"
|
333
534
|
)
|
535
|
+
(
|
536
|
+
features,
|
537
|
+
validated_kwargs,
|
538
|
+
optional_features,
|
539
|
+
features_registry,
|
540
|
+
flexible,
|
541
|
+
list_for_hashing,
|
542
|
+
) = self._validate_kwargs_calculate_hash(
|
543
|
+
features=features,
|
544
|
+
index=index,
|
545
|
+
slots=slots,
|
546
|
+
name=name,
|
547
|
+
description=description,
|
548
|
+
itype=itype,
|
549
|
+
flexible=flexible,
|
550
|
+
type=type,
|
551
|
+
is_type=is_type,
|
552
|
+
otype=otype,
|
553
|
+
dtype=dtype,
|
554
|
+
minimal_set=minimal_set,
|
555
|
+
ordered_set=ordered_set,
|
556
|
+
maximal_set=maximal_set,
|
557
|
+
coerce_dtype=coerce_dtype,
|
558
|
+
n_features=n_features,
|
559
|
+
)
|
560
|
+
schema = (
|
561
|
+
Schema.objects.using(using)
|
562
|
+
.filter(hash=validated_kwargs["hash"])
|
563
|
+
.one_or_none()
|
564
|
+
)
|
565
|
+
self._list_for_hashing = list_for_hashing
|
566
|
+
if schema is not None:
|
567
|
+
logger.important(f"returning existing schema with same hash: {schema}")
|
568
|
+
init_self_from_db(self, schema)
|
569
|
+
update_attributes(self, validated_kwargs)
|
570
|
+
self.optionals.set(optional_features)
|
571
|
+
return None
|
572
|
+
self._slots: dict[str, Schema] = {}
|
573
|
+
if features:
|
574
|
+
self._features = (get_related_name(features_registry), features) # type: ignore
|
575
|
+
elif slots:
|
576
|
+
for slot_key, component in slots.items():
|
577
|
+
if component._state.adding:
|
578
|
+
raise InvalidArgument(
|
579
|
+
f"schema for {slot_key} {component} must be saved before use"
|
580
|
+
)
|
581
|
+
self._slots = slots
|
582
|
+
if validated_kwargs["hash"] in KNOWN_SCHEMAS:
|
583
|
+
validated_kwargs["uid"] = KNOWN_SCHEMAS[validated_kwargs["hash"]]
|
584
|
+
else:
|
585
|
+
validated_kwargs["uid"] = ids.base62_16()
|
586
|
+
super().__init__(**validated_kwargs)
|
587
|
+
# manipulating aux fields is easier after calling super().__init__()
|
588
|
+
self.optionals.set(optional_features)
|
589
|
+
self.flexible = flexible
|
590
|
+
if index is not None:
|
591
|
+
self._index_feature_uid = index.uid
|
334
592
|
|
593
|
+
def _validate_kwargs_calculate_hash(
|
594
|
+
self,
|
595
|
+
features: list[Record],
|
596
|
+
index: Feature | None,
|
597
|
+
slots: dict[str, Schema],
|
598
|
+
name: str | None,
|
599
|
+
description: str | None,
|
600
|
+
itype: str | Record | DeferredAttribute | None,
|
601
|
+
flexible: bool | None,
|
602
|
+
type: Feature | None,
|
603
|
+
is_type: bool,
|
604
|
+
otype: str | None,
|
605
|
+
dtype: str | None,
|
606
|
+
minimal_set: bool,
|
607
|
+
ordered_set: bool,
|
608
|
+
maximal_set: bool,
|
609
|
+
coerce_dtype: bool,
|
610
|
+
n_features: int | None,
|
611
|
+
optional_features_manual: list[Feature] | None = None,
|
612
|
+
) -> tuple[list[Feature], dict[str, Any], list[Feature], Registry, bool, list[str]]:
|
613
|
+
optional_features = []
|
614
|
+
features_registry: Registry = None
|
615
|
+
if itype is not None:
|
616
|
+
if itype != "Composite":
|
617
|
+
itype = serialize_dtype(itype, is_itype=True)
|
618
|
+
if index is not None:
|
619
|
+
if not isinstance(index, Feature):
|
620
|
+
raise TypeError("index must be a Feature")
|
621
|
+
features.insert(0, index)
|
335
622
|
if features:
|
623
|
+
features, configs = get_features_config(features)
|
336
624
|
features_registry = validate_features(features)
|
337
625
|
itype_compare = features_registry.__get_name_with_module__()
|
338
626
|
if itype is not None:
|
339
|
-
assert itype
|
627
|
+
assert itype.startswith(itype_compare), str(itype_compare) # noqa: S101
|
340
628
|
else:
|
341
629
|
itype = itype_compare
|
630
|
+
if n_features is not None:
|
631
|
+
if n_features != len(features):
|
632
|
+
logger.important(f"updating to n {len(features)} features")
|
342
633
|
n_features = len(features)
|
343
|
-
|
634
|
+
if features_registry == Feature:
|
635
|
+
optional_features = [
|
636
|
+
config[0] for config in configs if config[1].get("optional")
|
637
|
+
]
|
638
|
+
if optional_features:
|
639
|
+
assert optional_features_manual is None # noqa: S101
|
640
|
+
if not optional_features and optional_features_manual is not None:
|
641
|
+
optional_features = optional_features_manual
|
642
|
+
elif n_features is None:
|
344
643
|
n_features = -1
|
345
644
|
if dtype is None:
|
346
645
|
dtype = None if itype is not None and itype == "Feature" else NUMBER_TYPE
|
347
646
|
else:
|
348
647
|
dtype = get_type_str(dtype)
|
349
|
-
|
350
|
-
if
|
648
|
+
flexible_default = n_features < 0
|
649
|
+
if flexible is None:
|
650
|
+
flexible = flexible_default
|
651
|
+
if slots:
|
351
652
|
itype = "Composite"
|
352
653
|
if otype is None:
|
353
654
|
raise InvalidArgument("Please pass otype != None for composite schemas")
|
@@ -359,8 +660,8 @@ class Schema(Record, CanCurate, TracksRun):
|
|
359
660
|
"name": name,
|
360
661
|
"description": description,
|
361
662
|
"type": type,
|
362
|
-
"dtype": dtype,
|
363
663
|
"is_type": is_type,
|
664
|
+
"dtype": dtype,
|
364
665
|
"otype": otype,
|
365
666
|
"n": n_features,
|
366
667
|
"itype": itype_str,
|
@@ -368,35 +669,68 @@ class Schema(Record, CanCurate, TracksRun):
|
|
368
669
|
"ordered_set": ordered_set,
|
369
670
|
"maximal_set": maximal_set,
|
370
671
|
}
|
672
|
+
n_features_default = -1
|
673
|
+
coerce_dtype_default = False
|
371
674
|
if coerce_dtype:
|
372
675
|
validated_kwargs["_aux"] = {"af": {"0": coerce_dtype}}
|
373
|
-
if
|
374
|
-
|
375
|
-
elif components:
|
376
|
-
hash = hash_set({component.hash for component in components.values()})
|
676
|
+
if slots:
|
677
|
+
list_for_hashing = [component.hash for component in slots.values()]
|
377
678
|
else:
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
679
|
+
HASH_CODE = {
|
680
|
+
"dtype": "a",
|
681
|
+
"itype": "b",
|
682
|
+
"minimal_set": "c",
|
683
|
+
"ordered_set": "d",
|
684
|
+
"maximal_set": "e",
|
685
|
+
"flexible": "f",
|
686
|
+
"coerce_dtype": "g",
|
687
|
+
"n": "h",
|
688
|
+
"optional": "i",
|
689
|
+
"features_hash": "j",
|
690
|
+
}
|
691
|
+
# we do not want purely informational annotations like name, type, is_type, otype to be part of the hash
|
692
|
+
hash_args = ["dtype", "itype", "minimal_set", "ordered_set", "maximal_set"]
|
693
|
+
list_for_hashing = [
|
694
|
+
f"{HASH_CODE[arg]}={validated_kwargs[arg]}"
|
695
|
+
for arg in hash_args
|
696
|
+
if validated_kwargs[arg] is not None
|
697
|
+
]
|
698
|
+
# only include in hash if not default so that it's backward compatible with records for which flexible was never set
|
699
|
+
if flexible != flexible_default:
|
700
|
+
list_for_hashing.append(f"{HASH_CODE['flexible']}={flexible}")
|
701
|
+
if coerce_dtype != coerce_dtype_default:
|
702
|
+
list_for_hashing.append(f"{HASH_CODE['coerce_dtype']}={coerce_dtype}")
|
703
|
+
if n_features != n_features_default:
|
704
|
+
list_for_hashing.append(f"{HASH_CODE['n']}={n_features}")
|
705
|
+
if features:
|
706
|
+
if optional_features:
|
707
|
+
feature_list_for_hashing = [
|
708
|
+
feature.uid
|
709
|
+
if feature not in set(optional_features)
|
710
|
+
else f"{feature.uid}({HASH_CODE['optional']})"
|
711
|
+
for feature in features
|
712
|
+
]
|
713
|
+
else:
|
714
|
+
feature_list_for_hashing = [feature.uid for feature in features]
|
715
|
+
# order matters if ordered_set is True
|
716
|
+
if ordered_set:
|
717
|
+
features_hash = hash_string(":".join(feature_list_for_hashing))
|
718
|
+
else:
|
719
|
+
features_hash = hash_string(
|
720
|
+
":".join(sorted(feature_list_for_hashing))
|
395
721
|
)
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
722
|
+
list_for_hashing.append(f"{HASH_CODE['features_hash']}={features_hash}")
|
723
|
+
self._list_for_hashing = sorted(list_for_hashing)
|
724
|
+
schema_hash = hash_string(":".join(self._list_for_hashing))
|
725
|
+
validated_kwargs["hash"] = schema_hash
|
726
|
+
return (
|
727
|
+
features,
|
728
|
+
validated_kwargs,
|
729
|
+
optional_features,
|
730
|
+
features_registry,
|
731
|
+
flexible,
|
732
|
+
list_for_hashing,
|
733
|
+
)
|
400
734
|
|
401
735
|
@classmethod
|
402
736
|
def from_values( # type: ignore
|
@@ -426,13 +760,18 @@ class Schema(Record, CanCurate, TracksRun):
|
|
426
760
|
Raises:
|
427
761
|
ValidationError: If some values are not valid.
|
428
762
|
|
429
|
-
|
763
|
+
Example:
|
764
|
+
|
765
|
+
::
|
430
766
|
|
431
|
-
|
432
|
-
|
767
|
+
import lamindb as ln
|
768
|
+
import bionty as bt
|
433
769
|
|
434
|
-
|
435
|
-
|
770
|
+
features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
|
771
|
+
schema = ln.Schema.from_values(features)
|
772
|
+
|
773
|
+
genes = ["ENSG00000139618", "ENSG00000198786"]
|
774
|
+
schema = ln.Schema.from_values(genes, bt.Gene.ensembl_gene_id, "float")
|
436
775
|
"""
|
437
776
|
if not isinstance(field, FieldAttr):
|
438
777
|
raise TypeError(
|
@@ -496,7 +835,7 @@ class Schema(Record, CanCurate, TracksRun):
|
|
496
835
|
df.columns, field=field, organism=organism
|
497
836
|
)
|
498
837
|
schema = Schema(
|
499
|
-
validated_features, name=name, dtype=None, otype="DataFrame"
|
838
|
+
list(validated_features), name=name, dtype=None, otype="DataFrame"
|
500
839
|
)
|
501
840
|
else:
|
502
841
|
dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
|
@@ -510,10 +849,9 @@ class Schema(Record, CanCurate, TracksRun):
|
|
510
849
|
source=source,
|
511
850
|
)
|
512
851
|
schema = Schema(
|
513
|
-
features=validated_features,
|
852
|
+
features=list(validated_features),
|
514
853
|
name=name,
|
515
854
|
dtype=get_type_str(dtype),
|
516
|
-
otype="DataFrame",
|
517
855
|
)
|
518
856
|
return schema
|
519
857
|
|
@@ -521,12 +859,50 @@ class Schema(Record, CanCurate, TracksRun):
|
|
521
859
|
"""Save."""
|
522
860
|
from .save import bulk_create
|
523
861
|
|
862
|
+
if not self._state.adding:
|
863
|
+
features = (
|
864
|
+
self._features[1]
|
865
|
+
if hasattr(self, "_features")
|
866
|
+
else (self.members.list() if self.members.exists() else [])
|
867
|
+
)
|
868
|
+
_, validated_kwargs, _, _, _, list_for_hashing = (
|
869
|
+
self._validate_kwargs_calculate_hash(
|
870
|
+
features=features, # type: ignore
|
871
|
+
index=None, # need to pass None here as otherwise counting double
|
872
|
+
slots=self._slots if hasattr(self, "_slots") else self.slots,
|
873
|
+
name=self.name,
|
874
|
+
description=self.description,
|
875
|
+
itype=self.itype,
|
876
|
+
flexible=self.flexible,
|
877
|
+
type=self.type,
|
878
|
+
is_type=self.is_type,
|
879
|
+
otype=self.otype,
|
880
|
+
dtype=self.dtype,
|
881
|
+
minimal_set=self.minimal_set,
|
882
|
+
ordered_set=self.ordered_set,
|
883
|
+
maximal_set=self.maximal_set,
|
884
|
+
coerce_dtype=self.coerce_dtype,
|
885
|
+
n_features=self.n,
|
886
|
+
optional_features_manual=self.optionals.get(),
|
887
|
+
)
|
888
|
+
)
|
889
|
+
if validated_kwargs["hash"] != self.hash:
|
890
|
+
from .artifact import Artifact
|
891
|
+
|
892
|
+
datasets = Artifact.filter(schema=self).all()
|
893
|
+
if datasets.exists():
|
894
|
+
logger.warning(
|
895
|
+
f"you updated the schema hash and might invalidate datasets that were previously validated with this schema: {datasets.list('uid')}"
|
896
|
+
)
|
897
|
+
self.hash = validated_kwargs["hash"]
|
898
|
+
self.n = validated_kwargs["n"]
|
899
|
+
self._list_for_hashing = list_for_hashing
|
524
900
|
super().save(*args, **kwargs)
|
525
|
-
if hasattr(self, "
|
901
|
+
if hasattr(self, "_slots"):
|
526
902
|
# analogous to save_schema_links in core._data.py
|
527
903
|
# which is called to save feature sets in artifact.save()
|
528
904
|
links = []
|
529
|
-
for slot, component in self.
|
905
|
+
for slot, component in self._slots.items():
|
530
906
|
kwargs = {
|
531
907
|
"composite_id": self.id,
|
532
908
|
"component_id": component.id,
|
@@ -536,12 +912,15 @@ class Schema(Record, CanCurate, TracksRun):
|
|
536
912
|
bulk_create(links, ignore_conflicts=True)
|
537
913
|
if hasattr(self, "_features"):
|
538
914
|
assert self.n > 0 # noqa: S101
|
915
|
+
using: bool | None = kwargs.pop("using", None)
|
539
916
|
related_name, records = self._features
|
540
917
|
# only the following method preserves the order
|
541
918
|
# .set() does not preserve the order but orders by
|
542
919
|
# the feature primary key
|
543
920
|
through_model = getattr(self, related_name).through
|
544
|
-
related_model_split = self.itype
|
921
|
+
related_model_split = parse_cat_dtype(self.itype, is_itype=True)[
|
922
|
+
"registry_str"
|
923
|
+
].split(".")
|
545
924
|
if len(related_model_split) == 1:
|
546
925
|
related_field = related_model_split[0].lower()
|
547
926
|
else:
|
@@ -551,16 +930,23 @@ class Schema(Record, CanCurate, TracksRun):
|
|
551
930
|
through_model(**{"schema_id": self.id, related_field_id: record.id})
|
552
931
|
for record in records
|
553
932
|
]
|
554
|
-
through_model.objects.bulk_create(links, ignore_conflicts=True)
|
933
|
+
through_model.objects.using(using).bulk_create(links, ignore_conflicts=True)
|
934
|
+
delattr(self, "_features")
|
555
935
|
return self
|
556
936
|
|
557
937
|
@property
|
558
938
|
def members(self) -> QuerySet:
|
559
|
-
"""A queryset for the individual records
|
939
|
+
"""A queryset for the individual records in the feature set underlying the schema.
|
940
|
+
|
941
|
+
Unlike `schema.features`, `schema.genes`, `schema.proteins`, etc., this queryset is ordered and
|
942
|
+
doesn't require knowledge of the entity.
|
943
|
+
"""
|
560
944
|
if self._state.adding:
|
561
945
|
# this should return a queryset and not a list...
|
562
946
|
# need to fix this
|
563
947
|
return self._features[1]
|
948
|
+
if self.itype == "Composite":
|
949
|
+
return Feature.objects.none()
|
564
950
|
related_name = self._get_related_name()
|
565
951
|
if related_name is None:
|
566
952
|
related_name = "features"
|
@@ -579,62 +965,108 @@ class Schema(Record, CanCurate, TracksRun):
|
|
579
965
|
|
580
966
|
@coerce_dtype.setter
def coerce_dtype(self, value: bool) -> None:
    """Store the `coerce_dtype` flag under key "0" of the "af" entry in `_aux`."""
    # `_aux` may be unset/empty; make sure there is a dict to write into
    if not self._aux:
        self._aux = {}
    aux_fields = self._aux.setdefault("af", {})
    aux_fields["0"] = value
|
970
|
+
|
971
|
+
@property
def flexible(self) -> bool:
    """Indicates how to handle validation and annotation in case features are not defined.

    The value is read from key "2" of the "af" entry in `_aux`. If it was never
    set, the default `self.n < 0` is returned.

    Examples:

        Make a rigid schema flexible::

            schema = ln.Schema.get(name="my_schema")
            schema.flexible = True
            schema.save()

        During schema creation::

            # if you're not passing features but just defining the itype, defaults to flexible = True
            schema = ln.Schema(itype=ln.Feature).save()
            assert schema.flexible

            # if you're passing features, defaults to flexible = False
            schema = ln.Schema(
                features=[ln.Feature(name="my_required_feature", dtype=int).save()],
            )
            assert not schema.flexible

            # you can also validate & annotate features in addition to those that you're explicitly defining:
            schema = ln.Schema(
                features=[ln.Feature(name="my_required_feature", dtype=int).save()],
                flexible=True,
            )
            assert schema.flexible

    """
    aux_fields = (self._aux or {}).get("af", {})
    if "2" in aux_fields:
        return aux_fields["2"]  # type: ignore
    # is the flexible default, needed for backward compat if flexible was never set
    return self.n < 0


@flexible.setter
def flexible(self, value: bool) -> None:
    """Store the `flexible` flag under key "2" of the "af" entry in `_aux`."""
    self._aux = self._aux or {}
    self._aux.setdefault("af", {})["2"] = value
|
612
1014
|
|
613
1015
|
@property
def index(self) -> None | Feature:
    """The feature configured to act as index.

    Looked up among `self.features` by the uid stored in `_index_feature_uid`;
    returns `None` if no uid is stored.

    To unset it, set `schema.index` to `None`.
    """
    index_uid = self._index_feature_uid
    if index_uid is None:
        return None
    return self.features.get(uid=index_uid)


@index.setter
def index(self, value: None | Feature) -> None:
    """Set or unset the index feature.

    Passing a feature adds it to `self.features` and stores its uid.
    Passing `None` removes the current index feature from `self.features`
    and clears the stored uid; it is a no-op if no index is configured.
    """
    if value is None:
        current_index = self.index
        if current_index is None:
            # nothing to unset: avoid removing `None` from the feature links
            # and clearing a uid that was never stored
            return
        self.features.remove(current_index)
        self._index_feature_uid = None
    else:
        self.features.add(value)
        self._index_feature_uid = value.uid
|
1035
|
+
|
1036
|
+
@property
def _index_feature_uid(self) -> None | str:
    """The uid of the index feature.

    Read from key "3" of the "af" entry in `_aux`; `None` if not stored.
    """
    if self._aux is not None and "af" in self._aux and "3" in self._aux["af"]:
        return self._aux["af"]["3"]
    return None


@_index_feature_uid.setter
def _index_feature_uid(self, value: str | None) -> None:
    """Store the index feature uid under key "3" of `_aux["af"]`, or drop it if `None`."""
    self._aux = self._aux or {}
    if value is None:
        # tolerate a missing entry: clearing an index uid that was never
        # stored must not raise KeyError
        self._aux.get("af", {}).pop("3", None)
    else:
        self._aux.setdefault("af", {})["3"] = value
|
621
1051
|
|
622
1052
|
@property
|
623
1053
|
def slots(self) -> dict[str, Schema]:
|
624
1054
|
"""Slots.
|
625
1055
|
|
626
|
-
Examples
|
1056
|
+
Examples:
|
627
1057
|
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
1058
|
+
::
|
1059
|
+
|
1060
|
+
# define composite schema
|
1061
|
+
anndata_schema = ln.Schema(
|
1062
|
+
name="small_dataset1_anndata_schema",
|
1063
|
+
otype="AnnData",
|
1064
|
+
slots={"obs": obs_schema, "var": var_schema},
|
1065
|
+
).save()
|
634
1066
|
|
635
|
-
|
636
|
-
|
637
|
-
|
1067
|
+
# access slots
|
1068
|
+
anndata_schema.slots
|
1069
|
+
# {'obs': <Schema: obs_schema>, 'var': <Schema: var_schema>}
|
638
1070
|
"""
|
639
1071
|
if hasattr(self, "_slots"):
|
640
1072
|
return self._slots
|
@@ -646,6 +1078,44 @@ class Schema(Record, CanCurate, TracksRun):
|
|
646
1078
|
return self._slots
|
647
1079
|
return {}
|
648
1080
|
|
1081
|
+
@property
def optionals(self) -> SchemaOptionals:
    """Manage optional features.

    Returns a `SchemaOptionals` object constructed from this schema.

    Example:

        ::

            # a schema with optional "sample_name"
            schema_optional_sample_name = ln.Schema(
                features=[
                    ln.Feature(name="sample_id", dtype=str).save(),  # required
                    ln.Feature(name="sample_name", dtype=str).save().with_config(optional=True),  # optional
                ],
            ).save()

            # raises ValidationError since `sample_id` is required
            ln.curators.DataFrameCurator(
                pd.DataFrame(
                    {
                        "sample_name": ["Sample 1", "Sample 2"],
                    }
                ),
                schema=schema_optional_sample_name,
            ).validate()

            # passes because an optional column is missing
            ln.curators.DataFrameCurator(
                pd.DataFrame(
                    {
                        "sample_id": ["sample1", "sample2"],
                    }
                ),
                schema=schema_optional_sample_name,
            ).validate()
    """
    return SchemaOptionals(self)
|
1118
|
+
|
649
1119
|
def describe(self, return_str=False) -> None | str:
|
650
1120
|
"""Describe schema."""
|
651
1121
|
message = str(self)
|
@@ -654,6 +1124,11 @@ class Schema(Record, CanCurate, TracksRun):
|
|
654
1124
|
message + "\nslots:"
|
655
1125
|
for slot, schema in self.slots.items():
|
656
1126
|
message += f"\n {slot}: " + str(schema)
|
1127
|
+
else:
|
1128
|
+
tree = describe_schema(self)
|
1129
|
+
return format_rich_tree(
|
1130
|
+
tree, fallback="no linked features", return_str=return_str
|
1131
|
+
)
|
657
1132
|
if return_str:
|
658
1133
|
return message
|
659
1134
|
else:
|
@@ -671,7 +1146,9 @@ def get_type_str(dtype: str | None) -> str | None:
|
|
671
1146
|
|
672
1147
|
def _get_related_name(self: Schema) -> str | None:
    """Look up the related name for the registry encoded in this schema's `itype`.

    The registry string is parsed from `self.itype` via `parse_cat_dtype` and
    used as key into the mapping returned by `dict_related_model_to_related_name`.

    Returns:
        The value stored for that registry string, or `None` when the mapping has
        no entry for it (the lookup uses `.get`, so a miss does not raise).
    """
    related_models = dict_related_model_to_related_name(self, instance=self._state.db)
    registry_str = parse_cat_dtype(self.itype, is_itype=True)["registry_str"]
    return related_models.get(registry_str)
|
676
1153
|
|
677
1154
|
|