lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +31 -26
- lamindb/_finish.py +9 -1
- lamindb/_tracked.py +26 -3
- lamindb/_view.py +2 -3
- lamindb/base/__init__.py +1 -1
- lamindb/base/ids.py +1 -10
- lamindb/base/users.py +1 -4
- lamindb/core/__init__.py +7 -65
- lamindb/core/_context.py +41 -10
- lamindb/core/_mapped_collection.py +4 -2
- lamindb/core/_settings.py +6 -6
- lamindb/core/_sync_git.py +1 -1
- lamindb/core/_track_environment.py +2 -1
- lamindb/core/datasets/_small.py +3 -3
- lamindb/core/loaders.py +22 -9
- lamindb/core/storage/_anndata_accessor.py +8 -3
- lamindb/core/storage/_backed_access.py +14 -7
- lamindb/core/storage/_pyarrow_dataset.py +24 -9
- lamindb/core/storage/_tiledbsoma.py +6 -4
- lamindb/core/storage/_zarr.py +32 -11
- lamindb/core/storage/objects.py +59 -26
- lamindb/core/storage/paths.py +16 -13
- lamindb/curators/__init__.py +173 -145
- lamindb/errors.py +1 -1
- lamindb/integrations/_vitessce.py +4 -4
- lamindb/migrations/0089_subsequent_runs.py +159 -0
- lamindb/migrations/0090_runproject_project_runs.py +73 -0
- lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
- lamindb/models/__init__.py +79 -0
- lamindb/{core → models}/_describe.py +3 -3
- lamindb/{core → models}/_django.py +8 -5
- lamindb/{core → models}/_feature_manager.py +103 -87
- lamindb/{_from_values.py → models/_from_values.py} +5 -2
- lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
- lamindb/{core → models}/_label_manager.py +10 -17
- lamindb/{core/relations.py → models/_relations.py} +8 -1
- lamindb/models/artifact.py +2601 -0
- lamindb/{_can_curate.py → models/can_curate.py} +349 -180
- lamindb/models/collection.py +683 -0
- lamindb/models/core.py +135 -0
- lamindb/models/feature.py +643 -0
- lamindb/models/flextable.py +163 -0
- lamindb/{_parents.py → models/has_parents.py} +55 -49
- lamindb/models/project.py +384 -0
- lamindb/{_query_manager.py → models/query_manager.py} +10 -8
- lamindb/{_query_set.py → models/query_set.py} +52 -30
- lamindb/models/record.py +1757 -0
- lamindb/models/run.py +563 -0
- lamindb/{_save.py → models/save.py} +18 -8
- lamindb/models/schema.py +732 -0
- lamindb/models/transform.py +360 -0
- lamindb/models/ulabel.py +249 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
- lamindb-1.2a2.dist-info/RECORD +94 -0
- lamindb/_artifact.py +0 -1361
- lamindb/_collection.py +0 -440
- lamindb/_feature.py +0 -316
- lamindb/_is_versioned.py +0 -40
- lamindb/_record.py +0 -1065
- lamindb/_run.py +0 -60
- lamindb/_schema.py +0 -347
- lamindb/_storage.py +0 -15
- lamindb/_transform.py +0 -170
- lamindb/_ulabel.py +0 -56
- lamindb/_utils.py +0 -9
- lamindb/base/validation.py +0 -63
- lamindb/core/_data.py +0 -491
- lamindb/core/fields.py +0 -12
- lamindb/models.py +0 -4435
- lamindb-1.1.0.dist-info/RECORD +0 -95
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
- {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
lamindb/models/schema.py
ADDED
@@ -0,0 +1,732 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Any, overload
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
from django.db import models
|
7
|
+
from django.db.models import CASCADE, PROTECT, ManyToManyField
|
8
|
+
from lamin_utils import logger
|
9
|
+
from lamindb_setup.core.hashing import HASH_LENGTH, hash_set
|
10
|
+
|
11
|
+
from lamindb.base import ids
|
12
|
+
from lamindb.base.fields import (
|
13
|
+
BooleanField,
|
14
|
+
CharField,
|
15
|
+
ForeignKey,
|
16
|
+
IntegerField,
|
17
|
+
JSONField,
|
18
|
+
)
|
19
|
+
from lamindb.base.types import FieldAttr, ListLike
|
20
|
+
from lamindb.errors import InvalidArgument
|
21
|
+
|
22
|
+
from ..base import deprecated
|
23
|
+
from ..errors import ValidationError
|
24
|
+
from ._relations import (
|
25
|
+
dict_related_model_to_related_name,
|
26
|
+
get_related_name,
|
27
|
+
)
|
28
|
+
from .can_curate import CanCurate
|
29
|
+
from .feature import (
|
30
|
+
Feature,
|
31
|
+
convert_pandas_dtype_to_lamin_dtype,
|
32
|
+
get_dtype_str_from_dtype,
|
33
|
+
)
|
34
|
+
from .record import (
|
35
|
+
BasicRecord,
|
36
|
+
LinkORM,
|
37
|
+
Record,
|
38
|
+
Registry,
|
39
|
+
init_self_from_db,
|
40
|
+
update_attributes,
|
41
|
+
)
|
42
|
+
from .run import Param, TracksRun, TracksUpdates
|
43
|
+
|
44
|
+
if TYPE_CHECKING:
|
45
|
+
from collections.abc import Iterable
|
46
|
+
|
47
|
+
import pandas as pd
|
48
|
+
from django.db.models.query_utils import DeferredAttribute
|
49
|
+
|
50
|
+
from .artifact import Artifact
|
51
|
+
from .project import Project
|
52
|
+
from .query_set import QuerySet
|
53
|
+
|
54
|
+
|
55
|
+
# dtype string used for schemas whose members are not `Feature` records
NUMBER_TYPE = "num"
# runtime class of a `dict.keys()` view, used for isinstance checks on inputs
DICT_KEYS_TYPE = type(dict().keys())  # type: ignore
|
57
|
+
|
58
|
+
|
59
|
+
def validate_features(features: list[Record]) -> Record:
    """Check a collection of feature records and return their shared class.

    Args:
        features: A non-empty, indexable collection of records that all have
            the same class and are no longer in the "adding" state.

    Returns:
        The single class shared by all passed records.

    Raises:
        ValueError: If `features` has no length, is empty, or contains a
            record whose `_state.adding` flag is set.
        TypeError: If `features` is not indexable, its first element is not a
            `Record`, or the records belong to more than one class.
    """
    try:
        n_features = len(features)
    except TypeError:
        # objects without a length are most likely a single record
        raise ValueError(
            "Please pass a ListLike of features, not a single feature"
        ) from None
    if n_features == 0:
        raise ValueError("Provide list of features with at least one element")
    if not hasattr(features, "__getitem__"):
        raise TypeError("features has to be list-like")
    first = features[0]
    if not isinstance(first, Record):
        raise TypeError(
            "features has to store feature records! use .from_values() otherwise"
        )
    feature_types = {record.__class__ for record in features}
    if len(feature_types) > 1:
        raise TypeError("schema can only contain a single type")
    if any(record._state.adding for record in features):
        raise ValueError("Can only construct feature sets from validated features")
    # exactly one class is left at this point
    (registry,) = feature_types
    return registry
|
81
|
+
|
82
|
+
|
83
|
+
class Schema(Record, CanCurate, TracksRun):
    """Schemas.

    The simplest schema is a feature set such as the set of columns of a `DataFrame`.

    A composite schema has multiple components, e.g., for an `AnnData`, one schema for `obs` and another one for `var`.

    Args:
        features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
            records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
            a set upon instantiation. If you'd like to pass values, use
            :meth:`~lamindb.Schema.from_values` or
            :meth:`~lamindb.Schema.from_df`.
        components: `dict[str, Schema] | None = None` A dictionary mapping component names to
            their corresponding :class:`~lamindb.Schema` objects for composite schemas.
        name: `str | None = None` A name.
        description: `str | None = None` A description.
        dtype: `str | None = None` The simple type. Defaults to
            `None` for sets of :class:`~lamindb.Feature` records.
            Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
        itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
        type: `Schema | None = None` A type.
        is_type: `bool = False` Distinguish types from instances of the type.
        otype: `str | None = None` An object type to define the structure of a composite schema.
        minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
        ordered_set: `bool = False` Whether features are required to be ordered.
        maximal_set: `bool = False` If `True`, no additional features are allowed.
        slot: `str | None = None` The slot name when this schema is used as a component in a
            composite schema.
        coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
            during validation, see :attr:`~lamindb.Schema.coerce_dtype`.

    .. dropdown:: Why does LaminDB model schemas, not just features?

        1. Performance: Imagine you measure the same panel of 20k transcripts in
           1M samples. By modeling the panel as a feature set, you can link all
           your artifacts against one feature set and only need to store 1M
           instead of 1M x 20k = 20B links.
        2. Interpretation: Model protein panels, gene panels, etc.
        3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.

        These reasons do not hold for label sets. Hence, LaminDB does not model label sets.

    Note:

        A feature set can be identified by the `hash` of its feature uids.
        It's stored in the `.hash` field.

        A `slot` provides a string key to access feature sets. For instance, for the schema of an
        `AnnData` object, it would be `'obs'` for `adata.obs`.

    See Also:
        :meth:`~lamindb.Schema.from_values`
            Create from values.
        :meth:`~lamindb.Schema.from_df`
            Create from dataframe columns.

    Examples:

        Create a schema (feature set) from df with types:

        >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
        >>> schema = ln.Schema.from_df(df)

        Create a schema (feature set) from features:

        >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
        >>> schema = ln.Schema(features)

        Create a schema (feature set) from identifier values:

        >>> import bionty as bt
        >>> schema = ln.Schema.from_values(adata.var["ensembl_id"], bt.Gene.ensembl_gene_id, organism="mouse").save()

    """

    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _name_field: str = "name"
    # maps keys of the `_aux["af"]` dict to (attribute name, type)
    _aux_fields: dict[str, tuple[str, type]] = {
        "0": ("coerce_dtype", bool),
        "1": ("_index_feature_uid", str),
    }

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(editable=False, unique=True, db_index=True, max_length=20)
    """A universal id, assigned from `ids.base62_20()` when a new schema is constructed.

    The hash of the feature identifiers is stored separately in `.hash`.
    """
    name: str | None = CharField(max_length=150, null=True, db_index=True)
    """A name."""
    description: str | None = CharField(null=True, db_index=True)
    """A description."""
    n = IntegerField()
    """Number of features in the set."""
    dtype: str | None = CharField(max_length=64, null=True, editable=False)
    """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.

    For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
    """
    itype: str | None = CharField(
        max_length=120, db_index=True, null=True, editable=False
    )
    """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.

    Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.

    .. versionchanged:: 1.0.0
        Was called `registry` before.
    """
    type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
    """Type of schema.

    Allows to group schemas by type, e.g., all measurements evaluating gene expression vs. protein expression vs. multimodal.

    You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`.

    Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`.
    """
    records: Schema
    """Records of this type."""
    is_type: bool = BooleanField(default=False, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    otype: str | None = CharField(max_length=64, db_index=True, null=True)
    """Default Python object type, e.g., DataFrame, AnnData."""
    hash: str | None = CharField(
        max_length=HASH_LENGTH, db_index=True, null=True, editable=False
    )
    """A hash of the set of feature identifiers.

    For a composite schema, the hash of hashes.
    """
    minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
    """Whether the schema contains a minimal set of linked features (default `True`).

    If `False`, no features are linked to this schema.

    If `True`, features are linked and considered as a minimally required set in validation.
    """
    ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
    """Whether features are required to be ordered (default `False`)."""
    maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
    """If `False`, additional features are allowed (default `False`).

    If `True`, the minimal set is a maximal set and no additional features are allowed.
    """
    components: Schema = ManyToManyField(
        "self", through="SchemaComponent", symmetrical=False, related_name="composites"
    )
    """Components of this schema."""
    composites: Schema
    """The composite schemas that contain this schema as a component.

    For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
    """
    features: Feature
    """The features contained in the schema."""
    params: Param
    """The params contained in the schema."""
    artifacts: Artifact
    """The artifacts that measure a feature set that matches this schema."""
    validated_artifacts: Artifact
    """The artifacts that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
    projects: Project
    """Linked projects."""
    _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
    # lamindb v2
    # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
    # -- the following two fields are dynamically removed from the API for now
    validated_by: Schema | None = ForeignKey(
        "self", PROTECT, related_name="validated_schemas", default=None, null=True
    )
    # """The schema that validated this schema during curation.

    # When performing validation, the schema that enforced validation is often less concrete than what is validated.

    # For instance, the set of measured features might be a superset of the minimally required set of features.
    # """
    # validated_schemas: Schema
    # """The schemas that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
    composite: Schema | None = ForeignKey(
        "self", PROTECT, related_name="+", default=None, null=True
    )
    # The legacy foreign key
    slot: str | None = CharField(max_length=100, db_index=True, null=True)
    # The legacy slot
|
270
|
+
|
271
|
+
@overload
def __init__(
    self,
    features: Iterable[Record] | None = None,
    components: dict[str, Schema] | None = None,
    name: str | None = None,
    description: str | None = None,
    dtype: str | None = None,
    itype: str | Registry | FieldAttr | None = None,
    type: Schema | None = None,
    is_type: bool = False,
    otype: str | None = None,
    minimal_set: bool = True,
    ordered_set: bool = False,
    maximal_set: bool = False,
    slot: str | None = None,
    coerce_dtype: bool = False,
): ...

@overload
def __init__(
    self,
    *db_args,
): ...

def __init__(
    self,
    *args,
    **kwargs,
):
    """Construct a schema from user arguments or from positional field values.

    When the number of positional arguments equals the number of concrete
    model fields, they are forwarded unchanged to the parent constructor
    (the `*db_args` overload). Otherwise at most one positional argument
    (`features`) is accepted and the remaining options are read from
    keyword arguments, see the first overload for their names and defaults.

    If a schema with the same hash is already stored, this instance is
    populated from it via `init_self_from_db` and `update_attributes`
    instead of creating a new record.

    Raises:
        ValueError: If more than one positional argument or an unknown
            keyword argument is passed.
        InvalidArgument: If `components` is passed without `otype`, or a
            component has not been saved yet.
    """
    if len(args) == len(self._meta.concrete_fields):
        super().__init__(*args, **kwargs)
        return None
    if len(args) > 1:
        raise ValueError("Only one non-keyword arg allowed: features")

    features: Iterable[Record] | None = (
        args[0] if args else kwargs.pop("features", [])
    )
    # typing here anticipates transitioning to a ManyToMany
    # between composites and components similar to feature_sets
    # in lamindb v2
    components: dict[str, Schema] = kwargs.pop("components", {})
    name: str | None = kwargs.pop("name", None)
    description: str | None = kwargs.pop("description", None)
    dtype: str | None = kwargs.pop("dtype", None)
    itype: str | Record | DeferredAttribute | None = kwargs.pop("itype", None)
    type: Feature | None = kwargs.pop("type", None)
    is_type: bool = kwargs.pop("is_type", False)
    otype: str | None = kwargs.pop("otype", None)
    minimal_set: bool = kwargs.pop("minimal_set", True)
    ordered_set: bool = kwargs.pop("ordered_set", False)
    maximal_set: bool = kwargs.pop("maximal_set", False)
    slot: str | None = kwargs.pop("slot", None)
    coerce_dtype: bool | None = kwargs.pop("coerce_dtype", None)

    if kwargs:
        # list exactly the keyword arguments that are popped above
        raise ValueError(
            f"Unexpected keyword arguments: {', '.join(kwargs.keys())}\n"
            "Valid arguments are: features, components, name, description, "
            "dtype, itype, type, is_type, otype, minimal_set, ordered_set, "
            "maximal_set, slot, coerce_dtype"
        )

    if features:
        features_registry = validate_features(features)
        itype_compare = features_registry.__get_name_with_module__()
        if itype is not None:
            assert itype == itype_compare, str(itype_compare)  # noqa: S101
        else:
            itype = itype_compare
        n_features = len(features)
    else:
        # -1 marks a schema without linked features
        n_features = -1
    if dtype is None:
        dtype = None if itype is not None and itype == "Feature" else NUMBER_TYPE
    else:
        dtype = get_type_str(dtype)
    if components:
        itype = "Composite"
        if otype is None:
            raise InvalidArgument("Please pass otype != None for composite schemas")
    if itype is not None and not isinstance(itype, str):
        itype_str = get_dtype_str_from_dtype(itype, is_itype=True)
    else:
        itype_str = itype
    validated_kwargs = {
        "name": name,
        "description": description,
        "type": type,
        "dtype": dtype,
        "is_type": is_type,
        "otype": otype,
        "n": n_features,
        "itype": itype_str,
        "minimal_set": minimal_set,
        "ordered_set": ordered_set,
        "maximal_set": maximal_set,
    }
    if coerce_dtype:
        validated_kwargs["_aux"] = {"af": {"0": coerce_dtype}}
    # the hash is computed before hash, slot and uid are added to the kwargs
    if features:
        hash = hash_set({feature.uid for feature in features})
    elif components:
        hash = hash_set({component.hash for component in components.values()})
    else:
        hash = hash_set({str(value) for value in validated_kwargs.values()})
    validated_kwargs["hash"] = hash
    validated_kwargs["slot"] = slot
    schema = Schema.filter(hash=hash).one_or_none()
    if schema is not None:
        logger.important(f"returning existing schema with same hash: {schema}")
        init_self_from_db(self, schema)
        update_attributes(self, validated_kwargs)
        return None
    # links are only written to the database in save()
    self._components: dict[str, Schema] = {}
    if features:
        self._features = (get_related_name(features_registry), features)  # type: ignore
    elif components:
        for component_slot, component in components.items():
            if component._state.adding:
                raise InvalidArgument(
                    f"component {component_slot} {component} must be saved before use"
                )
        self._components = components
        self._slots = components
    validated_kwargs["uid"] = ids.base62_20()
    super().__init__(**validated_kwargs)
|
400
|
+
|
401
|
+
@classmethod
def from_values(  # type: ignore
    cls,
    values: ListLike,
    field: FieldAttr = Feature.name,
    type: str | None = None,
    name: str | None = None,
    mute: bool = False,
    organism: Record | str | None = None,
    source: Record | None = None,
    raise_validation_error: bool = True,
) -> Schema | None:
    """Create feature set for validated features.

    Args:
        values: A list of values, like feature names or ids.
        field: The field of a reference registry to map values.
        type: The simple type.
            Defaults to `None` if reference registry is :class:`~lamindb.Feature`,
            defaults to `"num"` otherwise.
        name: A name.
        mute: Passed on to the registry's `validate()` call.
        organism: An organism to resolve gene mapping.
        source: A public ontology to resolve feature identifier mapping.
        raise_validation_error: Whether to raise a validation error if some values are not valid.

    Returns:
        The schema built from the validated values, or `None` if
        `raise_validation_error` is `False` and no value could be validated.

    Raises:
        TypeError: If `field` is not a record field.
        ValueError: If `values` is empty.
        ValidationError: If some values are not valid.

    Examples:

        >>> features = [ln.Feature(name=feat, dtype="str").save() for feat in ["feat11", "feat21"]]
        >>> schema = ln.Schema.from_values(["feat11", "feat21"])

        >>> genes = ["ENSG00000139618", "ENSG00000198786"]
        >>> schema = ln.Schema.from_values(genes, bt.Gene.ensembl_gene_id, "float")
    """
    if not isinstance(field, FieldAttr):
        raise TypeError(
            "Argument `field` must be a Record field, e.g., `Feature.name`"
        )
    # len() rather than truthiness: array-like inputs have no unambiguous bool
    if len(values) == 0:
        raise ValueError("Provide a list of at least one value")
    if isinstance(values, DICT_KEYS_TYPE):
        values = list(values)
    registry = field.field.model
    if registry != Feature and type is None:
        type = NUMBER_TYPE
        logger.debug("setting feature set to 'number'")
    validated = registry.validate(values, field=field, mute=mute, organism=organism)
    values_array = np.array(values)
    validated_values = values_array[validated]
    if validated.sum() != len(values):
        not_validated_values = values_array[~validated]
        msg = (
            f"These values could not be validated: {not_validated_values.tolist()}\n"
            f"If there are no typos, add them to their registry: {registry.__name__}"
        )
        if raise_validation_error:
            raise ValidationError(msg)
        elif len(validated_values) == 0:
            return None  # temporarily return None here
    # from here on only the validated subset of values is used
    validated_features = registry.from_values(
        validated_values,
        field=field,
        organism=organism,
        source=source,
    )
    schema = Schema(
        features=validated_features,
        name=name,
        dtype=get_type_str(type),
    )
    return schema
|
474
|
+
|
475
|
+
@classmethod
def from_df(
    cls,
    df: pd.DataFrame,
    field: FieldAttr = Feature.name,
    name: str | None = None,
    mute: bool = False,
    organism: Record | str | None = None,
    source: Record | None = None,
) -> Schema | None:
    """Create feature set for validated features.

    Args:
        df: A dataframe whose column labels are validated against `field`.
        field: The field of a reference registry to map column labels.
        name: A name.
        mute: Whether to silence logging; also passed on to `validate()`.
        organism: Passed on to the registry's `validate()` and `from_values()`.
        source: Passed on to `from_values()` for registries other than
            :class:`~lamindb.Feature`.

    Returns:
        A schema with `otype="DataFrame"`, or `None` if no column label
        could be validated.

    Raises:
        ValueError: If the registry is not :class:`~lamindb.Feature` and the
            validated columns have more than one dtype.
    """
    registry = field.field.model
    validated = registry.validate(
        df.columns, field=field, mute=mute, organism=organism
    )
    if validated.sum() == 0:
        # muting suppresses the warning, it must not be what triggers it
        if not mute:
            logger.warning("no validated features, skip creating feature set")
        return None
    if registry == Feature:
        validated_features = Feature.from_values(  # type: ignore
            df.columns, field=field, organism=organism
        )
        schema = Schema(
            validated_features, name=name, dtype=None, otype="DataFrame"
        )
    else:
        # a single dtype is stored on the schema, so columns must agree
        dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
        if len(set(dtypes)) != 1:
            raise ValueError(f"data types are heterogeneous: {set(dtypes)}")
        dtype = convert_pandas_dtype_to_lamin_dtype(dtypes[0])
        validated_features = registry.from_values(
            df.columns[validated],
            field=field,
            organism=organism,
            source=source,
        )
        schema = Schema(
            features=validated_features,
            name=name,
            dtype=get_type_str(dtype),
            otype="DataFrame",
        )
    return schema
|
519
|
+
|
520
|
+
def save(self, *args, **kwargs) -> Schema:
    """Save the schema, then write its pending component and feature links.

    Links are created from the `_components` and `_features` attributes when
    they are present on the instance; conflicting rows are ignored.

    Returns:
        The schema itself.
    """
    from .save import bulk_create

    super().save(*args, **kwargs)
    if hasattr(self, "_components"):
        # analogous to save_schema_links in core._data.py
        # which is called to save feature sets in artifact.save()
        component_link_model = Schema.components.through
        component_links = [
            component_link_model(
                composite_id=self.id, component_id=component.id, slot=slot
            )
            for slot, component in self._components.items()
        ]
        bulk_create(component_links, ignore_conflicts=True)
    if hasattr(self, "_features"):
        assert self.n > 0  # noqa: S101
        related_name, records = self._features
        # only the following method preserves the order
        # .set() does not preserve the order but orders by
        # the feature primary key
        through_model = getattr(self, related_name).through
        itype_parts = self.itype.split(".")
        # "Feature" -> "feature", "bionty.Gene" -> "gene"
        model_part = itype_parts[0] if len(itype_parts) == 1 else itype_parts[1]
        related_field_id = f"{model_part.lower()}_id"
        feature_links = []
        for record in records:
            link_kwargs = {"schema_id": self.id, related_field_id: record.id}
            feature_links.append(through_model(**link_kwargs))
        through_model.objects.bulk_create(feature_links, ignore_conflicts=True)
    return self
|
556
|
+
|
557
|
+
@property
def members(self) -> QuerySet:
    """A queryset for the individual records of the set.

    For a schema that is still in the "adding" state, the records stored in
    `_features` are returned as they were passed.
    """
    if self._state.adding:
        # this should return a queryset and not a list...
        # need to fix this
        _, pending_records = self._features
        return pending_records
    looked_up = self._get_related_name()
    accessor = "features" if looked_up is None else looked_up
    manager = self.__getattribute__(accessor)
    return manager.order_by("links_schema__id")
|
568
|
+
|
569
|
+
@property
def coerce_dtype(self) -> bool:
    """Whether dtypes should be coerced during validation.

    For example, an `object`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.

    The flag is stored under `_aux["af"]["0"]`; `False` is returned when it
    has not been set.
    """
    if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:  # type: ignore
        return self._aux["af"]["0"]  # type: ignore
    return False

@coerce_dtype.setter
def coerce_dtype(self, value: bool) -> None:
    # create the nested containers on first use
    if self._aux is None:  # type: ignore
        self._aux = {}  # type: ignore
    if "af" not in self._aux:
        self._aux["af"] = {}
    self._aux["af"]["0"] = value
|
595
|
+
|
596
|
+
# @property
|
597
|
+
# def index_feature(self) -> None | Feature:
|
598
|
+
# # index_feature: `Record | None = None` A :class:`~lamindb.Feature` to validate the index of a `DataFrame`.
|
599
|
+
# """The uid of the index feature, if `index_feature` was set."""
|
600
|
+
# if self._index_feature_uid is None:
|
601
|
+
# return None
|
602
|
+
# else:
|
603
|
+
# return self.features.get(uid=self._index_feature_uid)
|
604
|
+
|
605
|
+
# @property
|
606
|
+
# def _index_feature_uid(self) -> None | str:
|
607
|
+
# """The uid of the index feature, if `index_feature` was set."""
|
608
|
+
# if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
|
609
|
+
# return self._aux["af"]["1"]
|
610
|
+
# else:
|
611
|
+
# return None
|
612
|
+
|
613
|
+
# @_index_feature_uid.setter
|
614
|
+
# def _index_feature_uid(self, value: str) -> None:
|
615
|
+
# if self._aux is None:
|
616
|
+
# self._aux = {}
|
617
|
+
# if "af" not in self._aux:
|
618
|
+
# self._aux["af"] = {}
|
619
|
+
# self._aux["af"]["1"] = value
|
620
|
+
|
621
|
+
# `registry` is kept as a deprecated alias that reads and writes `itype`
@property
@deprecated("itype")
def registry(self) -> str:
    return self.itype

@registry.setter
def registry(self, value) -> None:
    # writes through to the new field name
    self.itype = value
|
629
|
+
|
630
|
+
@property
def slots(self) -> dict[str, Schema]:
    """Slots.

    Maps slot names to component schemas. The mapping is cached on the
    instance in `_slots`; schemas whose `itype` is not `"Composite"` have no
    slots.

    Examples::

        # define composite schema
        anndata_schema = ln.Schema(
            name="small_dataset1_anndata_schema",
            otype="AnnData",
            components={"obs": obs_schema, "var": var_schema},
        ).save()

        # access slots
        anndata_schema.slots
        # {'obs': <Schema: obs_schema>, 'var': <Schema: var_schema>}
    """
    if hasattr(self, "_slots"):
        return self._slots
    if self.itype != "Composite":
        return {}
    component_links = self.components.through.filter(composite_id=self.id).all()
    self._slots = {link.slot: link.component for link in component_links}
    return self._slots
|
656
|
+
|
657
|
+
def describe(self, return_str=False) -> None | str:
    """Describe schema.

    Args:
        return_str: If `True`, return the description instead of printing it.

    Returns:
        The description if `return_str` is `True`, otherwise `None`.
    """
    message = str(self)
    # display slots for composite schemas
    if self.itype == "Composite":
        # in-place concatenation, a bare `message + ...` would be discarded
        message += "\nslots:"
        for slot, schema in self.slots.items():
            message += f"\n {slot}: " + str(schema)
    if return_str:
        return message
    print(message)
    return None
|
670
|
+
|
671
|
+
|
672
|
+
def get_type_str(dtype: str | None) -> str | None:
    """Return `dtype` as a string.

    Strings and `None` are passed through unchanged; any other object is
    represented by its `__name__`.
    """
    if dtype is None:
        return None
    if isinstance(dtype, str):
        return dtype
    return dtype.__name__  # type: ignore
|
678
|
+
|
679
|
+
|
680
|
+
def _get_related_name(self: Schema) -> str | None:
    """Look up the related name registered for this schema's `itype`.

    Returns:
        The related name, or `None` if `itype` is not a key of the mapping
        returned by `dict_related_model_to_related_name`.
    """
    related_models = dict_related_model_to_related_name(self, instance=self._state.db)
    return related_models.get(self.itype)
|
684
|
+
|
685
|
+
|
686
|
+
# Link model between Schema and Feature.
# Each (schema, feature) pair may be stored only once.
class SchemaFeature(BasicRecord, LinkORM):
    id: int = models.BigAutoField(primary_key=True)
    # reverse accessors: `Schema.links_feature` and `Feature.links_schema`
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature")
    feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema")

    class Meta:
        unique_together = ("schema", "feature")
|
693
|
+
|
694
|
+
|
695
|
+
# Link model between Schema and Param.
# Each (schema, param) pair may be stored only once.
class SchemaParam(BasicRecord, LinkORM):
    id: int = models.BigAutoField(primary_key=True)
    # related_name="+" means no reverse accessor is created on either side
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="+")
    param: Param = ForeignKey(Param, PROTECT, related_name="+")

    class Meta:
        unique_together = ("schema", "param")
|
702
|
+
|
703
|
+
|
704
|
+
# Link model between Artifact and Schema, optionally keyed by a slot name.
# Per artifact, each schema and each slot may be linked only once.
class ArtifactSchema(BasicRecord, LinkORM, TracksRun):
    id: int = models.BigAutoField(primary_key=True)
    # "Artifact" is referenced by name, the class is only imported for typing
    artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="_links_schema")
    schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact")
    slot: str | None = CharField(null=True)
    feature_ref_is_semantic: bool | None = BooleanField(null=True)

    class Meta:
        unique_together = (("artifact", "schema"), ("artifact", "slot"))
|
713
|
+
|
714
|
+
|
715
|
+
# Link model between a composite Schema and one of its component Schemas.
# Per composite, each component and each slot may be linked only once.
class SchemaComponent(BasicRecord, LinkORM, TracksRun):
    id: int = models.BigAutoField(primary_key=True)
    # both foreign keys point to Schema, the related names tell them apart
    composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_composite")
    component: Schema = ForeignKey(Schema, PROTECT, related_name="links_component")
    slot: str | None = CharField(null=True)

    class Meta:
        unique_together = (("composite", "component"), ("composite", "slot"))
|
723
|
+
|
724
|
+
|
725
|
+
# expose the module-level helper as a method on Schema
Schema._get_related_name = _get_related_name
# excluded on docs via
# https://github.com/laminlabs/lndocs/blob/8c1963de65445107ea69b3fd59354c3828e067d1/lndocs/lamin_sphinx/__init__.py#L584-L588
# we don't want to expose these
for _hidden_name in (
    "validated_by",
    "validated_by_id",
    "validated_schemas",
    "composite",
    "composite_id",
):
    delattr(Schema, _hidden_name)
# do not leave the loop variable behind as a module attribute
del _hidden_name
|