lamindb 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +30 -25
- lamindb/_tracked.py +1 -1
- lamindb/_view.py +2 -3
- lamindb/base/__init__.py +1 -1
- lamindb/base/ids.py +1 -10
- lamindb/core/__init__.py +7 -65
- lamindb/core/_compat.py +60 -0
- lamindb/core/_context.py +43 -20
- lamindb/core/_settings.py +6 -6
- lamindb/core/_sync_git.py +1 -1
- lamindb/core/loaders.py +30 -19
- lamindb/core/storage/_backed_access.py +4 -2
- lamindb/core/storage/_tiledbsoma.py +8 -6
- lamindb/core/storage/_zarr.py +104 -25
- lamindb/core/storage/objects.py +63 -28
- lamindb/core/storage/paths.py +4 -1
- lamindb/core/types.py +10 -0
- lamindb/curators/__init__.py +100 -85
- lamindb/errors.py +1 -1
- lamindb/integrations/_vitessce.py +4 -4
- lamindb/migrations/0089_subsequent_runs.py +159 -0
- lamindb/migrations/0090_runproject_project_runs.py +73 -0
- lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
- lamindb/models/__init__.py +79 -0
- lamindb/{core → models}/_describe.py +3 -3
- lamindb/{core → models}/_django.py +8 -5
- lamindb/{core → models}/_feature_manager.py +103 -87
- lamindb/{_from_values.py → models/_from_values.py} +5 -2
- lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
- lamindb/{core → models}/_label_manager.py +10 -17
- lamindb/{core/relations.py → models/_relations.py} +8 -1
- lamindb/models/artifact.py +2602 -0
- lamindb/{_can_curate.py → models/can_curate.py} +349 -180
- lamindb/models/collection.py +683 -0
- lamindb/models/core.py +135 -0
- lamindb/models/feature.py +643 -0
- lamindb/models/flextable.py +163 -0
- lamindb/{_parents.py → models/has_parents.py} +55 -49
- lamindb/models/project.py +384 -0
- lamindb/{_query_manager.py → models/query_manager.py} +10 -8
- lamindb/{_query_set.py → models/query_set.py} +40 -26
- lamindb/models/record.py +1762 -0
- lamindb/models/run.py +563 -0
- lamindb/{_save.py → models/save.py} +9 -7
- lamindb/models/schema.py +732 -0
- lamindb/models/transform.py +360 -0
- lamindb/models/ulabel.py +249 -0
- {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
- {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/RECORD +51 -51
- lamindb/_artifact.py +0 -1379
- lamindb/_collection.py +0 -440
- lamindb/_feature.py +0 -316
- lamindb/_is_versioned.py +0 -40
- lamindb/_record.py +0 -1064
- lamindb/_run.py +0 -60
- lamindb/_schema.py +0 -347
- lamindb/_storage.py +0 -15
- lamindb/_transform.py +0 -170
- lamindb/_ulabel.py +0 -56
- lamindb/_utils.py +0 -9
- lamindb/base/validation.py +0 -63
- lamindb/core/_data.py +0 -491
- lamindb/core/fields.py +0 -12
- lamindb/models.py +0 -4475
- {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
- {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,643 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import importlib
|
4
|
+
from typing import TYPE_CHECKING, Any, get_args, overload
|
5
|
+
|
6
|
+
import pandas as pd
|
7
|
+
from django.db import models
|
8
|
+
from django.db.models import CASCADE, PROTECT, Q
|
9
|
+
from django.db.models.query_utils import DeferredAttribute
|
10
|
+
from django.db.utils import IntegrityError
|
11
|
+
from lamin_utils import logger
|
12
|
+
from lamindb_setup._init_instance import get_schema_module_name
|
13
|
+
from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict
|
14
|
+
from pandas.api.types import CategoricalDtype, is_string_dtype
|
15
|
+
|
16
|
+
from lamindb.base.fields import (
|
17
|
+
BooleanField,
|
18
|
+
CharField,
|
19
|
+
ForeignKey,
|
20
|
+
JSONField,
|
21
|
+
TextField,
|
22
|
+
)
|
23
|
+
from lamindb.base.types import FeatureDtype, FieldAttr
|
24
|
+
from lamindb.errors import FieldValidationError, ValidationError
|
25
|
+
|
26
|
+
from ..base.ids import base62_12
|
27
|
+
from ._relations import dict_module_name_to_model_name
|
28
|
+
from .can_curate import CanCurate
|
29
|
+
from .query_set import RecordList
|
30
|
+
from .record import BasicRecord, Record, Registry, _get_record_kwargs
|
31
|
+
from .run import (
|
32
|
+
TracksRun,
|
33
|
+
TracksUpdates,
|
34
|
+
)
|
35
|
+
|
36
|
+
if TYPE_CHECKING:
|
37
|
+
from collections.abc import Iterable
|
38
|
+
|
39
|
+
from pandas.core.dtypes.base import ExtensionDtype
|
40
|
+
|
41
|
+
from .schema import Schema
|
42
|
+
|
43
|
+
# Names of the simple (non-composed) dtypes, taken from the type arguments of
# `FeatureDtype` (presumably a `typing.Literal` -- confirm in lamindb.base.types).
# NOTE: this is a mutable module-level set shared by every function below.
FEATURE_DTYPES = set(get_args(FeatureDtype))
|
44
|
+
|
45
|
+
|
46
|
+
def parse_dtype_single_cat(
    dtype_str: str,
    related_registries: dict[str, Record] | None = None,
    is_itype: bool = False,
) -> dict:
    """Parse a single component of a composed categorical dtype.

    Shapes handled by the code below:

    - `Registry` or `Registry.field`, e.g., `ULabel` or `ULabel.name`
    - `module.Registry` or `module.Registry.field`, e.g., `bionty.CellType.name`
    - `Registry[SubType]` or `Registry[SubType].field`

    Args:
        dtype_str: One component of a categorical dtype, i.e., without the
            surrounding `cat[...]` and without `|`.
        related_registries: Mapping from registry string to registry class that
            is consulted when `is_itype` is `False`. Built from `Artifact` if
            not passed.
        is_itype: If `True`, resolve the registry by importing its module
            rather than by looking it up in `related_registries`.

    Returns:
        A dict with the keys `registry`, `registry_str`, `subtype_str`,
        `field_str` and `field`.

    Raises:
        ValidationError: If `dtype_str` is malformed or, for `is_itype=False`,
            names a registry that is not in `related_registries`.
    """
    assert isinstance(dtype_str, str)  # noqa: S101
    if related_registries is None:
        # imported lazily: only needed to build the default lookup table
        from .artifact import Artifact

        related_registries = dict_module_name_to_model_name(Artifact)
    malformed_message = (
        f"'{dtype_str}' is a malformed dtype, expected e.g. ULabel, ULabel.name, "
        "bionty.CellType, bionty.CellType.name or ULabel[SubType].name"
    )
    bracket_split = dtype_str.split("[")
    sub_type_str = ""
    if len(bracket_split) == 2:
        # has sub type
        registry_str = bracket_split[0]
        assert "]" in bracket_split[1]  # noqa: S101
        sub_type_field_split = bracket_split[1].split("].")
        if len(sub_type_field_split) == 1:
            sub_type_str = sub_type_field_split[0].strip("]")
            field_str = ""
        else:
            sub_type_str = sub_type_field_split[0]
            field_str = sub_type_field_split[1]
    elif len(bracket_split) == 1:
        parts = bracket_split[0].split(".")
        if "" in parts:
            # e.g. "ULabel." or "": previously an IndexError or an obscure failure
            raise ValidationError(malformed_message)
        if (len(parts) == 2 and parts[1][0].isupper()) or len(parts) == 3:
            # bionty.CellType or bionty.CellType.name
            registry_str = f"{parts[0]}.{parts[1]}"
            field_str = "" if len(parts) == 2 else parts[2]
        else:
            # ULabel or ULabel.name
            registry_str = parts[0]
            field_str = "" if len(parts) == 1 else parts[1]
    else:
        # more than one "[": registry_str/field_str would otherwise be unbound
        raise ValidationError(malformed_message)
    if not is_itype:
        if registry_str not in related_registries:
            raise ValidationError(
                f"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType"
            )
        registry = related_registries[registry_str]
    else:
        if "." in registry_str:
            registry_str_split = registry_str.split(".")
            assert len(registry_str_split) == 2, registry_str  # noqa: S101
            module_name, class_name = registry_str_split
            module_name = get_schema_module_name(module_name)
        else:
            module_name, class_name = "lamindb", registry_str
        module = importlib.import_module(module_name)
        registry = getattr(module, class_name)
    # TODO: validate that sub_type_str is a record in the registry with is_type = True
    # TODO: validate that field_str is an actual field of the registry
    if field_str == "":
        # fall back to the registry's name field
        field_str = getattr(registry, "_name_field", "name")
    return {
        "registry": registry,  # should be typed as CanCurate
        "registry_str": registry_str,
        "subtype_str": sub_type_str,
        "field_str": field_str,
        "field": getattr(registry, field_str),
    }
|
116
|
+
|
117
|
+
|
118
|
+
def parse_dtype(dtype_str: str, is_param: bool = False) -> list[dict[str, Any]]:
    """Validate a dtype string and parse its categorical components.

    Args:
        dtype_str: Either a simple dtype from `FEATURE_DTYPES` or a composed
            categorical dtype of the form `cat[Registry1|Registry2.field|...]`.
        is_param: If `True`, `"dict"` is accepted in addition to
            `FEATURE_DTYPES`.

    Returns:
        One dict per categorical component as produced by
        `parse_dtype_single_cat()`. The list is empty for simple dtypes and
        for `cat[]`.

    Raises:
        ValueError: If `dtype_str` is neither a composed categorical dtype nor
            one of the allowed simple dtypes.
    """
    # build a new set rather than mutating the shared module-level FEATURE_DTYPES,
    # otherwise "dict" would stay allowed for all later calls
    allowed_dtypes = FEATURE_DTYPES | {"dict"} if is_param else FEATURE_DTYPES
    result: list[dict[str, Any]] = []
    if dtype_str.startswith("cat[") and dtype_str.endswith("]"):
        # imported lazily: only needed for composed categorical dtypes
        from .artifact import Artifact

        related_registries = dict_module_name_to_model_name(Artifact)
        # strip only the leading "cat[" and the trailing "]"
        registries_str = dtype_str[len("cat[") : -1]
        if registries_str != "":
            result = [
                parse_dtype_single_cat(cat_single_dtype_str, related_registries)
                for cat_single_dtype_str in registries_str.split("|")
            ]
    elif dtype_str not in allowed_dtypes:
        raise ValueError(
            f"dtype is '{dtype_str}' but has to be one of {allowed_dtypes}!"
        )
    return result
|
141
|
+
|
142
|
+
|
143
|
+
def get_dtype_str_from_dtype(dtype: Any, is_itype: bool = False) -> str:
    """Serialize a dtype object into its string representation.

    Args:
        dtype: An object whose `__name__` is in `FEATURE_DTYPES`, a registry,
            a registry field, or a list of registries and/or registry fields.
        is_itype: If `True`, the bar-separated registry string is returned
            without the enclosing `cat[...]`.

    Returns:
        The `__name__` for simple dtypes, otherwise the bar-separated registry
        (and field) names, wrapped in `cat[...]` unless `is_itype` is set.

    Raises:
        ValueError: If `dtype`, or an element of a list `dtype`, is neither a
            registry nor a registry field.
    """
    is_simple_dtype = (
        not isinstance(dtype, list)
        and hasattr(dtype, "__name__")
        and dtype.__name__ in FEATURE_DTYPES
    )
    if is_simple_dtype:
        return dtype.__name__

    error_message = (
        "dtype has to be a record, a record field, or a list of records, not {}"
    )
    if isinstance(dtype, (Registry, DeferredAttribute)):
        candidates = [dtype]
    elif isinstance(dtype, list):
        candidates = dtype
    else:
        raise ValueError(error_message.format(dtype))

    components = []
    for candidate in candidates:
        if isinstance(candidate, Registry):
            components.append(candidate.__get_name_with_module__())
        elif isinstance(candidate, DeferredAttribute):
            model_name = candidate.field.model.__get_name_with_module__()
            components.append(model_name + f".{candidate.field.name}")
        else:
            raise ValueError(error_message.format(candidate))

    joined = "|".join(components)
    return joined if is_itype else f"cat[{joined}]"
|
178
|
+
|
179
|
+
|
180
|
+
def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str:
    """Map a pandas dtype onto one of the names in `FEATURE_DTYPES`.

    Categorical dtypes (string-like or not) map to `"cat"`, other string
    dtypes to `"str"`. For everything else the dtype name is used with its
    digits removed; a `datetime...` name is additionally cut at the first `[`.
    """
    if isinstance(pandas_dtype, CategoricalDtype):
        # covers string-like categoricals and "pure" categoricals (pd.Categorical)
        lamin_dtype = "cat"
    elif is_string_dtype(pandas_dtype):
        lamin_dtype = "str"
    else:
        # strip precision qualifiers
        lamin_dtype = "".join(
            char for char in pandas_dtype.name if not char.isdigit()
        )
    if lamin_dtype.startswith("datetime"):
        lamin_dtype = lamin_dtype.partition("[")[0]
    assert lamin_dtype in FEATURE_DTYPES  # noqa: S101
    return lamin_dtype
|
196
|
+
|
197
|
+
|
198
|
+
def process_init_feature_param(args, kwargs, is_param: bool = False):
    """Normalize the keyword arguments of the user-facing constructor.

    Pops `name`, `dtype`, `is_type`, `type` and `description` from `kwargs`,
    rejects anything left over, and writes the normalized values back into the
    same dict. `description` is only written back if `is_param` is `False`.

    Args:
        args: Positional arguments of the constructor; must be empty.
        kwargs: Keyword arguments of the constructor; mutated in place.
        is_param: Forwarded to `parse_dtype()`.

    Returns:
        The mutated `kwargs` with `dtype` serialized to a string (or `None`).

    Raises:
        ValueError: If positional arguments were passed.
        FieldValidationError: If unexpected keyword arguments remain.
        ValidationError: If no `dtype` was passed and `is_type` is falsy.
    """
    # now we proceed with the user-facing constructor
    if args:
        raise ValueError("Only keyword args allowed")
    name: str = kwargs.pop("name", None)
    dtype: type | str | None = kwargs.pop("dtype", None)
    is_type: bool = kwargs.pop("is_type", None)
    type_: Feature | str | None = kwargs.pop("type", None)
    description: str | None = kwargs.pop("description", None)
    # NOTE(review): anything still left in kwargs is rejected here, which includes
    # `unit` and `synonyms` although Feature.__init__ documents them -- confirm intent
    if kwargs:
        valid_keywords = ", ".join(
            record_kwarg[0] for record_kwarg in _get_record_kwargs(Feature)
        )
        raise FieldValidationError(f"Only {valid_keywords} are valid keyword arguments")
    kwargs.update(name=name, type=type_, is_type=is_type)
    if not is_param:
        kwargs["description"] = description
    # cast dtype
    if dtype is None:
        if not is_type:
            raise ValidationError(
                f"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype"
            )
        dtype_str = None
    elif isinstance(dtype, str):
        dtype_str = dtype
        # user-provided strings are validated
        parse_dtype(dtype_str, is_param=is_param)
    else:
        dtype_str = get_dtype_str_from_dtype(dtype)
    kwargs["dtype"] = dtype_str
    return kwargs
|
229
|
+
|
230
|
+
|
231
|
+
class Feature(Record, CanCurate, TracksRun, TracksUpdates):
    """Dataset dimensions.

    A feature represents a dimension of a dataset, such as a column in a
    `DataFrame`. The `Feature` registry organizes metadata of features.

    The `Feature` registry helps you organize and query datasets based on their
    features and corresponding label annotations. For instance, when working
    with a "T cell" label, it could be measured through different features
    such as `"cell_type_by_expert"` where an expert manually classified the
    cell, or `"cell_type_by_model"` where a computational model made the
    classification.

    The two most important metadata of a feature are its `name` and the `dtype`.
    In addition to typical data types, LaminDB has a `"num"` `dtype` to
    concisely denote the union of all numerical types.

    Args:
        name: `str` Name of the feature, typically a column name.
        dtype: `FeatureDtype | Registry | list[Registry] | FieldAttr` See :class:`~lamindb.base.types.FeatureDtype`.
            For categorical types, can define from which registry values are
            sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`.
        unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc.
        description: `str | None = None` A description.
        synonyms: `str | None = None` Bar-separated synonyms.
        nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`.
        default_value: `Any | None = None` Default value for the feature.
        cat_filters: `dict[str, str] | None = None` Subset a registry by additional filters to define valid categories.

    Note:

        For more control, you can use :mod:`bionty` registries to manage simple
        biological entities like genes, proteins & cell markers. Or you define
        custom registries to manage high-level derived features like gene sets.

    See Also:
        :meth:`~lamindb.Feature.from_df`
            Create feature records from DataFrame.
        :attr:`~lamindb.Artifact.features`
            Feature manager of an artifact or collection.
        :class:`~lamindb.ULabel`
            Universal labels.
        :class:`~lamindb.Schema`
            Feature sets.

    Example:

        A simple `"str"` feature.

        >>> ln.Feature(
        ...     name="sample_note",
        ...     dtype="str",
        ... ).save()

        A dtype `"cat[ULabel]"` can be more easily passed as below.

        >>> ln.Feature(
        ...     name="project",
        ...     dtype=ln.ULabel,
        ... ).save()

        A dtype `"cat[ULabel|bionty.CellType]"` can be more easily passed as below.

        >>> ln.Feature(
        ...     name="cell_type",
        ...     dtype=[ln.ULabel, bt.CellType],
        ... ).save()

    Hint:

        *Features* and *labels* denote two ways of using entities to organize data:

        1. A feature qualifies *what* is measured, i.e., a numerical or categorical random variable
        2. A label *is* a measured value, i.e., a category

        Consider annotating a dataset by that it measured expression of 30k
        genes: genes relate to the dataset as feature identifiers through a
        feature set with 30k members. Now consider annotating the artifact by
        whether it measured the knock-out of 3 genes: here, the 3 genes act
        as labels of the dataset.

        Re-shaping data can introduce ambiguity among features & labels. If this
        happened, ask yourself what the joint measurement was: a feature
        qualifies variables in a joint measurement. The canonical data matrix
        lists jointly measured variables in the columns.

    """

    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _name_field: str = "name"
    # keys are the slots under `_aux["af"]` used by the properties below
    # NOTE(review): `default_value` is declared as `bool` here although the
    # property stores arbitrary values -- confirm how this mapping is consumed
    _aux_fields: dict[str, tuple[str, type]] = {
        "0": ("default_value", bool),
        "1": ("nullable", bool),
    }

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(
        editable=False, unique=True, db_index=True, max_length=12, default=base62_12
    )
    """Universal id, valid across DB instances."""
    name: str = CharField(max_length=150, db_index=True, unique=True)
    """Name of feature (hard unique constraint `unique=True`)."""
    dtype: FeatureDtype | None = CharField(db_index=True, null=True)
    """Data type (:class:`~lamindb.base.types.FeatureDtype`).

    For categorical types, can define from which registry values are
    sampled, e.g., `'cat[ULabel]'` or `'cat[bionty.CellType]'`. Unions are also
    allowed if the feature samples from two registries, e.g., `'cat[ULabel|bionty.CellType]'`
    """
    type: Feature | None = ForeignKey(
        "self", PROTECT, null=True, related_name="records"
    )
    """Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction').

    Allows to group features by type, e.g., all read outs, all metrics, etc.
    """
    records: Feature
    """Records of this type."""
    is_type: bool = BooleanField(default=False, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    unit: str | None = CharField(max_length=30, db_index=True, null=True)
    """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional)."""
    description: str | None = CharField(db_index=True, null=True)
    """A description."""
    array_rank: int = models.SmallIntegerField(default=0, db_index=True)
    """Rank of feature.

    Number of indices of the array: 0 for scalar, 1 for vector, 2 for matrix.

    Is called `.ndim` in `numpy` and `pytorch` but shouldn't be confused with
    the dimension of the feature space.
    """
    array_size: int = models.IntegerField(default=0, db_index=True)
    """Number of elements of the feature.

    Total number of elements (product of shape components) of the array.

    - A number or string (a scalar): 1
    - A 50-dimensional embedding: 50
    - A 25 x 25 image: 625
    """
    array_shape: list[int] | None = JSONField(default=None, db_default=None, null=True)
    """Shape of the feature.

    - A number or string (a scalar): [1]
    - A 50-dimensional embedding: [50]
    - A 25 x 25 image: [25, 25]

    Is stored as a list rather than a tuple because it's serialized as JSON.
    """
    proxy_dtype: FeatureDtype | None = CharField(default=None, null=True)
    """Proxy data type.

    If the feature is an image it's often stored via a path to the image file. Hence, while the dtype might be
    image with a certain shape, the proxy dtype would be str.
    """
    synonyms: str | None = TextField(null=True)
    """Bar-separated (|) synonyms (optional)."""
    # we define the below ManyToMany on the feature model because it parallels
    # how other registries (like Gene, Protein, etc.) relate to Schema
    # it makes the API more consistent
    schemas: Schema = models.ManyToManyField(
        "Schema", through="SchemaFeature", related_name="features"
    )
    """Feature sets linked to this feature."""
    _expect_many: bool = models.BooleanField(default=True, db_default=True)
    """Indicates whether values for this feature are expected to occur a single or multiple times for an artifact (default `True`).

    - if it's `True` (default), the values come from an observation-level aggregation and a dtype of `datetime` on the observation-level mean `set[datetime]` on the artifact-level
    - if it's `False` it's an artifact-level value and datetime means datetime; this is an edge case because an arbitrary artifact would always be a set of arbitrary measurements that would need to be aggregated ("one just happens to measure a single cell line in that artifact")
    """
    _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
    # backward fields
    values: FeatureValue
    """Values for this feature."""

    @overload
    def __init__(
        self,
        name: str,
        dtype: FeatureDtype | Registry | list[Registry] | FieldAttr,
        type: Feature | None = None,
        is_type: bool = False,
        unit: str | None = None,
        description: str | None = None,
        synonyms: str | None = None,
        nullable: bool = True,
        default_value: Any | None = None,
        cat_filters: dict[str, str] | None = None,
    ): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        # one positional value per concrete field: delegate to the parent
        # constructor without any of the user-facing processing below
        if len(args) == len(self._meta.concrete_fields):
            super().__init__(*args, **kwargs)
            return None
        # keep the dtype exactly as passed for the consistency check at the end
        dtype = kwargs.get("dtype", None)
        default_value = kwargs.pop("default_value", None)
        nullable = kwargs.pop("nullable", True)  # default value of nullable
        cat_filters = kwargs.pop("cat_filters", None)
        # NOTE(review): `unit` and `synonyms` are not popped here and
        # process_init_feature_param() rejects leftover keyword arguments -- confirm
        kwargs = process_init_feature_param(args, kwargs)
        super().__init__(*args, **kwargs)
        self.default_value = default_value
        self.nullable = nullable
        dtype_str = kwargs.pop("dtype", None)
        if cat_filters:
            if not isinstance(dtype_str, str):
                # previously failed with an opaque TypeError on `"|" not in None`
                raise ValidationError(
                    "cat_filters can only be passed together with a categorical dtype"
                )
            assert "|" not in dtype_str  # noqa: S101
            assert "]]" not in dtype_str  # noqa: S101
            fill_in = ", ".join(
                f"{key}='{value}'" for (key, value) in cat_filters.items()
            )
            # append the filters as an inner bracket: cat[X] -> cat[X[k='v']]
            dtype_str = dtype_str.replace("]", f"[{fill_in}]]")
            self.dtype = dtype_str
        if not self._state.adding:
            # not a new record: the dtype on the record has to agree with the
            # dtype passed now; a bare "cat" matches any categorical dtype
            if not (
                self.dtype.startswith("cat")
                if dtype == "cat"
                else self.dtype == dtype_str
            ):
                raise ValidationError(
                    f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
                )

    @classmethod
    def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
        """Create Feature records for columns."""
        field = Feature.name if field is None else field
        registry = field.field.model  # type: ignore
        if registry != Feature:
            raise ValueError("field must be a Feature FieldAttr!")
        categoricals = categoricals_from_df(df)
        dtypes = {}
        for name, col in df.items():
            if name in categoricals:
                dtypes[name] = "cat"
            else:
                dtypes[name] = convert_pandas_dtype_to_lamin_dtype(col.dtype)
        with logger.mute():  # silence the warning "loaded record with exact same name "
            features = [
                Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()
            ]  # type: ignore
        assert len(features) == len(df.columns)  # noqa: S101
        return RecordList(features)

    def save(self, *args, **kwargs) -> Feature:
        """Save."""
        super().save(*args, **kwargs)
        return self

    @property
    def default_value(self) -> Any:
        """A default value that overwrites missing values (default `None`).

        This takes effect when you call `Curator.standardize()`.

        If `default_value = None`, missing values like `pd.NA` or `np.nan` are kept.
        """
        if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:  # type: ignore
            return self._aux["af"]["0"]  # type: ignore
        else:
            return None

    @default_value.setter
    def default_value(self, value: Any) -> None:
        if self._aux is None:  # type: ignore
            self._aux = {}  # type: ignore
        if "af" not in self._aux:
            self._aux["af"] = {}
        self._aux["af"]["0"] = value

    @property
    def nullable(self) -> bool:
        """Indicates whether the feature can have nullable values (default `True`).

        Example::

            import lamindb as ln
            import pandas as pd

            disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save()
            schema = ln.Schema(features=[disease]).save()
            dataset = {"disease": pd.Categorical([pd.NA, "asthma"])}
            df = pd.DataFrame(dataset)
            curator = ln.curators.DataFrameCurator(df, schema)
            try:
                curator.validate()
            except ln.errors.ValidationError as e:
                assert str(e).startswith("non-nullable series 'disease' contains null values")

        """
        if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
            value = self._aux["af"]["1"]
            return True if value is None else value
        else:
            return True

    @nullable.setter
    def nullable(self, value: bool) -> None:
        assert isinstance(value, bool), value  # noqa: S101
        if self._aux is None:
            self._aux = {}
        if "af" not in self._aux:
            self._aux["af"] = {}
        self._aux["af"]["1"] = value
|
547
|
+
|
548
|
+
|
549
|
+
class FeatureValue(Record, TracksRun):
    """Non-categorical features values.

    Categorical feature values are stored in their respective registries:
    :class:`~lamindb.ULabel`, :class:`~bionty.CellType`, etc.

    Unlike for ULabel, in `FeatureValue`, values are grouped by features and
    not by an ontological hierarchy.
    """

    # Historical notes kept from the original implementation:
    # - no unique constraint on feature & value because it leads to hashing errors
    #   for large dictionaries: https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000
    # - values are not hashed because there is `get_or_create` logic all over the place
    #   and also for checking whether the (feature, value) combination exists
    # - there does not seem an issue with querying for a dict-like value
    #   https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001
    # NOTE(review): these notes appear to conflict with the conditional unique
    # constraints declared in Meta below -- confirm which is current

    _name_field: str = "value"

    feature: Feature | None = ForeignKey(
        Feature, CASCADE, null=True, related_name="values", default=None
    )
    """The dimension metadata."""
    value: Any = models.JSONField()
    """The JSON-like value."""
    hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
    """Value hash."""

    class Meta(BasicRecord.Meta, TracksRun.Meta):
        constraints = [
            # For simple types, use direct value comparison
            models.UniqueConstraint(
                fields=["feature", "value"],
                name="unique_simple_feature_value",
                condition=Q(hash__isnull=True),
            ),
            # For complex types (dictionaries), use hash
            models.UniqueConstraint(
                fields=["feature", "hash"],
                name="unique_complex_feature_value",
                condition=Q(hash__isnull=False),
            ),
        ]

    @classmethod
    def get_or_create(cls, feature, value):
        """Create the (feature, value) record or fetch the existing one.

        Returns:
            A tuple `(record, existed)`: the flag is `False` if the record was
            created by this call and `True` if creation raised an
            `IntegrityError` and the record was fetched instead. Note that
            this is the opposite of the `created` flag of Django's
            `get_or_create`.
        """
        if isinstance(value, (int, float, str, bool)):
            # Simple types: int, float, str, bool -> stored without hash, matched by value
            value_hash = None
            lookup = {"value": value}
        else:
            # Complex types: dict, list -> matched by hash
            value_hash = hash_dict(value)
            lookup = {"hash": value_hash}
        try:
            record = cls.objects.create(feature=feature, value=value, hash=value_hash)
        except IntegrityError:
            return cls.objects.get(feature=feature, **lookup), True
        return record, False
|
615
|
+
|
616
|
+
|
617
|
+
def suggest_categorical_for_str_iterable(
    iterable: Iterable[str], key: str | None = None
) -> str:
    """Suggest dtype 'cat' if values in a string iterable repeat.

    Args:
        iterable: The values to inspect.
        key: Name of the feature to mention in the message; omitted from the
            message if `None` or empty.

    Returns:
        A hint message if there are fewer categories than values, otherwise
        an empty string.
    """
    c = pd.Categorical(iterable)
    if len(c.categories) >= len(c):
        return ""
    # `None` (the default) must not be rendered as "for feature None";
    # non-string keys such as an integer column label are still reported
    key_note = f" for feature {key}" if key is not None and key != "" else ""
    return f"You have few permissible values{key_note}, consider dtype 'cat' instead of 'str'"
|
629
|
+
|
630
|
+
|
631
|
+
def categoricals_from_df(df: pd.DataFrame) -> dict:
    """Collect the columns of `df` that have a categorical dtype.

    As a side effect, every string-typed column is checked with
    `suggest_categorical_for_str_iterable()` and a warning is logged if that
    returns a message.

    Returns:
        A dict mapping column name to column for all categorical columns.
    """
    text_columns = []
    categorical_columns = {}
    for column in df.columns:
        series = df[column]
        if is_string_dtype(series):
            text_columns.append(column)
        if isinstance(series.dtype, CategoricalDtype):
            categorical_columns[column] = series
    for column in text_columns:
        hint = suggest_categorical_for_str_iterable(df[column], column)
        if hint:
            logger.warning(hint)
    return categorical_columns
|