lamindb 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66):
  1. lamindb/__init__.py +30 -25
  2. lamindb/_tracked.py +1 -1
  3. lamindb/_view.py +2 -3
  4. lamindb/base/__init__.py +1 -1
  5. lamindb/base/ids.py +1 -10
  6. lamindb/core/__init__.py +7 -65
  7. lamindb/core/_compat.py +60 -0
  8. lamindb/core/_context.py +43 -20
  9. lamindb/core/_settings.py +6 -6
  10. lamindb/core/_sync_git.py +1 -1
  11. lamindb/core/loaders.py +30 -19
  12. lamindb/core/storage/_backed_access.py +4 -2
  13. lamindb/core/storage/_tiledbsoma.py +8 -6
  14. lamindb/core/storage/_zarr.py +104 -25
  15. lamindb/core/storage/objects.py +63 -28
  16. lamindb/core/storage/paths.py +4 -1
  17. lamindb/core/types.py +10 -0
  18. lamindb/curators/__init__.py +100 -85
  19. lamindb/errors.py +1 -1
  20. lamindb/integrations/_vitessce.py +4 -4
  21. lamindb/migrations/0089_subsequent_runs.py +159 -0
  22. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  23. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  24. lamindb/models/__init__.py +79 -0
  25. lamindb/{core → models}/_describe.py +3 -3
  26. lamindb/{core → models}/_django.py +8 -5
  27. lamindb/{core → models}/_feature_manager.py +103 -87
  28. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  29. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  30. lamindb/{core → models}/_label_manager.py +10 -17
  31. lamindb/{core/relations.py → models/_relations.py} +8 -1
  32. lamindb/models/artifact.py +2602 -0
  33. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  34. lamindb/models/collection.py +683 -0
  35. lamindb/models/core.py +135 -0
  36. lamindb/models/feature.py +643 -0
  37. lamindb/models/flextable.py +163 -0
  38. lamindb/{_parents.py → models/has_parents.py} +55 -49
  39. lamindb/models/project.py +384 -0
  40. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  41. lamindb/{_query_set.py → models/query_set.py} +40 -26
  42. lamindb/models/record.py +1762 -0
  43. lamindb/models/run.py +563 -0
  44. lamindb/{_save.py → models/save.py} +9 -7
  45. lamindb/models/schema.py +732 -0
  46. lamindb/models/transform.py +360 -0
  47. lamindb/models/ulabel.py +249 -0
  48. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
  49. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/RECORD +51 -51
  50. lamindb/_artifact.py +0 -1379
  51. lamindb/_collection.py +0 -440
  52. lamindb/_feature.py +0 -316
  53. lamindb/_is_versioned.py +0 -40
  54. lamindb/_record.py +0 -1064
  55. lamindb/_run.py +0 -60
  56. lamindb/_schema.py +0 -347
  57. lamindb/_storage.py +0 -15
  58. lamindb/_transform.py +0 -170
  59. lamindb/_ulabel.py +0 -56
  60. lamindb/_utils.py +0 -9
  61. lamindb/base/validation.py +0 -63
  62. lamindb/core/_data.py +0 -491
  63. lamindb/core/fields.py +0 -12
  64. lamindb/models.py +0 -4475
  65. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
  66. {lamindb-1.1.1.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,643 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ from typing import TYPE_CHECKING, Any, get_args, overload
5
+
6
+ import pandas as pd
7
+ from django.db import models
8
+ from django.db.models import CASCADE, PROTECT, Q
9
+ from django.db.models.query_utils import DeferredAttribute
10
+ from django.db.utils import IntegrityError
11
+ from lamin_utils import logger
12
+ from lamindb_setup._init_instance import get_schema_module_name
13
+ from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict
14
+ from pandas.api.types import CategoricalDtype, is_string_dtype
15
+
16
+ from lamindb.base.fields import (
17
+ BooleanField,
18
+ CharField,
19
+ ForeignKey,
20
+ JSONField,
21
+ TextField,
22
+ )
23
+ from lamindb.base.types import FeatureDtype, FieldAttr
24
+ from lamindb.errors import FieldValidationError, ValidationError
25
+
26
+ from ..base.ids import base62_12
27
+ from ._relations import dict_module_name_to_model_name
28
+ from .can_curate import CanCurate
29
+ from .query_set import RecordList
30
+ from .record import BasicRecord, Record, Registry, _get_record_kwargs
31
+ from .run import (
32
+ TracksRun,
33
+ TracksUpdates,
34
+ )
35
+
36
+ if TYPE_CHECKING:
37
+ from collections.abc import Iterable
38
+
39
+ from pandas.core.dtypes.base import ExtensionDtype
40
+
41
+ from .schema import Schema
42
+
43
# The type arguments of `FeatureDtype`, collected into a set; the functions
# below use it for membership checks on simple (non-composed) dtype strings.
# NOTE(review): this is a mutable, module-level set shared by every caller.
FEATURE_DTYPES = set(get_args(FeatureDtype))
44
+
45
+
46
def parse_dtype_single_cat(
    dtype_str: str,
    related_registries: dict[str, Record] | None = None,
    is_itype: bool = False,
) -> dict:
    """Parse one registry reference of a categorical dtype.

    Accepted shapes are ``Registry``, ``Registry.field``, ``module.Registry``,
    ``module.Registry.field`` and the same with a single bracketed subtype,
    e.g. ``Registry[Subtype]`` or ``Registry[Subtype].field``.

    Args:
        dtype_str: The single registry reference (without the ``cat[...]`` wrapper).
        related_registries: Mapping from registry string to registry class. If
            `None`, it is computed via `dict_module_name_to_model_name(Artifact)`.
        is_itype: If `True`, the registry is resolved by importing its module
            instead of being looked up in `related_registries`.

    Returns:
        A dict with the keys `registry`, `registry_str`, `subtype_str`,
        `field_str` and `field`.

    Raises:
        ValidationError: If the string contains more than one ``[`` or if the
            registry is not among `related_registries` (only when `is_itype`
            is `False`).
    """
    assert isinstance(dtype_str, str)  # noqa: S101
    if related_registries is None:
        # only needed to compute the default mapping, hence imported lazily
        from .artifact import Artifact

        related_registries = dict_module_name_to_model_name(Artifact)
    split_result = dtype_str.split("[")
    if len(split_result) > 2:
        # previously fell through both branches and failed with an UnboundLocalError
        raise ValidationError(
            f"'{dtype_str}' is an invalid dtype, more than one '[' is not supported"
        )
    sub_type_str = ""
    if len(split_result) == 2:
        # has sub type
        registry_str = split_result[0]
        assert "]" in split_result[1]  # noqa: S101
        sub_type_field_split = split_result[1].split("].")
        if len(sub_type_field_split) == 1:
            sub_type_str = sub_type_field_split[0].strip("]")
            field_str = ""
        else:
            sub_type_str = sub_type_field_split[0]
            field_str = sub_type_field_split[1]
    else:
        registry_field_split = split_result[0].split(".")
        n_parts = len(registry_field_split)
        # slicing with [:1] tolerates an empty segment such as in "bionty."
        second_is_class = n_parts == 2 and registry_field_split[1][:1].isupper()
        if second_is_class or n_parts == 3:
            # bionty.CellType or bionty.CellType.name
            registry_str = f"{registry_field_split[0]}.{registry_field_split[1]}"
            field_str = "" if n_parts == 2 else registry_field_split[2]
        else:
            # ULabel or ULabel.name
            registry_str = registry_field_split[0]
            field_str = "" if n_parts == 1 else registry_field_split[1]
    if not is_itype:
        if registry_str not in related_registries:
            raise ValidationError(
                f"'{registry_str}' is an invalid dtype, has to be registry, e.g. ULabel or bionty.CellType"
            )
        registry = related_registries[registry_str]
    else:
        if "." in registry_str:
            registry_str_split = registry_str.split(".")
            assert len(registry_str_split) == 2, registry_str  # noqa: S101
            module_name, class_name = registry_str_split
            module_name = get_schema_module_name(module_name)
        else:
            module_name, class_name = "lamindb", registry_str
        module = importlib.import_module(module_name)
        registry = getattr(module, class_name)
    # TODO: validate that the subtype is a record in the registry with is_type = True
    # TODO: validate that field_str is an actual field of the registry
    if field_str == "":
        # fall back to the registry's name field
        field_str = registry._name_field if hasattr(registry, "_name_field") else "name"
    return {
        "registry": registry,  # should be typed as CanCurate
        "registry_str": registry_str,
        "subtype_str": sub_type_str,
        "field_str": field_str,
        "field": getattr(registry, field_str),
    }
116
+
117
+
118
def parse_dtype(dtype_str: str, is_param: bool = False) -> list[dict[str, str]]:
    """Validate a dtype string and parse its registry references.

    Args:
        dtype_str: Either a simple dtype (a member of `FEATURE_DTYPES`) or a
            composed categorical dtype of the form ``cat[A|B|...]``.
        is_param: If `True`, ``"dict"`` is accepted as an additional simple dtype.

    Returns:
        One dict per registry reference as returned by
        `parse_dtype_single_cat`; an empty list for simple dtypes and for ``cat[]``.

    Raises:
        ValueError: If `dtype_str` is neither a composed categorical nor an
            allowed simple dtype.
    """
    result = []
    prefix = "cat["
    if dtype_str.startswith(prefix) and dtype_str.endswith("]"):
        from .artifact import Artifact

        related_registries = dict_module_name_to_model_name(Artifact)
        # strip only the leading "cat[" and the trailing "]"; a global replace
        # would also eat "cat[" inside the registry references
        registries_str = dtype_str[len(prefix) : -1]
        if registries_str != "":
            for cat_single_dtype_str in registries_str.split("|"):
                result.append(
                    parse_dtype_single_cat(cat_single_dtype_str, related_registries)
                )
        return result
    # build a new set rather than adding to the shared module-level one
    allowed_dtypes = FEATURE_DTYPES | {"dict"} if is_param else FEATURE_DTYPES
    if dtype_str not in allowed_dtypes:
        raise ValueError(
            f"dtype is '{dtype_str}' but has to be one of {allowed_dtypes}!"
        )
    return result
141
+
142
+
143
def get_dtype_str_from_dtype(dtype: Any, is_itype: bool = False) -> str:
    """Convert a dtype object into its string representation.

    Objects whose `__name__` is a member of `FEATURE_DTYPES` map to that name.
    Registries, field attributes, or lists of them are joined with ``|`` and,
    unless `is_itype` is `True`, wrapped as ``cat[...]``.

    Raises:
        ValueError: If `dtype` (or an element of the list) is neither a
            registry nor a field attribute.
    """
    is_simple = (
        not isinstance(dtype, list)
        and hasattr(dtype, "__name__")
        and dtype.__name__ in FEATURE_DTYPES
    )
    if is_simple:
        return dtype.__name__

    error_message = (
        "dtype has to be a record, a record field, or a list of records, not {}"
    )
    if isinstance(dtype, (Registry, DeferredAttribute)):
        candidates = [dtype]
    elif isinstance(dtype, list):
        candidates = dtype
    else:
        raise ValueError(error_message.format(dtype))

    parts = []
    for candidate in candidates:
        if isinstance(candidate, Registry):
            parts.append(candidate.__get_name_with_module__())
        elif isinstance(candidate, DeferredAttribute):
            model_name = candidate.field.model.__get_name_with_module__()
            parts.append(model_name + f".{candidate.field.name}")
        else:
            raise ValueError(error_message.format(candidate))
    joined = "|".join(parts)
    return joined if is_itype else f"cat[{joined}]"
178
+
179
+
180
def convert_pandas_dtype_to_lamin_dtype(pandas_dtype: ExtensionDtype) -> str:
    """Map a pandas dtype to the corresponding dtype string.

    Categorical dtypes map to ``"cat"`` and string dtypes to ``"str"``. For
    everything else, digits are removed from the dtype name and a bracketed
    suffix is dropped for names starting with ``datetime``.
    """
    if isinstance(pandas_dtype, CategoricalDtype):
        # covers both string-like and "pure" categoricals (pd.Categorical)
        lamin_dtype = "cat"
    elif is_string_dtype(pandas_dtype):
        lamin_dtype = "str"
    else:
        # strip precision qualifiers
        lamin_dtype = "".join(
            char for char in pandas_dtype.name if not char.isdigit()
        )
        if lamin_dtype.startswith("datetime"):
            lamin_dtype = lamin_dtype.split("[")[0]
    assert lamin_dtype in FEATURE_DTYPES  # noqa: S101
    return lamin_dtype
196
+
197
+
198
def process_init_feature_param(args, kwargs, is_param: bool = False):
    """Normalize the keyword arguments of the user-facing constructor.

    Pops `name`, `dtype`, `is_type`, `type` and `description` from `kwargs`,
    rejects anything left over, converts `dtype` to a validated dtype string
    and writes the normalized values back into the same `kwargs` dict.

    Raises:
        ValueError: If positional arguments are passed.
        FieldValidationError: If unknown keyword arguments are passed.
        ValidationError: If no dtype is passed and `is_type` is falsy.
    """
    # now we proceed with the user-facing constructor
    if args:
        raise ValueError("Only keyword args allowed")
    name: str = kwargs.pop("name", None)
    dtype: type | str | None = kwargs.pop("dtype", None)
    is_type: bool = kwargs.pop("is_type", None)
    type_: Feature | str | None = kwargs.pop("type", None)
    description: str | None = kwargs.pop("description", None)
    # NOTE(review): any other key (e.g. unit, synonyms) is rejected here — confirm intended
    if kwargs:
        valid_keywords = ", ".join([val[0] for val in _get_record_kwargs(Feature)])
        raise FieldValidationError(f"Only {valid_keywords} are valid keyword arguments")
    kwargs.update(name=name, type=type_, is_type=is_type)
    if not is_param:
        kwargs["description"] = description
    # cast dtype
    if dtype is None and not is_type:
        raise ValidationError(
            f"Please pass dtype, one of {FEATURE_DTYPES} or a composed categorical dtype"
        )
    if dtype is None:
        dtype_str = None
    else:
        dtype_str = dtype if isinstance(dtype, str) else get_dtype_str_from_dtype(dtype)
        parse_dtype(dtype_str, is_param=is_param)
    kwargs["dtype"] = dtype_str
    return kwargs
229
+
230
+
231
class Feature(Record, CanCurate, TracksRun, TracksUpdates):
    """Dataset dimensions.

    A feature represents a dimension of a dataset, such as a column in a
    `DataFrame`. The `Feature` registry organizes metadata of features.

    The `Feature` registry helps you organize and query datasets based on their
    features and corresponding label annotations. For instance, when working
    with a "T cell" label, it could be measured through different features
    such as `"cell_type_by_expert"` where an expert manually classified the
    cell, or `"cell_type_by_model"` where a computational model made the
    classification.

    The two most important metadata of a feature are its `name` and the `dtype`.
    In addition to typical data types, LaminDB has a `"num"` `dtype` to
    concisely denote the union of all numerical types.

    Args:
        name: `str` Name of the feature, typically a column name.
        dtype: `FeatureDtype | Registry | list[Registry] | FieldAttr` See :class:`~lamindb.base.types.FeatureDtype`.
            For categorical types, can define from which registry values are
            sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`.
        unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc.
        description: `str | None = None` A description.
        synonyms: `str | None = None` Bar-separated synonyms.
        nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`.
        default_value: `Any | None = None` Default value for the feature.
        cat_filters: `dict[str, str] | None = None` Subset a registry by additional filters to define valid categories.

    Note:

        For more control, you can use :mod:`bionty` registries to manage simple
        biological entities like genes, proteins & cell markers. Or you define
        custom registries to manage high-level derived features like gene sets.

    See Also:
        :meth:`~lamindb.Feature.from_df`
            Create feature records from DataFrame.
        :attr:`~lamindb.Artifact.features`
            Feature manager of an artifact or collection.
        :class:`~lamindb.ULabel`
            Universal labels.
        :class:`~lamindb.Schema`
            Feature sets.

    Example:

        A simple `"str"` feature.

        >>> ln.Feature(
        ...     name="sample_note",
        ...     dtype="str",
        ... ).save()

        A dtype `"cat[ULabel]"` can be more easily passed as below.

        >>> ln.Feature(
        ...     name="project",
        ...     dtype=ln.ULabel,
        ... ).save()

        A dtype `"cat[ULabel|bionty.CellType]"` can be more easily passed as below.

        >>> ln.Feature(
        ...     name="cell_type",
        ...     dtype=[ln.ULabel, bt.CellType],
        ... ).save()

    Hint:

        *Features* and *labels* denote two ways of using entities to organize data:

        1. A feature qualifies *what* is measured, i.e., a numerical or categorical random variable
        2. A label *is* a measured value, i.e., a category

        Consider annotating a dataset by the fact that it measured expression
        of 30k genes: genes relate to the dataset as feature identifiers through
        a feature set with 30k members. Now consider annotating the artifact by
        the fact that it measured the knock-out of 3 genes: here, the 3 genes
        act as labels of the dataset.

        Re-shaping data can introduce ambiguity among features & labels. If this
        happened, ask yourself what the joint measurement was: a feature
        qualifies variables in a joint measurement. The canonical data matrix
        lists jointly measured variables in the columns.

    """

    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _name_field: str = "name"
    # keys index into `_aux["af"]`, see the `default_value` & `nullable` properties
    # NOTE(review): "default_value" is declared as bool here although the
    # property stores arbitrary values — confirm how this mapping is consumed
    _aux_fields: dict[str, tuple[str, type]] = {
        "0": ("default_value", bool),
        "1": ("nullable", bool),
    }

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(
        editable=False, unique=True, db_index=True, max_length=12, default=base62_12
    )
    """Universal id, valid across DB instances."""
    name: str = CharField(max_length=150, db_index=True, unique=True)
    """Name of feature (hard unique constraint `unique=True`)."""
    dtype: FeatureDtype | None = CharField(db_index=True, null=True)
    """Data type (:class:`~lamindb.base.types.FeatureDtype`).

    For categorical types, can define from which registry values are
    sampled, e.g., `'cat[ULabel]'` or `'cat[bionty.CellType]'`. Unions are also
    allowed if the feature samples from two registries, e.g., `'cat[ULabel|bionty.CellType]'`
    """
    type: Feature | None = ForeignKey(
        "self", PROTECT, null=True, related_name="records"
    )
    """Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction').

    Allows to group features by type, e.g., all read outs, all metrics, etc.
    """
    records: Feature
    """Records of this type."""
    is_type: bool = BooleanField(default=False, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    unit: str | None = CharField(max_length=30, db_index=True, null=True)
    """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional)."""
    description: str | None = CharField(db_index=True, null=True)
    """A description."""
    array_rank: int = models.SmallIntegerField(default=0, db_index=True)
    """Rank of feature.

    Number of indices of the array: 0 for scalar, 1 for vector, 2 for matrix.

    Is called `.ndim` in `numpy` and `pytorch` but shouldn't be confused with
    the dimension of the feature space.
    """
    array_size: int = models.IntegerField(default=0, db_index=True)
    """Number of elements of the feature.

    Total number of elements (product of shape components) of the array.

    - A number or string (a scalar): 1
    - A 50-dimensional embedding: 50
    - A 25 x 25 image: 625
    """
    array_shape: list[int] | None = JSONField(default=None, db_default=None, null=True)
    """Shape of the feature.

    - A number or string (a scalar): [1]
    - A 50-dimensional embedding: [50]
    - A 25 x 25 image: [25, 25]

    Is stored as a list rather than a tuple because it's serialized as JSON.
    """
    proxy_dtype: FeatureDtype | None = CharField(default=None, null=True)
    """Proxy data type.

    If the feature is an image it's often stored via a path to the image file. Hence, while the dtype might be
    image with a certain shape, the proxy dtype would be str.
    """
    synonyms: str | None = TextField(null=True)
    """Bar-separated (|) synonyms (optional)."""
    # we define the below ManyToMany on the feature model because it parallels
    # how other registries (like Gene, Protein, etc.) relate to Schema
    # it makes the API more consistent
    schemas: Schema = models.ManyToManyField(
        "Schema", through="SchemaFeature", related_name="features"
    )
    """Feature sets linked to this feature."""
    _expect_many: bool = models.BooleanField(default=True, db_default=True)
    """Indicates whether values for this feature are expected to occur a single or multiple times for an artifact (default `True`).

    - if it's `True` (default), the values come from an observation-level aggregation and a dtype of `datetime` on the observation-level mean `set[datetime]` on the artifact-level
    - if it's `False` it's an artifact-level value and datetime means datetime; this is an edge case because an arbitrary artifact would always be a set of arbitrary measurements that would need to be aggregated ("one just happens to measure a single cell line in that artifact")
    """
    _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
    # backward fields
    values: FeatureValue
    """Values for this feature."""

    @overload
    def __init__(
        self,
        name: str,
        dtype: FeatureDtype | Registry | list[Registry] | FieldAttr,
        type: Feature | None = None,
        is_type: bool = False,
        unit: str | None = None,
        description: str | None = None,
        synonyms: str | None = None,
        nullable: bool = True,
        default_value: str | None = None,
        cat_filters: dict[str, str] | None = None,
    ): ...

    @overload
    def __init__(
        self,
        *db_args,
    ): ...

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        # a full set of positional args means the record is populated with
        # one value per concrete field; bypass the user-facing constructor
        if len(args) == len(self._meta.concrete_fields):
            super().__init__(*args, **kwargs)
            return None
        dtype = kwargs.get("dtype", None)
        default_value = kwargs.pop("default_value", None)
        nullable = kwargs.pop("nullable", True)  # default value of nullable
        cat_filters = kwargs.pop("cat_filters", None)
        kwargs = process_init_feature_param(args, kwargs)
        super().__init__(*args, **kwargs)
        self.default_value = default_value
        self.nullable = nullable
        dtype_str = kwargs.pop("dtype", None)
        if cat_filters:
            if dtype_str is None:
                # previously failed with a TypeError on `"|" not in None`
                raise ValidationError(
                    "cat_filters can only be passed together with a dtype"
                )
            assert "|" not in dtype_str  # noqa: S101
            assert "]]" not in dtype_str  # noqa: S101
            fill_in = ", ".join(
                f"{key}='{value}'" for (key, value) in cat_filters.items()
            )
            # embed the filters as a bracketed suffix: cat[X] -> cat[X[k='v']]
            dtype_str = dtype_str.replace("]", f"[{fill_in}]]")
            self.dtype = dtype_str
        if not self._state.adding:
            # the record is not new: its dtype has to be compatible with the
            # passed one; a bare "cat" matches any dtype starting with "cat"
            existing_dtype = self.dtype
            if dtype == "cat":
                # an existing record can have no dtype at all (dtype is nullable)
                is_compatible = existing_dtype is not None and existing_dtype.startswith(
                    "cat"
                )
            else:
                is_compatible = existing_dtype == dtype_str
            if not is_compatible:
                raise ValidationError(
                    f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
                )

    @classmethod
    def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> RecordList:
        """Create Feature records for columns."""
        field = Feature.name if field is None else field
        registry = field.field.model  # type: ignore
        if registry != Feature:
            raise ValueError("field must be a Feature FieldAttr!")
        categoricals = categoricals_from_df(df)
        dtypes = {}
        for name, col in df.items():
            if name in categoricals:
                dtypes[name] = "cat"
            else:
                dtypes[name] = convert_pandas_dtype_to_lamin_dtype(col.dtype)
        with logger.mute():  # silence the warning "loaded record with exact same name "
            features = [
                Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()
            ]  # type: ignore
        assert len(features) == len(df.columns)  # noqa: S101
        return RecordList(features)

    def save(self, *args, **kwargs) -> Feature:
        """Save."""
        super().save(*args, **kwargs)
        return self

    @property
    def default_value(self) -> Any:
        """A default value that overwrites missing values (default `None`).

        This takes effect when you call `Curator.standardize()`.

        If `default_value = None`, missing values like `pd.NA` or `np.nan` are kept.
        """
        if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:  # type: ignore
            return self._aux["af"]["0"]  # type: ignore
        else:
            return None

    @default_value.setter
    def default_value(self, value: Any) -> None:
        # stored under key "0" of the "af" dict within `_aux`
        if self._aux is None:  # type: ignore
            self._aux = {}  # type: ignore
        if "af" not in self._aux:
            self._aux["af"] = {}
        self._aux["af"]["0"] = value

    @property
    def nullable(self) -> bool:
        """Indicates whether the feature can have nullable values (default `True`).

        Example::

            import lamindb as ln
            import pandas as pd

            disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save()
            schema = ln.Schema(features=[disease]).save()
            dataset = {"disease": pd.Categorical([pd.NA, "asthma"])}
            df = pd.DataFrame(dataset)
            curator = ln.curators.DataFrameCurator(df, schema)
            try:
                curator.validate()
            except ln.errors.ValidationError as e:
                assert str(e).startswith("non-nullable series 'disease' contains null values")

        """
        if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
            value = self._aux["af"]["1"]
            # a stored None is treated like the default
            return True if value is None else value
        else:
            return True

    @nullable.setter
    def nullable(self, value: bool) -> None:
        # stored under key "1" of the "af" dict within `_aux`
        assert isinstance(value, bool), value  # noqa: S101
        if self._aux is None:
            self._aux = {}
        if "af" not in self._aux:
            self._aux["af"] = {}
        self._aux["af"]["1"] = value
547
+
548
+
549
class FeatureValue(Record, TracksRun):
    """Non-categorical features values.

    Categorical feature values are stored in their respective registries:
    :class:`~lamindb.ULabel`, :class:`~bionty.CellType`, etc.

    Unlike for ULabel, in `FeatureValue`, values are grouped by features and
    not by an ontological hierarchy.
    """

    # Uniqueness is enforced by the two conditional constraints in `Meta`:
    # on (feature, value) where `hash` is NULL, on (feature, hash) otherwise.
    # Background on hashing errors for large dictionaries:
    # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0000
    # Background on querying for a dict-like value:
    # https://lamin.ai/laminlabs/lamindata/transform/jgTrkoeuxAfs0001

    _name_field: str = "value"

    feature: Feature | None = ForeignKey(
        Feature, CASCADE, null=True, related_name="values", default=None
    )
    """The dimension metadata."""
    value: Any = models.JSONField()
    """The JSON-like value."""
    hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
    """Value hash."""

    class Meta(BasicRecord.Meta, TracksRun.Meta):
        constraints = [
            # For simple types, use direct value comparison
            models.UniqueConstraint(
                fields=["feature", "value"],
                name="unique_simple_feature_value",
                condition=Q(hash__isnull=True),
            ),
            # For complex types (dictionaries), use hash
            models.UniqueConstraint(
                fields=["feature", "hash"],
                name="unique_complex_feature_value",
                condition=Q(hash__isnull=False),
            ),
        ]

    @classmethod
    def get_or_create(cls, feature, value):
        """Create the (feature, value) record or fetch the existing one.

        Returns a tuple `(record, flag)` in which `flag` is `False` if the
        record was created and `True` if creation raised an `IntegrityError`
        and the record was fetched instead.
        """
        # simple types (int, float, str, bool) are stored without a hash,
        # everything else (dict, list) is identified through its hash
        is_simple = isinstance(value, (int, float, str, bool))
        value_hash = None if is_simple else hash_dict(value)
        try:
            record = cls.objects.create(feature=feature, value=value, hash=value_hash)
        except IntegrityError:
            if is_simple:
                return cls.objects.get(feature=feature, value=value), True
            return cls.objects.get(feature=feature, hash=value_hash), True
        return record, False
615
+
616
+
617
def suggest_categorical_for_str_iterable(
    iterable: Iterable[str], key: str | None = None
) -> str:
    """Return a hint to use dtype 'cat' if the values contain repetitions.

    Args:
        iterable: The string values to inspect.
        key: Name of the feature, mentioned in the message if non-empty.

    Returns:
        The hint if there are fewer distinct categories than values, otherwise
        an empty string.
    """
    c = pd.Categorical(iterable)
    if len(c.categories) >= len(c):
        return ""
    # `if key` rather than `key != ""` so that the default `None` doesn't
    # produce "for feature None"
    key_note = f" for feature {key}" if key else ""
    return f"You have few permissible values{key_note}, consider dtype 'cat' instead of 'str'"
629
+
630
+
631
def categoricals_from_df(df: pd.DataFrame) -> dict:
    """Collect the columns with a categorical dtype, keyed by column name.

    As a side effect, logs a warning for every string column for which
    `suggest_categorical_for_str_iterable` returns a message.
    """
    string_cols = list(filter(lambda col: is_string_dtype(df[col]), df.columns))
    categoricals = {}
    for column in df.columns:
        series = df[column]
        if isinstance(series.dtype, CategoricalDtype):
            categoricals[column] = series
    for column in string_cols:
        if message := suggest_categorical_for_str_iterable(df[column], column):
            logger.warning(message)
    return categoricals