lamindb 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. lamindb/__init__.py +33 -26
  2. lamindb/_finish.py +9 -1
  3. lamindb/_tracked.py +26 -3
  4. lamindb/_view.py +2 -3
  5. lamindb/base/__init__.py +1 -1
  6. lamindb/base/ids.py +1 -10
  7. lamindb/base/users.py +1 -4
  8. lamindb/core/__init__.py +7 -65
  9. lamindb/core/_compat.py +60 -0
  10. lamindb/core/_context.py +50 -22
  11. lamindb/core/_mapped_collection.py +4 -2
  12. lamindb/core/_settings.py +6 -6
  13. lamindb/core/_sync_git.py +1 -1
  14. lamindb/core/_track_environment.py +2 -1
  15. lamindb/core/datasets/_small.py +3 -3
  16. lamindb/core/loaders.py +43 -20
  17. lamindb/core/storage/_anndata_accessor.py +8 -3
  18. lamindb/core/storage/_backed_access.py +14 -7
  19. lamindb/core/storage/_pyarrow_dataset.py +24 -9
  20. lamindb/core/storage/_tiledbsoma.py +8 -6
  21. lamindb/core/storage/_zarr.py +104 -25
  22. lamindb/core/storage/objects.py +63 -28
  23. lamindb/core/storage/paths.py +16 -13
  24. lamindb/core/types.py +10 -0
  25. lamindb/curators/__init__.py +176 -149
  26. lamindb/errors.py +1 -1
  27. lamindb/integrations/_vitessce.py +4 -4
  28. lamindb/migrations/0089_subsequent_runs.py +159 -0
  29. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  30. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  31. lamindb/models/__init__.py +79 -0
  32. lamindb/{core → models}/_describe.py +3 -3
  33. lamindb/{core → models}/_django.py +8 -5
  34. lamindb/{core → models}/_feature_manager.py +103 -87
  35. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  36. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  37. lamindb/{core → models}/_label_manager.py +10 -17
  38. lamindb/{core/relations.py → models/_relations.py} +8 -1
  39. lamindb/models/artifact.py +2602 -0
  40. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  41. lamindb/models/collection.py +683 -0
  42. lamindb/models/core.py +135 -0
  43. lamindb/models/feature.py +643 -0
  44. lamindb/models/flextable.py +163 -0
  45. lamindb/{_parents.py → models/has_parents.py} +55 -49
  46. lamindb/models/project.py +384 -0
  47. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  48. lamindb/{_query_set.py → models/query_set.py} +64 -32
  49. lamindb/models/record.py +1762 -0
  50. lamindb/models/run.py +563 -0
  51. lamindb/{_save.py → models/save.py} +18 -8
  52. lamindb/models/schema.py +732 -0
  53. lamindb/models/transform.py +360 -0
  54. lamindb/models/ulabel.py +249 -0
  55. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/METADATA +6 -6
  56. lamindb-1.2.0.dist-info/RECORD +95 -0
  57. lamindb/_artifact.py +0 -1361
  58. lamindb/_collection.py +0 -440
  59. lamindb/_feature.py +0 -316
  60. lamindb/_is_versioned.py +0 -40
  61. lamindb/_record.py +0 -1065
  62. lamindb/_run.py +0 -60
  63. lamindb/_schema.py +0 -347
  64. lamindb/_storage.py +0 -15
  65. lamindb/_transform.py +0 -170
  66. lamindb/_ulabel.py +0 -56
  67. lamindb/_utils.py +0 -9
  68. lamindb/base/validation.py +0 -63
  69. lamindb/core/_data.py +0 -491
  70. lamindb/core/fields.py +0 -12
  71. lamindb/models.py +0 -4435
  72. lamindb-1.1.0.dist-info/RECORD +0 -95
  73. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/LICENSE +0 -0
  74. {lamindb-1.1.0.dist-info → lamindb-1.2.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,732 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any, overload
4
+
5
+ import numpy as np
6
+ from django.db import models
7
+ from django.db.models import CASCADE, PROTECT, ManyToManyField
8
+ from lamin_utils import logger
9
+ from lamindb_setup.core.hashing import HASH_LENGTH, hash_set
10
+
11
+ from lamindb.base import ids
12
+ from lamindb.base.fields import (
13
+ BooleanField,
14
+ CharField,
15
+ ForeignKey,
16
+ IntegerField,
17
+ JSONField,
18
+ )
19
+ from lamindb.base.types import FieldAttr, ListLike
20
+ from lamindb.errors import InvalidArgument
21
+
22
+ from ..base import deprecated
23
+ from ..errors import ValidationError
24
+ from ._relations import (
25
+ dict_related_model_to_related_name,
26
+ get_related_name,
27
+ )
28
+ from .can_curate import CanCurate
29
+ from .feature import (
30
+ Feature,
31
+ convert_pandas_dtype_to_lamin_dtype,
32
+ get_dtype_str_from_dtype,
33
+ )
34
+ from .record import (
35
+ BasicRecord,
36
+ LinkORM,
37
+ Record,
38
+ Registry,
39
+ init_self_from_db,
40
+ update_attributes,
41
+ )
42
+ from .run import Param, TracksRun, TracksUpdates
43
+
44
+ if TYPE_CHECKING:
45
+ from collections.abc import Iterable
46
+
47
+ import pandas as pd
48
+ from django.db.models.query_utils import DeferredAttribute
49
+
50
+ from .artifact import Artifact
51
+ from .project import Project
52
+ from .query_set import QuerySet
53
+
54
+
55
# dtype string used as the fallback when no dtype is passed and the identifier
# registry is not "Feature" (see Schema.__init__ and Schema.from_values)
NUMBER_TYPE = "num"
# runtime type of a dict keys view; used in isinstance() checks so that
# `some_dict.keys()` can be converted to a list before further processing
DICT_KEYS_TYPE = type({}.keys())  # type: ignore
57
+
58
+
59
def validate_features(features: list[Record]) -> Record:
    """Check a collection of feature records and return their common class.

    Args:
        features: A non-empty, indexable collection of saved records that all
            share one registry class.

    Returns:
        The single class shared by all passed records.

    Raises:
        ValueError: If `features` is empty, has no length, or holds a record
            that has not been saved yet.
        TypeError: If `features` is not indexable, its first element is not a
            `Record`, or the records belong to more than one class.
    """
    try:
        n_features = len(features)
    except TypeError:
        # objects without a length are treated as "a single feature was passed"
        raise ValueError(
            "Please pass a ListLike of features, not a single feature"
        ) from None
    if n_features == 0:
        raise ValueError("Provide list of features with at least one element")
    if not hasattr(features, "__getitem__"):
        raise TypeError("features has to be list-like")
    if not isinstance(features[0], Record):
        raise TypeError(
            "features has to store feature records! use .from_values() otherwise"
        )
    registries = {record.__class__ for record in features}
    if len(registries) != 1:
        raise TypeError("schema can only contain a single type")
    if any(record._state.adding for record in features):
        raise ValueError("Can only construct feature sets from validated features")
    # the set has exactly one element at this point
    (registry,) = registries
    return registry
81
+
82
+
83
class Schema(Record, CanCurate, TracksRun):
    """Schemas.

    The simplest schema is a feature set such as the set of columns of a `DataFrame`.

    A composite schema has multiple components, e.g., for an `AnnData`, one schema for `obs` and another one for `var`.

    Args:
        features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
            records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
            a set upon instantiation. If you'd like to pass values, use
            :meth:`~lamindb.Schema.from_values` or
            :meth:`~lamindb.Schema.from_df`.
        components: `dict[str, Schema] | None = None` A dictionary mapping component names to
            their corresponding :class:`~lamindb.Schema` objects for composite schemas.
        name: `str | None = None` A name.
        description: `str | None = None` A description.
        dtype: `str | None = None` The simple type. Defaults to
            `None` for sets of :class:`~lamindb.Feature` records.
            Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
        itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
        type: `Schema | None = None` A type.
        is_type: `bool = False` Distinguish types from instances of the type.
        otype: `str | None = None` An object type to define the structure of a composite schema.
        minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
        ordered_set: `bool = False` Whether features are required to be ordered.
        maximal_set: `bool = False` If `True`, no additional features are allowed.
        slot: `str | None = None` The slot name when this schema is used as a component in a
            composite schema.
        coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
            during validation, see :attr:`~lamindb.Schema.coerce_dtype`.

    .. dropdown:: Why does LaminDB model schemas, not just features?

        1. Performance: Imagine you measure the same panel of 20k transcripts in
           1M samples. By modeling the panel as a feature set, you can link all
           your artifacts against one feature set and only need to store 1M
           instead of 1M x 20k = 20B links.
        2. Interpretation: Model protein panels, gene panels, etc.
        3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.

        These reasons do not hold for label sets. Hence, LaminDB does not model label sets.

    Note:

        A feature set can be identified by the `hash` of its feature uids.
        It's stored in the `.hash` field.

        A `slot` provides a string key to access feature sets. For instance, for the schema of an
        `AnnData` object, it would be `'obs'` for `adata.obs`.

    See Also:
        :meth:`~lamindb.Schema.from_values`
            Create from values.
        :meth:`~lamindb.Schema.from_df`
            Create from dataframe columns.

    Examples:

        Create a schema (feature set) from df with types:

        >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
        >>> schema = ln.Schema.from_df(df)

        Create a schema (feature set) from features:

        >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
        >>> schema = ln.Schema(features)

        Create a schema (feature set) from identifier values:

        >>> import bionty as bt
        >>> schema = ln.Schema.from_values(adata.var["ensemble_id"], bt.Gene.ensembl_gene_id, organism="mouse").save()

    """

    class Meta(Record.Meta, TracksRun.Meta, TracksUpdates.Meta):
        abstract = False

    _name_field: str = "name"
    # maps the keys used inside `_aux["af"]` to (attribute name, python type)
    _aux_fields: dict[str, tuple[str, type]] = {
        "0": ("coerce_dtype", bool),
        "1": ("_index_feature_uid", str),
    }

    id: int = models.AutoField(primary_key=True)
    """Internal id, valid only in one DB instance."""
    uid: str = CharField(editable=False, unique=True, db_index=True, max_length=20)
    """A universal id.

    Generated via `ids.base62_20()` for new records; the content hash is stored in `.hash`.
    """
    name: str | None = CharField(max_length=150, null=True, db_index=True)
    """A name."""
    description: str | None = CharField(null=True, db_index=True)
    """A description."""
    n = IntegerField()
    """Number of features in the set (`-1` if the schema was created without features)."""
    dtype: str | None = CharField(max_length=64, null=True, editable=False)
    """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.

    For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
    """
    itype: str | None = CharField(
        max_length=120, db_index=True, null=True, editable=False
    )
    """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.

    Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.

    .. versionchanged:: 1.0.0
        Was called `registry` before.
    """
    type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
    """Type of schema.

    Allows to group schemas by type, e.g., all measurements evaluating gene expression vs. protein expression vs. multi modal.

    You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`.

    Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`.
    """
    records: Schema
    """Records of this type."""
    is_type: bool = BooleanField(default=False, db_index=True, null=True)
    """Distinguish types from instances of the type."""
    otype: str | None = CharField(max_length=64, db_index=True, null=True)
    """Default Python object type, e.g., DataFrame, AnnData."""
    hash: str | None = CharField(
        max_length=HASH_LENGTH, db_index=True, null=True, editable=False
    )
    """A hash of the set of feature identifiers.

    For a composite schema, the hash of hashes.
    """
    minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
    """Whether the schema contains a minimal set of linked features (default `True`).

    If `False`, no features are linked to this schema.

    If `True`, features are linked and considered as a minimally required set in validation.
    """
    ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
    """Whether features are required to be ordered (default `False`)."""
    maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
    """If `False`, additional features are allowed (default `False`).

    If `True`, the minimal set is a maximal set and no additional features are allowed.
    """
    components: Schema = ManyToManyField(
        "self", through="SchemaComponent", symmetrical=False, related_name="composites"
    )
    """Components of this schema."""
    composites: Schema
    """The composite schemas that contain this schema as a component.

    For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
    """
    features: Feature
    """The features contained in the schema."""
    params: Param
    """The params contained in the schema."""
    artifacts: Artifact
    """The artifacts that measure a feature set that matches this schema."""
    validated_artifacts: Artifact
    """The artifacts that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
    projects: Project
    """Linked projects."""
    _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
    # lamindb v2
    # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
    # -- the following two fields are dynamically removed from the API for now
    validated_by: Schema | None = ForeignKey(
        "self", PROTECT, related_name="validated_schemas", default=None, null=True
    )
    # """The schema that validated this schema during curation.

    # When performing validation, the schema that enforced validation is often less concrete than what is validated.

    # For instance, the set of measured features might be a superset of the minimally required set of features.
    # """
    # validated_schemas: Schema
    # """The schemas that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
    composite: Schema | None = ForeignKey(
        "self", PROTECT, related_name="+", default=None, null=True
    )
    # The legacy foreign key
    slot: str | None = CharField(max_length=100, db_index=True, null=True)
    # The legacy slot
270
+
271
@overload
def __init__(
    self,
    features: Iterable[Record] | None = None,
    components: dict[str, Schema] | None = None,
    name: str | None = None,
    description: str | None = None,
    dtype: str | None = None,
    itype: str | Registry | FieldAttr | None = None,
    type: Schema | None = None,
    is_type: bool = False,
    otype: str | None = None,
    minimal_set: bool = True,
    ordered_set: bool = False,
    maximal_set: bool = False,
    slot: str | None = None,
    coerce_dtype: bool = False,
): ...

@overload
def __init__(
    self,
    *db_args,
): ...

def __init__(
    self,
    *args,
    **kwargs,
):
    """Create a schema from features or components, or re-hydrate one from a DB row.

    If the number of positional arguments equals the number of concrete model
    fields, the call is treated as a DB load and forwarded unchanged.
    Otherwise at most one positional argument (`features`) is accepted and
    everything else has to be passed by keyword.

    If a schema with the same hash already exists, this instance is
    initialized from that stored record instead of creating a new one.

    Raises:
        ValueError: For more than one positional argument or unknown keywords.
        InvalidArgument: If `components` is passed without `otype`, or a
            component has not been saved yet.
    """
    if len(args) == len(self._meta.concrete_fields):
        super().__init__(*args, **kwargs)
        return None
    if len(args) > 1:
        raise ValueError("Only one non-keyword arg allowed: features")

    features: Iterable[Record] | None = (
        args[0] if args else kwargs.pop("features", [])
    )
    # typing here anticipates transitioning to a ManyToMany
    # between composites and components similar to feature_sets
    # in lamindb v2
    components: dict[str, Schema] = kwargs.pop("components", {})
    name: str | None = kwargs.pop("name", None)
    description: str | None = kwargs.pop("description", None)
    dtype: str | None = kwargs.pop("dtype", None)
    itype: str | Record | DeferredAttribute | None = kwargs.pop("itype", None)
    type: Schema | None = kwargs.pop("type", None)
    is_type: bool = kwargs.pop("is_type", False)
    otype: str | None = kwargs.pop("otype", None)
    minimal_set: bool = kwargs.pop("minimal_set", True)
    ordered_set: bool = kwargs.pop("ordered_set", False)
    maximal_set: bool = kwargs.pop("maximal_set", False)
    slot: str | None = kwargs.pop("slot", None)
    coerce_dtype: bool | None = kwargs.pop("coerce_dtype", None)

    if kwargs:
        # the list below mirrors exactly the keywords popped above
        raise ValueError(
            f"Unexpected keyword arguments: {', '.join(kwargs.keys())}\n"
            "Valid arguments are: features, components, name, description, dtype, "
            "itype, type, is_type, otype, minimal_set, ordered_set, maximal_set, "
            "slot, coerce_dtype"
        )

    if features:
        features_registry = validate_features(features)
        itype_compare = features_registry.__get_name_with_module__()
        if itype is not None:
            assert itype == itype_compare, str(itype_compare)  # noqa: S101
        else:
            itype = itype_compare
        n_features = len(features)
    else:
        # sentinel: the schema was created without linked features
        n_features = -1
    if dtype is None:
        dtype = None if itype is not None and itype == "Feature" else NUMBER_TYPE
    else:
        dtype = get_type_str(dtype)
    if components:
        itype = "Composite"
        if otype is None:
            raise InvalidArgument("Please pass otype != None for composite schemas")
    if itype is not None and not isinstance(itype, str):
        itype_str = get_dtype_str_from_dtype(itype, is_itype=True)
    else:
        itype_str = itype
    validated_kwargs = {
        "name": name,
        "description": description,
        "type": type,
        "dtype": dtype,
        "is_type": is_type,
        "otype": otype,
        "n": n_features,
        "itype": itype_str,
        "minimal_set": minimal_set,
        "ordered_set": ordered_set,
        "maximal_set": maximal_set,
    }
    if coerce_dtype:
        validated_kwargs["_aux"] = {"af": {"0": coerce_dtype}}
    # the hash identifies the schema: feature uids, component hashes, or,
    # without either, the stringified keyword values collected so far
    if features:
        hash = hash_set({feature.uid for feature in features})
    elif components:
        hash = hash_set({component.hash for component in components.values()})
    else:
        hash = hash_set({str(value) for value in validated_kwargs.values()})
    validated_kwargs["hash"] = hash
    validated_kwargs["slot"] = slot
    schema = Schema.filter(hash=hash).one_or_none()
    if schema is not None:
        logger.important(f"returning existing schema with same hash: {schema}")
        init_self_from_db(self, schema)
        update_attributes(self, validated_kwargs)
        return None
    self._components: dict[str, Schema] = {}
    if features:
        # consumed by save() to create the link rows in the passed order
        self._features = (get_related_name(features_registry), features)  # type: ignore
    elif components:
        # dedicated loop name so the `slot` argument is not shadowed
        for component_slot, component in components.items():
            if component._state.adding:
                raise InvalidArgument(
                    f"component {component_slot} {component} must be saved before use"
                )
        self._components = components
        self._slots = components
    validated_kwargs["uid"] = ids.base62_20()
    super().__init__(**validated_kwargs)
400
+
401
@classmethod
def from_values(  # type: ignore
    cls,
    values: ListLike,
    field: FieldAttr = Feature.name,
    type: str | None = None,
    name: str | None = None,
    mute: bool = False,
    organism: Record | str | None = None,
    source: Record | None = None,
    raise_validation_error: bool = True,
) -> Schema | None:
    """Create feature set for validated features.

    Args:
        values: A list of values, like feature names or ids.
        field: The field of a reference registry to map values.
        type: The simple type.
            Defaults to `None` if reference registry is :class:`~lamindb.Feature`,
            defaults to `"num"` otherwise.
        name: A name.
        mute: Passed on to the registry's `validate()` call.
        organism: An organism to resolve gene mapping.
        source: A public ontology to resolve feature identifier mapping.
        raise_validation_error: Whether to raise a validation error if some values are not valid.

    Returns:
        The schema, or `None` if `raise_validation_error` is `False` and
        none of the values could be validated.

    Raises:
        TypeError: If `field` is not a record field.
        ValueError: If `values` is empty.
        ValidationError: If some values are not valid.

    Examples:

        >>> for feat in ["feat11", "feat21"]:
        ...     ln.Feature(name=feat, dtype="str").save()
        >>> schema = ln.Schema.from_values(["feat11", "feat21"])

        >>> genes = ["ENSG00000139618", "ENSG00000198786"]
        >>> schema = ln.Schema.from_values(genes, bt.Gene.ensembl_gene_id, "float")
    """
    if not isinstance(field, FieldAttr):
        raise TypeError(
            "Argument `field` must be a Record field, e.g., `Feature.name`"
        )
    if len(values) == 0:
        raise ValueError("Provide a list of at least one value")
    if isinstance(values, DICT_KEYS_TYPE):
        # a dict keys view is converted to a list before validation
        values = list(values)
    registry = field.field.model
    if registry != Feature and type is None:
        type = NUMBER_TYPE
        logger.debug("setting feature set to 'number'")
    validated = registry.validate(values, field=field, mute=mute, organism=organism)
    values_array = np.array(values)
    validated_values = values_array[validated]
    if validated.sum() != len(values):
        not_validated_values = values_array[~validated]
        msg = (
            f"These values could not be validated: {not_validated_values.tolist()}\n"
            f"If there are no typos, add them to their registry: {registry.__name__}"
        )
        if raise_validation_error:
            raise ValidationError(msg)
        elif len(validated_values) == 0:
            return None  # temporarily return None here
    validated_features = registry.from_values(
        validated_values,
        field=field,
        organism=organism,
        source=source,
    )
    schema = Schema(
        features=validated_features,
        name=name,
        dtype=get_type_str(type),
    )
    return schema
474
+
475
@classmethod
def from_df(
    cls,
    df: pd.DataFrame,
    field: FieldAttr = Feature.name,
    name: str | None = None,
    mute: bool = False,
    organism: Record | str | None = None,
    source: Record | None = None,
) -> Schema | None:
    """Create feature set for validated features.

    The column labels of `df` are validated against `field`.

    Args:
        df: The dataframe whose columns define the features.
        field: The field of a reference registry to map column labels.
        name: A name.
        mute: If `True`, suppress the warning about no validated features;
            also passed on to the registry's `validate()` call.
        organism: An organism to resolve gene mapping.
        source: A public ontology to resolve feature identifier mapping;
            only forwarded for registries other than `Feature`.

    Returns:
        The schema, or `None` if no column could be validated.

    Raises:
        ValueError: For a non-`Feature` registry, if the validated columns
            do not share a single dtype.
    """
    registry = field.field.model
    validated = registry.validate(
        df.columns, field=field, mute=mute, organism=organism
    )
    if validated.sum() == 0:
        # only stay silent when the caller asked for it
        if not mute:
            logger.warning("no validated features, skip creating feature set")
        return None
    if registry == Feature:
        validated_features = Feature.from_values(  # type: ignore
            df.columns, field=field, organism=organism
        )
        schema = Schema(
            validated_features, name=name, dtype=None, otype="DataFrame"
        )
    else:
        # a single schema-level dtype requires homogeneous column dtypes
        dtypes = [col.dtype for (_, col) in df.loc[:, validated].items()]
        if len(set(dtypes)) != 1:
            raise ValueError(f"data types are heterogeneous: {set(dtypes)}")
        dtype = convert_pandas_dtype_to_lamin_dtype(dtypes[0])
        validated_features = registry.from_values(
            df.columns[validated],
            field=field,
            organism=organism,
            source=source,
        )
        schema = Schema(
            features=validated_features,
            name=name,
            dtype=get_type_str(dtype),
            otype="DataFrame",
        )
    return schema
519
+
520
def save(self, *args, **kwargs) -> Schema:
    """Save the schema, then create link rows for its components and features.

    Component links are written when `_components` is set, feature links when
    `_features` is set. Both bulk inserts ignore conflicts.

    Returns:
        The schema itself.
    """
    from .save import bulk_create

    super().save(*args, **kwargs)
    if hasattr(self, "_components"):
        # analogous to save_schema_links in core._data.py
        # which is called to save feature sets in artifact.save()
        component_links = [
            Schema.components.through(
                composite_id=self.id,
                component_id=component.id,
                slot=slot,
            )
            for slot, component in self._components.items()
        ]
        bulk_create(component_links, ignore_conflicts=True)
    if hasattr(self, "_features"):
        assert self.n > 0  # noqa: S101
        related_name, records = self._features
        # only the following method preserves the order
        # .set() does not preserve the order but orders by
        # the feature primary key
        through_model = getattr(self, related_name).through
        # itype is "Model" or "module.Model"; the link column is named after the model
        itype_parts = self.itype.split(".")
        model_name = itype_parts[0] if len(itype_parts) == 1 else itype_parts[1]
        related_field_id = f"{model_name.lower()}_id"
        feature_links = [
            through_model(**{"schema_id": self.id, related_field_id: record.id})
            for record in records
        ]
        through_model.objects.bulk_create(feature_links, ignore_conflicts=True)
    return self
556
+
557
@property
def members(self) -> QuerySet:
    """A queryset for the individual records of the set.

    For a schema that has not been saved yet, the in-memory records passed at
    construction are returned instead (a plain list, not a queryset). If the
    schema was created without features, that list is empty.
    """
    if self._state.adding:
        # this should return a queryset and not a list...
        # need to fix this
        if hasattr(self, "_features"):
            return self._features[1]
        # `_features` is only set when features were passed to the constructor
        return []
    related_name = self._get_related_name()
    if related_name is None:
        related_name = "features"
    # order by the link row id to reproduce the order in which links were created
    return getattr(self, related_name).order_by("links_schema__id")
568
+
569
@property
def coerce_dtype(self) -> bool:
    """Whether dtypes should be coerced during validation.

    For example, an `object`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.

    The flag is stored under `_aux["af"]["0"]`; if it is absent, the value is `False`.
    """
    if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:  # type: ignore
        return self._aux["af"]["0"]  # type: ignore
    return False

@coerce_dtype.setter
def coerce_dtype(self, value: bool) -> None:
    # create the nested containers on demand before storing the flag
    if self._aux is None:  # type: ignore
        self._aux = {}  # type: ignore
    if "af" not in self._aux:
        self._aux["af"] = {}
    self._aux["af"]["0"] = value
595
+
596
+ # @property
597
+ # def index_feature(self) -> None | Feature:
598
+ # # index_feature: `Record | None = None` A :class:`~lamindb.Feature` to validate the index of a `DataFrame`.
599
+ # """The uid of the index feature, if `index_feature` was set."""
600
+ # if self._index_feature_uid is None:
601
+ # return None
602
+ # else:
603
+ # return self.features.get(uid=self._index_feature_uid)
604
+
605
+ # @property
606
+ # def _index_feature_uid(self) -> None | str:
607
+ # """The uid of the index feature, if `index_feature` was set."""
608
+ # if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
609
+ # return self._aux["af"]["1"]
610
+ # else:
611
+ # return None
612
+
613
+ # @_index_feature_uid.setter
614
+ # def _index_feature_uid(self, value: str) -> None:
615
+ # if self._aux is None:
616
+ # self._aux = {}
617
+ # if "af" not in self._aux:
618
+ # self._aux["af"] = {}
619
+ # self._aux["af"]["1"] = value
620
+
621
@property
@deprecated("itype")
def registry(self) -> str:
    """Deprecated alias that returns `itype`."""
    return self.itype

@registry.setter
def registry(self, value) -> None:
    # writes go straight to `itype`
    self.itype = value
629
+
630
@property
def slots(self) -> dict[str, Schema]:
    """Slots.

    Maps slot names to component schemas. The mapping is cached on the
    instance as `_slots`; schemas whose `itype` is not `"Composite"` yield an
    empty dict.

    Examples::

        # define composite schema
        anndata_schema = ln.Schema(
            name="small_dataset1_anndata_schema",
            otype="AnnData",
            components={"obs": obs_schema, "var": var_schema},
        ).save()

        # access slots
        anndata_schema.slots
        # {'obs': <Schema: obs_schema>, 'var': <Schema: var_schema>}
    """
    if hasattr(self, "_slots"):
        return self._slots
    if self.itype != "Composite":
        return {}
    # build the mapping from the link rows of this composite and cache it
    component_by_slot = {}
    for link in self.components.through.filter(composite_id=self.id).all():
        component_by_slot[link.slot] = link.component
    self._slots = component_by_slot
    return component_by_slot
656
+
657
def describe(self, return_str=False) -> None | str:
    """Describe schema.

    Args:
        return_str: If `True`, return the description instead of printing it.

    Returns:
        The description if `return_str` is `True`, otherwise `None`.
    """
    message = str(self)
    # display slots for composite schemas
    if self.itype == "Composite":
        # augmented assignment: a bare `message + ...` would discard the header
        message += "\nslots:"
        for slot, schema in self.slots.items():
            message += f"\n {slot}: " + str(schema)
    if return_str:
        return message
    print(message)
    return None
670
+
671
+
672
def get_type_str(dtype: str | None) -> str | None:
    """Return the string form of a dtype.

    `None` and strings are passed through unchanged; any other object is
    represented by its `__name__`.
    """
    if dtype is None or isinstance(dtype, str):
        return dtype
    return dtype.__name__  # type: ignore
678
+
679
+
680
def _get_related_name(self: Schema) -> str | None:
    """Look up the related name under which this schema's `itype` is linked.

    Returns:
        The related name, or `None` if `itype` is not among the related models.
    """
    related_models = dict_related_model_to_related_name(self, instance=self._state.db)
    # .get() yields None for an unknown itype; callers must handle that case
    return related_models.get(self.itype)
684
+
685
+
686
class SchemaFeature(BasicRecord, LinkORM):
    """Link record between a `Schema` and a `Feature`."""

    id: int = models.BigAutoField(primary_key=True)
    # on-delete rule for the schema side: CASCADE
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature")
    # on-delete rule for the feature side: PROTECT
    feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema")

    class Meta:
        # a given feature can be linked to a given schema only once
        unique_together = ("schema", "feature")
693
+
694
+
695
class SchemaParam(BasicRecord, LinkORM):
    """Link record between a `Schema` and a `Param`."""

    id: int = models.BigAutoField(primary_key=True)
    # related_name="+" on both keys: no reverse accessor is created
    schema: Schema = ForeignKey(Schema, CASCADE, related_name="+")
    param: Param = ForeignKey(Param, PROTECT, related_name="+")

    class Meta:
        # a given param can be linked to a given schema only once
        unique_together = ("schema", "param")
702
+
703
+
704
class ArtifactSchema(BasicRecord, LinkORM, TracksRun):
    """Link record between an `Artifact` and a `Schema`, optionally keyed by a slot."""

    id: int = models.BigAutoField(primary_key=True)
    # the artifact model is referenced by name ("Artifact") rather than by class
    artifact: Artifact = ForeignKey("Artifact", CASCADE, related_name="_links_schema")
    schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact")
    slot: str | None = CharField(null=True)
    feature_ref_is_semantic: bool | None = BooleanField(null=True)

    class Meta:
        # per artifact: each schema at most once and each slot at most once
        unique_together = (("artifact", "schema"), ("artifact", "slot"))
713
+
714
+
715
class SchemaComponent(BasicRecord, LinkORM, TracksRun):
    """Link record between a composite `Schema` and one of its component schemas.

    This is the `through` model of `Schema.components`.
    """

    id: int = models.BigAutoField(primary_key=True)
    composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_composite")
    component: Schema = ForeignKey(Schema, PROTECT, related_name="links_component")
    # name under which the component is accessible on the composite (see Schema.slots)
    slot: str | None = CharField(null=True)

    class Meta:
        # per composite: each component at most once and each slot at most once
        unique_together = (("composite", "component"), ("composite", "slot"))
723
+
724
+
725
# attach the module-level helper as a method; Schema.members calls it as
# self._get_related_name()
Schema._get_related_name = _get_related_name
# excluded on docs via
# https://github.com/laminlabs/lndocs/blob/8c1963de65445107ea69b3fd59354c3828e067d1/lndocs/lamin_sphinx/__init__.py#L584-L588
# remove the following attributes from the class so they are not part of the
# exposed API
delattr(Schema, "validated_by")  # we don't want to expose these
delattr(Schema, "validated_by_id")  # we don't want to expose these
delattr(Schema, "validated_schemas")  # we don't want to expose these
delattr(Schema, "composite")  # we don't want to expose these
delattr(Schema, "composite_id")  # we don't want to expose these