lamindb 1.5.3__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +25 -6
- lamindb/_finish.py +5 -5
- lamindb/_tracked.py +1 -1
- lamindb/_view.py +4 -4
- lamindb/core/_context.py +32 -6
- lamindb/core/_settings.py +1 -1
- lamindb/core/datasets/mini_immuno.py +8 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_anndata_accessor.py +9 -9
- lamindb/core/storage/_valid_suffixes.py +1 -0
- lamindb/core/storage/_zarr.py +32 -107
- lamindb/curators/__init__.py +19 -2
- lamindb/curators/_cellxgene_schemas/__init__.py +3 -3
- lamindb/curators/_legacy.py +15 -19
- lamindb/curators/core.py +247 -80
- lamindb/errors.py +2 -2
- lamindb/migrations/0069_squashed.py +8 -8
- lamindb/migrations/0071_lamindbv1_migrate_schema.py +3 -3
- lamindb/migrations/0073_merge_ourprojects.py +7 -7
- lamindb/migrations/0075_lamindbv1_part5.py +1 -1
- lamindb/migrations/0077_lamindbv1_part6b.py +3 -3
- lamindb/migrations/0080_polish_lamindbv1.py +2 -2
- lamindb/migrations/0088_schema_components.py +1 -1
- lamindb/migrations/0090_runproject_project_runs.py +2 -2
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +1 -1
- lamindb/migrations/0094_writeloglock_writelogmigrationstate_and_more.py +84 -0
- lamindb/migrations/0095_remove_rundata_flextable.py +155 -0
- lamindb/migrations/0096_remove_artifact__param_values_and_more.py +266 -0
- lamindb/migrations/0097_remove_schemaparam_param_remove_paramvalue_param_and_more.py +27 -0
- lamindb/migrations/0098_alter_feature_type_alter_project_type_and_more.py +656 -0
- lamindb/migrations/0099_alter_writelog_seqno.py +22 -0
- lamindb/migrations/0100_branch_alter_artifact__branch_code_and_more.py +102 -0
- lamindb/migrations/0101_alter_artifact_hash_alter_feature_name_and_more.py +444 -0
- lamindb/migrations/0102_remove_writelog_branch_code_and_more.py +72 -0
- lamindb/migrations/0103_remove_writelog_migration_state_and_more.py +46 -0
- lamindb/migrations/{0090_squashed.py → 0103_squashed.py} +1013 -1009
- lamindb/models/__init__.py +35 -18
- lamindb/models/_describe.py +4 -4
- lamindb/models/_django.py +38 -4
- lamindb/models/_feature_manager.py +66 -123
- lamindb/models/_from_values.py +13 -13
- lamindb/models/_label_manager.py +8 -6
- lamindb/models/_relations.py +7 -7
- lamindb/models/artifact.py +166 -156
- lamindb/models/can_curate.py +25 -25
- lamindb/models/collection.py +48 -18
- lamindb/models/core.py +3 -3
- lamindb/models/feature.py +88 -60
- lamindb/models/has_parents.py +17 -17
- lamindb/models/project.py +52 -24
- lamindb/models/query_manager.py +5 -5
- lamindb/models/query_set.py +61 -37
- lamindb/models/record.py +158 -1583
- lamindb/models/run.py +39 -176
- lamindb/models/save.py +6 -6
- lamindb/models/schema.py +32 -43
- lamindb/models/sqlrecord.py +1743 -0
- lamindb/models/transform.py +17 -33
- lamindb/models/ulabel.py +21 -15
- {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/METADATA +7 -11
- lamindb-1.6.0.dist-info/RECORD +118 -0
- lamindb/core/storage/_anndata_sizes.py +0 -41
- lamindb/models/flextable.py +0 -163
- lamindb-1.5.3.dist-info/RECORD +0 -109
- {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/LICENSE +0 -0
- {lamindb-1.5.3.dist-info → lamindb-1.6.0.dist-info}/WHEEL +0 -0
lamindb/models/can_curate.py
CHANGED
@@ -14,19 +14,19 @@ from ._from_values import (
|
|
14
14
|
_from_values,
|
15
15
|
get_organism_record_from_field,
|
16
16
|
)
|
17
|
-
from .
|
17
|
+
from .sqlrecord import SQLRecord, get_name_field
|
18
18
|
|
19
19
|
if TYPE_CHECKING:
|
20
20
|
from lamin_utils._inspect import InspectResult
|
21
21
|
|
22
22
|
from lamindb.base.types import ListLike, StrField
|
23
23
|
|
24
|
-
from .query_set import
|
24
|
+
from .query_set import SQLRecordList
|
25
25
|
|
26
26
|
|
27
|
-
def _check_if_record_in_db(record: str |
|
27
|
+
def _check_if_record_in_db(record: str | SQLRecord | None, using_key: str | None):
|
28
28
|
"""Check if the record is from the using_key DB."""
|
29
|
-
if isinstance(record,
|
29
|
+
if isinstance(record, SQLRecord):
|
30
30
|
if using_key is not None and using_key != "default":
|
31
31
|
if record._state.db != using_key:
|
32
32
|
raise ValueError(
|
@@ -55,8 +55,8 @@ def _inspect(
|
|
55
55
|
field: StrField | None = None,
|
56
56
|
*,
|
57
57
|
mute: bool = False,
|
58
|
-
organism: str |
|
59
|
-
source:
|
58
|
+
organism: str | SQLRecord | None = None,
|
59
|
+
source: SQLRecord | None = None,
|
60
60
|
from_source: bool = True,
|
61
61
|
strict_source: bool = False,
|
62
62
|
) -> pd.DataFrame | dict[str, list[str]]:
|
@@ -69,7 +69,7 @@ def _inspect(
|
|
69
69
|
queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all()
|
70
70
|
registry = queryset.model
|
71
71
|
model_name = registry._meta.model.__name__
|
72
|
-
if isinstance(source,
|
72
|
+
if isinstance(source, SQLRecord):
|
73
73
|
_check_if_record_in_db(source, queryset.db)
|
74
74
|
# if strict_source mode, restrict the query to the passed ontology source
|
75
75
|
# otherwise, inspect across records present in the DB from all ontology sources and no-source
|
@@ -158,8 +158,8 @@ def _validate(
|
|
158
158
|
field: StrField | None = None,
|
159
159
|
*,
|
160
160
|
mute: bool = False,
|
161
|
-
organism: str |
|
162
|
-
source:
|
161
|
+
organism: str | SQLRecord | None = None,
|
162
|
+
source: SQLRecord | None = None,
|
163
163
|
strict_source: bool = False,
|
164
164
|
) -> np.ndarray:
|
165
165
|
"""{}""" # noqa: D415
|
@@ -172,7 +172,7 @@ def _validate(
|
|
172
172
|
|
173
173
|
queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all()
|
174
174
|
registry = queryset.model
|
175
|
-
if isinstance(source,
|
175
|
+
if isinstance(source, SQLRecord):
|
176
176
|
_check_if_record_in_db(source, queryset.db)
|
177
177
|
if strict_source:
|
178
178
|
queryset = queryset.filter(source=source)
|
@@ -224,8 +224,8 @@ def _standardize(
|
|
224
224
|
source_aware: bool = True,
|
225
225
|
keep: Literal["first", "last", False] = "first",
|
226
226
|
synonyms_field: str = "synonyms",
|
227
|
-
organism: str |
|
228
|
-
source:
|
227
|
+
organism: str | SQLRecord | None = None,
|
228
|
+
source: SQLRecord | None = None,
|
229
229
|
strict_source: bool = False,
|
230
230
|
) -> list[str] | dict[str, str]:
|
231
231
|
"""{}""" # noqa: D415
|
@@ -240,7 +240,7 @@ def _standardize(
|
|
240
240
|
)
|
241
241
|
queryset = cls.all() if isinstance(cls, (QuerySet, Manager)) else cls.objects.all()
|
242
242
|
registry = queryset.model
|
243
|
-
if isinstance(source,
|
243
|
+
if isinstance(source, SQLRecord):
|
244
244
|
_check_if_record_in_db(source, queryset.db)
|
245
245
|
if strict_source:
|
246
246
|
queryset = queryset.filter(source=source)
|
@@ -431,7 +431,7 @@ def _check_synonyms_field_exist(record: CanCurate):
|
|
431
431
|
|
432
432
|
def _filter_queryset_with_organism(
|
433
433
|
queryset: QuerySet,
|
434
|
-
organism:
|
434
|
+
organism: SQLRecord | None = None,
|
435
435
|
values_list_field: str | None = None,
|
436
436
|
values_list_fields: list[str] | None = None,
|
437
437
|
):
|
@@ -453,7 +453,7 @@ def _filter_queryset_with_organism(
|
|
453
453
|
|
454
454
|
|
455
455
|
class CanCurate:
|
456
|
-
"""Base class providing :class:`~lamindb.models.
|
456
|
+
"""Base class providing :class:`~lamindb.models.SQLRecord`-based validation."""
|
457
457
|
|
458
458
|
@classmethod
|
459
459
|
def inspect(
|
@@ -462,8 +462,8 @@ class CanCurate:
|
|
462
462
|
field: StrField | None = None,
|
463
463
|
*,
|
464
464
|
mute: bool = False,
|
465
|
-
organism: Union[str,
|
466
|
-
source:
|
465
|
+
organism: Union[str, SQLRecord, None] = None,
|
466
|
+
source: SQLRecord | None = None,
|
467
467
|
from_source: bool = True,
|
468
468
|
strict_source: bool = False,
|
469
469
|
) -> InspectResult:
|
@@ -518,8 +518,8 @@ class CanCurate:
|
|
518
518
|
field: StrField | None = None,
|
519
519
|
*,
|
520
520
|
mute: bool = False,
|
521
|
-
organism: Union[str,
|
522
|
-
source:
|
521
|
+
organism: Union[str, SQLRecord, None] = None,
|
522
|
+
source: SQLRecord | None = None,
|
523
523
|
strict_source: bool = False,
|
524
524
|
) -> np.ndarray:
|
525
525
|
"""Validate values against existing values of a string field.
|
@@ -571,16 +571,16 @@ class CanCurate:
|
|
571
571
|
values: ListLike,
|
572
572
|
field: StrField | None = None,
|
573
573
|
create: bool = False,
|
574
|
-
organism: Union[
|
575
|
-
source:
|
574
|
+
organism: Union[SQLRecord, str, None] = None,
|
575
|
+
source: SQLRecord | None = None,
|
576
576
|
mute: bool = False,
|
577
|
-
) ->
|
577
|
+
) -> SQLRecordList:
|
578
578
|
"""Bulk create validated records by parsing values for an identifier such as a name or an id).
|
579
579
|
|
580
580
|
Args:
|
581
581
|
values: A list of values for an identifier, e.g.
|
582
582
|
`["name1", "name2"]`.
|
583
|
-
field: A `
|
583
|
+
field: A `SQLRecord` field to look up, e.g., `bt.CellMarker.name`.
|
584
584
|
create: Whether to create records if they don't exist.
|
585
585
|
organism: A `bionty.Organism` name or record.
|
586
586
|
source: A `bionty.Source` record to validate against to create records for.
|
@@ -629,8 +629,8 @@ class CanCurate:
|
|
629
629
|
source_aware: bool = True,
|
630
630
|
keep: Literal["first", "last", False] = "first",
|
631
631
|
synonyms_field: str = "synonyms",
|
632
|
-
organism: Union[str,
|
633
|
-
source:
|
632
|
+
organism: Union[str, SQLRecord, None] = None,
|
633
|
+
source: SQLRecord | None = None,
|
634
634
|
strict_source: bool = False,
|
635
635
|
) -> list[str] | dict[str, str]:
|
636
636
|
"""Maps input synonyms to standardized names.
|
lamindb/models/collection.py
CHANGED
@@ -37,15 +37,15 @@ from .artifact import (
|
|
37
37
|
save_schema_links,
|
38
38
|
)
|
39
39
|
from .has_parents import view_lineage
|
40
|
-
from .
|
41
|
-
|
42
|
-
|
43
|
-
|
40
|
+
from .run import Run, TracksRun, TracksUpdates
|
41
|
+
from .sqlrecord import (
|
42
|
+
BaseSQLRecord,
|
43
|
+
IsLink,
|
44
|
+
SQLRecord,
|
44
45
|
_get_record_kwargs,
|
45
46
|
init_self_from_db,
|
46
47
|
update_attributes,
|
47
48
|
)
|
48
|
-
from .run import Run, TracksRun, TracksUpdates
|
49
49
|
|
50
50
|
if TYPE_CHECKING:
|
51
51
|
from collections.abc import Iterable, Iterator
|
@@ -128,7 +128,7 @@ def _load_concat_artifacts(
|
|
128
128
|
return concat_object
|
129
129
|
|
130
130
|
|
131
|
-
class Collection(
|
131
|
+
class Collection(SQLRecord, IsVersioned, TracksRun, TracksUpdates):
|
132
132
|
"""Collections of artifacts.
|
133
133
|
|
134
134
|
Collections provide a simple way of versioning collections of artifacts.
|
@@ -158,7 +158,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
158
158
|
|
159
159
|
"""
|
160
160
|
|
161
|
-
class Meta(
|
161
|
+
class Meta(SQLRecord.Meta, IsVersioned.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
162
162
|
abstract = False
|
163
163
|
|
164
164
|
_len_full_uid: int = 20
|
@@ -272,7 +272,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
272
272
|
run: Run | None = kwargs.pop("run", None)
|
273
273
|
revises: Collection | None = kwargs.pop("revises", None)
|
274
274
|
version: str | None = kwargs.pop("version", None)
|
275
|
-
|
275
|
+
branch_id: int | None = kwargs.pop("branch_id", 1)
|
276
276
|
key: str
|
277
277
|
if "name" in kwargs:
|
278
278
|
key = kwargs.pop("name")
|
@@ -340,7 +340,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
340
340
|
hash=hash,
|
341
341
|
run=run,
|
342
342
|
version=version,
|
343
|
-
|
343
|
+
branch_id=branch_id,
|
344
344
|
revises=revises,
|
345
345
|
_skip_validation=_skip_validation,
|
346
346
|
)
|
@@ -349,6 +349,38 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
349
349
|
_track_run_input(revises, run=run)
|
350
350
|
_track_run_input(artifacts, run=run)
|
351
351
|
|
352
|
+
@classmethod
|
353
|
+
def get(
|
354
|
+
cls,
|
355
|
+
idlike: int | str | None = None,
|
356
|
+
*,
|
357
|
+
is_run_input: bool | Run = False,
|
358
|
+
**expressions,
|
359
|
+
) -> Artifact:
|
360
|
+
"""Get a single collection.
|
361
|
+
|
362
|
+
Args:
|
363
|
+
idlike: Either a uid stub, uid or an integer id.
|
364
|
+
is_run_input: Whether to track this collection as run input.
|
365
|
+
expressions: Fields and values passed as Django query expressions.
|
366
|
+
|
367
|
+
Raises:
|
368
|
+
:exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.
|
369
|
+
|
370
|
+
See Also:
|
371
|
+
- Method in `SQLRecord` base class: :meth:`~lamindb.models.SQLRecord.get`
|
372
|
+
|
373
|
+
Examples:
|
374
|
+
|
375
|
+
::
|
376
|
+
|
377
|
+
collection = ln.Collection.get("okxPW6GIKBfRBE3B0000")
|
378
|
+
collection = ln.Collection.get(key="scrna/collection1")
|
379
|
+
"""
|
380
|
+
from .query_set import QuerySet
|
381
|
+
|
382
|
+
return QuerySet(model=cls).get(idlike, is_run_input=is_run_input, **expressions)
|
383
|
+
|
352
384
|
def append(self, artifact: Artifact, run: Run | None = None) -> Collection:
|
353
385
|
"""Append an artifact to the collection.
|
354
386
|
|
@@ -557,14 +589,12 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
557
589
|
|
558
590
|
>>> collection.delete()
|
559
591
|
"""
|
560
|
-
# change
|
561
|
-
|
562
|
-
if self.
|
563
|
-
self.
|
592
|
+
# change branch_id to trash
|
593
|
+
trash_branch_id = -1
|
594
|
+
if self.branch_id > trash_branch_id and permanent is not True:
|
595
|
+
self.branch_id = trash_branch_id
|
564
596
|
self.save()
|
565
|
-
logger.warning(
|
566
|
-
f"moved collection to trash (_branch_code = {trash__branch_code})"
|
567
|
-
)
|
597
|
+
logger.warning(f"moved collection to trash (branch_id = {trash_branch_id})")
|
568
598
|
return
|
569
599
|
|
570
600
|
# permanent delete
|
@@ -619,7 +649,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
|
|
619
649
|
|
620
650
|
>>> collection.restore()
|
621
651
|
"""
|
622
|
-
self.
|
652
|
+
self.branch_id = 1
|
623
653
|
self.save()
|
624
654
|
|
625
655
|
@property
|
@@ -691,7 +721,7 @@ def from_artifacts(artifacts: Iterable[Artifact]) -> tuple[str, dict[str, str]]:
|
|
691
721
|
return hash
|
692
722
|
|
693
723
|
|
694
|
-
class CollectionArtifact(
|
724
|
+
class CollectionArtifact(BaseSQLRecord, IsLink, TracksRun):
|
695
725
|
id: int = models.BigAutoField(primary_key=True)
|
696
726
|
collection: Collection = ForeignKey(
|
697
727
|
Collection, CASCADE, related_name="links_artifact"
|
lamindb/models/core.py
CHANGED
@@ -12,8 +12,8 @@ from lamindb.base.fields import (
|
|
12
12
|
)
|
13
13
|
|
14
14
|
from ..base.ids import base62_12
|
15
|
-
from .record import Record
|
16
15
|
from .run import TracksRun, TracksUpdates
|
16
|
+
from .sqlrecord import SQLRecord
|
17
17
|
|
18
18
|
if TYPE_CHECKING:
|
19
19
|
from pathlib import Path
|
@@ -23,7 +23,7 @@ if TYPE_CHECKING:
|
|
23
23
|
from .artifact import Artifact
|
24
24
|
|
25
25
|
|
26
|
-
class Storage(
|
26
|
+
class Storage(SQLRecord, TracksRun, TracksUpdates):
|
27
27
|
"""Storage locations of artifacts such as S3 buckets or local directories.
|
28
28
|
|
29
29
|
A storage location is either a directory/folder (local or in the cloud) or
|
@@ -68,7 +68,7 @@ class Storage(Record, TracksRun, TracksUpdates):
|
|
68
68
|
>>> ln.settings.storage = "./storage_2" # or a cloud bucket
|
69
69
|
"""
|
70
70
|
|
71
|
-
class Meta(
|
71
|
+
class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
72
72
|
abstract = False
|
73
73
|
|
74
74
|
_name_field: str = "root"
|
lamindb/models/feature.py
CHANGED
@@ -6,12 +6,12 @@ from typing import TYPE_CHECKING, Any, get_args, overload
|
|
6
6
|
import numpy as np
|
7
7
|
import pandas as pd
|
8
8
|
from django.db import models
|
9
|
-
from django.db.models import CASCADE, PROTECT
|
9
|
+
from django.db.models import CASCADE, PROTECT
|
10
10
|
from django.db.models.query_utils import DeferredAttribute
|
11
11
|
from django.db.utils import IntegrityError
|
12
12
|
from lamin_utils import logger
|
13
13
|
from lamindb_setup._init_instance import get_schema_module_name
|
14
|
-
from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict
|
14
|
+
from lamindb_setup.core.hashing import HASH_LENGTH, hash_dict, hash_string
|
15
15
|
from pandas.api.types import CategoricalDtype, is_string_dtype
|
16
16
|
from pandas.core.dtypes.base import ExtensionDtype
|
17
17
|
|
@@ -28,12 +28,12 @@ from lamindb.errors import FieldValidationError, ValidationError
|
|
28
28
|
from ..base.ids import base62_12
|
29
29
|
from ._relations import dict_module_name_to_model_name
|
30
30
|
from .can_curate import CanCurate
|
31
|
-
from .query_set import
|
32
|
-
from .record import BasicRecord, Record, Registry, _get_record_kwargs
|
31
|
+
from .query_set import SQLRecordList
|
33
32
|
from .run import (
|
34
33
|
TracksRun,
|
35
34
|
TracksUpdates,
|
36
35
|
)
|
36
|
+
from .sqlrecord import BaseSQLRecord, Registry, SQLRecord, _get_record_kwargs
|
37
37
|
|
38
38
|
if TYPE_CHECKING:
|
39
39
|
from collections.abc import Iterable
|
@@ -50,6 +50,18 @@ def parse_dtype(dtype_str: str, is_param: bool = False) -> list[dict[str, str]]:
|
|
50
50
|
allowed_dtypes = FEATURE_DTYPES
|
51
51
|
if is_param:
|
52
52
|
allowed_dtypes.add("dict")
|
53
|
+
|
54
|
+
# Handle list[...] types
|
55
|
+
if dtype_str.startswith("list[") and dtype_str.endswith("]"):
|
56
|
+
inner_dtype_str = dtype_str[5:-1] # Remove "list[" and "]"
|
57
|
+
# Recursively parse the inner type
|
58
|
+
inner_result = parse_dtype(inner_dtype_str, is_param)
|
59
|
+
# Add "list": True to each component
|
60
|
+
for component in inner_result:
|
61
|
+
if isinstance(component, dict):
|
62
|
+
component["list"] = True # type: ignore
|
63
|
+
return inner_result
|
64
|
+
|
53
65
|
is_composed_cat = dtype_str.startswith("cat[") and dtype_str.endswith("]")
|
54
66
|
result = []
|
55
67
|
if is_composed_cat:
|
@@ -71,7 +83,7 @@ def parse_dtype(dtype_str: str, is_param: bool = False) -> list[dict[str, str]]:
|
|
71
83
|
|
72
84
|
def parse_cat_dtype(
|
73
85
|
dtype_str: str,
|
74
|
-
related_registries: dict[str,
|
86
|
+
related_registries: dict[str, SQLRecord] | None = None,
|
75
87
|
is_itype: bool = False,
|
76
88
|
) -> dict[str, Any]:
|
77
89
|
"""Parses a categorical dtype string into its components (registry, field, subtypes)."""
|
@@ -119,8 +131,17 @@ def parse_cat_dtype(
|
|
119
131
|
if "." in registry_str:
|
120
132
|
registry_str_split = registry_str.split(".")
|
121
133
|
assert len(registry_str_split) == 2, registry_str # noqa: S101
|
122
|
-
|
123
|
-
module_name = get_schema_module_name(
|
134
|
+
module_name_attempt, class_name = registry_str_split
|
135
|
+
module_name = get_schema_module_name(
|
136
|
+
module_name_attempt, raise_import_error=False
|
137
|
+
)
|
138
|
+
if module_name is None:
|
139
|
+
raise ImportError(
|
140
|
+
f"Can not parse dtype {dtype_str} because {module_name_attempt} "
|
141
|
+
f"was not found.\nInstall the module with `pip install {module_name_attempt}`\n"
|
142
|
+
"and also add the module to this instance via instance settings page "
|
143
|
+
"under 'schema modules'."
|
144
|
+
)
|
124
145
|
else:
|
125
146
|
module_name, class_name = "lamindb", registry_str
|
126
147
|
module = importlib.import_module(module_name)
|
@@ -143,12 +164,30 @@ def parse_cat_dtype(
|
|
143
164
|
|
144
165
|
|
145
166
|
def serialize_dtype(
|
146
|
-
dtype: Registry
|
167
|
+
dtype: Registry
|
168
|
+
| SQLRecord
|
169
|
+
| FieldAttr
|
170
|
+
| list[SQLRecord]
|
171
|
+
| list[Registry]
|
172
|
+
| list[str]
|
173
|
+
| list[float]
|
174
|
+
| str
|
175
|
+
| type,
|
147
176
|
is_itype: bool = False,
|
148
177
|
) -> str:
|
149
178
|
"""Converts a data type object into its string representation."""
|
179
|
+
from .record import Record
|
150
180
|
from .ulabel import ULabel
|
151
181
|
|
182
|
+
# Handle generic types like list[str], list[Registry], etc.
|
183
|
+
if hasattr(dtype, "__origin__") and dtype.__origin__ is list:
|
184
|
+
# Get the inner type from list[T]
|
185
|
+
inner_type = dtype.__args__[0] if dtype.__args__ else None # type: ignore
|
186
|
+
if inner_type is not None:
|
187
|
+
# Recursively serialize the inner type
|
188
|
+
inner_dtype_str = serialize_dtype(inner_type, is_itype=is_itype)
|
189
|
+
return f"list[{inner_dtype_str}]"
|
190
|
+
|
152
191
|
if (
|
153
192
|
not isinstance(dtype, list)
|
154
193
|
and hasattr(dtype, "__name__")
|
@@ -167,21 +206,24 @@ def serialize_dtype(
|
|
167
206
|
dtype_str = serialize_pandas_dtype(dtype)
|
168
207
|
else:
|
169
208
|
error_message = "dtype has to be a registry, a ulabel subtype, a registry field, or a list of registries or fields, not {}"
|
170
|
-
if isinstance(dtype, (Registry, DeferredAttribute, ULabel)):
|
209
|
+
if isinstance(dtype, (Registry, DeferredAttribute, ULabel, Record)):
|
171
210
|
dtype = [dtype]
|
172
211
|
elif not isinstance(dtype, list):
|
173
212
|
raise ValueError(error_message.format(dtype))
|
174
213
|
dtype_str = ""
|
175
214
|
for one_dtype in dtype:
|
176
|
-
if not isinstance(one_dtype, (Registry, DeferredAttribute, ULabel)):
|
215
|
+
if not isinstance(one_dtype, (Registry, DeferredAttribute, ULabel, Record)):
|
177
216
|
raise ValueError(error_message.format(one_dtype))
|
178
217
|
if isinstance(one_dtype, Registry):
|
179
218
|
dtype_str += one_dtype.__get_name_with_module__() + "|"
|
180
|
-
elif isinstance(one_dtype, ULabel):
|
219
|
+
elif isinstance(one_dtype, (ULabel, Record)):
|
181
220
|
assert one_dtype.is_type, ( # noqa: S101
|
182
221
|
f"ulabel has to be a type if acting as dtype, {one_dtype} has `is_type` False"
|
183
222
|
)
|
184
|
-
|
223
|
+
if isinstance(one_dtype, ULabel):
|
224
|
+
dtype_str += f"ULabel[{one_dtype.name}]"
|
225
|
+
else:
|
226
|
+
dtype_str += f"Record[{one_dtype.name}]"
|
185
227
|
else:
|
186
228
|
name = one_dtype.field.name
|
187
229
|
field_ext = f".{name}" if name != "name" else ""
|
@@ -247,10 +289,10 @@ def process_init_feature_param(args, kwargs, is_param: bool = False):
|
|
247
289
|
return kwargs
|
248
290
|
|
249
291
|
|
250
|
-
class Feature(
|
251
|
-
"""
|
292
|
+
class Feature(SQLRecord, CanCurate, TracksRun, TracksUpdates):
|
293
|
+
"""Variables, such as dataframe columns or run parameters.
|
252
294
|
|
253
|
-
A feature represents a dimension of a dataset, such as a column in a
|
295
|
+
A feature often represents a dimension of a dataset, such as a column in a
|
254
296
|
`DataFrame`. The `Feature` registry organizes metadata of features.
|
255
297
|
|
256
298
|
The `Feature` registry helps you organize and query datasets based on their
|
@@ -317,6 +359,13 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
|
|
317
359
|
... dtype=[ln.ULabel, bt.CellType],
|
318
360
|
... ).save()
|
319
361
|
|
362
|
+
A multivalue feature with a list of cell types.
|
363
|
+
|
364
|
+
>>> ln.Feature(
|
365
|
+
... name="cell_types",
|
366
|
+
... dtype=list[bt.CellType], # or list[str] for a list of strings
|
367
|
+
... ).save()
|
368
|
+
|
320
369
|
Hint:
|
321
370
|
|
322
371
|
*Features* and *labels* denote two ways of using entities to organize data:
|
@@ -337,7 +386,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
|
|
337
386
|
|
338
387
|
"""
|
339
388
|
|
340
|
-
class Meta(
|
389
|
+
class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):
|
341
390
|
abstract = False
|
342
391
|
|
343
392
|
_name_field: str = "name"
|
@@ -353,19 +402,19 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
|
|
353
402
|
editable=False, unique=True, db_index=True, max_length=12, default=base62_12
|
354
403
|
)
|
355
404
|
"""Universal id, valid across DB instances."""
|
356
|
-
name: str = CharField(max_length=150, db_index=True
|
357
|
-
"""Name of feature
|
405
|
+
name: str = CharField(max_length=150, db_index=True)
|
406
|
+
"""Name of feature."""
|
358
407
|
dtype: Dtype | None = CharField(db_index=True, null=True)
|
359
408
|
"""Data type (:class:`~lamindb.base.types.Dtype`)."""
|
360
409
|
type: Feature | None = ForeignKey(
|
361
|
-
"self", PROTECT, null=True, related_name="
|
410
|
+
"self", PROTECT, null=True, related_name="features"
|
362
411
|
)
|
363
412
|
"""Type of feature (e.g., 'Readout', 'Metric', 'Metadata', 'ExpertAnnotation', 'ModelPrediction').
|
364
413
|
|
365
414
|
Allows to group features by type, e.g., all read outs, all metrics, etc.
|
366
415
|
"""
|
367
|
-
|
368
|
-
"""
|
416
|
+
features: Feature
|
417
|
+
"""Features of this type (can only be non-empty if `is_type` is `True`)."""
|
369
418
|
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
370
419
|
"""Distinguish types from instances of the type."""
|
371
420
|
unit: str | None = CharField(max_length=30, db_index=True, null=True)
|
@@ -413,10 +462,10 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
|
|
413
462
|
"Schema", through="SchemaFeature", related_name="features"
|
414
463
|
)
|
415
464
|
"""Feature sets linked to this feature."""
|
416
|
-
_expect_many: bool = models.BooleanField(default=
|
417
|
-
"""Indicates whether values for this feature are expected to occur a single or multiple times for an artifact (default `
|
465
|
+
_expect_many: bool = models.BooleanField(default=None, db_default=None, null=True)
|
466
|
+
"""Indicates whether values for this feature are expected to occur a single or multiple times for an artifact (default `None`).
|
418
467
|
|
419
|
-
- if it's `True` (default), the values come from an observation-level aggregation and a dtype of `datetime` on the observation-level
|
468
|
+
- if it's `True` (default), the values come from an observation-level aggregation and a dtype of `datetime` on the observation-level means `set[datetime]` on the artifact-level
|
420
469
|
- if it's `False` it's an artifact-level value and datetime means datetime; this is an edge case because an arbitrary artifact would always be a set of arbitrary measurements that would need to be aggregated ("one just happens to measure a single cell line in that artifact")
|
421
470
|
"""
|
422
471
|
_curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
|
@@ -484,7 +533,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
|
|
484
533
|
)
|
485
534
|
|
486
535
|
@classmethod
|
487
|
-
def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) ->
|
536
|
+
def from_df(cls, df: pd.DataFrame, field: FieldAttr | None = None) -> SQLRecordList:
|
488
537
|
"""Create Feature records for columns."""
|
489
538
|
field = Feature.name if field is None else field
|
490
539
|
registry = field.field.model # type: ignore
|
@@ -502,7 +551,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
|
|
502
551
|
Feature(name=name, dtype=dtype) for name, dtype in dtypes.items()
|
503
552
|
] # type: ignore
|
504
553
|
assert len(features) == len(df.columns) # noqa: S101
|
505
|
-
return
|
554
|
+
return SQLRecordList(features)
|
506
555
|
|
507
556
|
def save(self, *args, **kwargs) -> Feature:
|
508
557
|
"""Save."""
|
@@ -606,7 +655,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
|
|
606
655
|
# return "Artifact"
|
607
656
|
|
608
657
|
|
609
|
-
class FeatureValue(
|
658
|
+
class FeatureValue(SQLRecord, TracksRun):
|
610
659
|
"""Non-categorical features values.
|
611
660
|
|
612
661
|
Categorical feature values are stored in their respective registries:
|
@@ -634,44 +683,23 @@ class FeatureValue(Record, TracksRun):
|
|
634
683
|
hash: str = CharField(max_length=HASH_LENGTH, null=True, db_index=True)
|
635
684
|
"""Value hash."""
|
636
685
|
|
637
|
-
class Meta(
|
638
|
-
|
639
|
-
# For simple types, use direct value comparison
|
640
|
-
models.UniqueConstraint(
|
641
|
-
fields=["feature", "value"],
|
642
|
-
name="unique_simple_feature_value",
|
643
|
-
condition=Q(hash__isnull=True),
|
644
|
-
),
|
645
|
-
# For complex types (dictionaries), use hash
|
646
|
-
models.UniqueConstraint(
|
647
|
-
fields=["feature", "hash"],
|
648
|
-
name="unique_complex_feature_value",
|
649
|
-
condition=Q(hash__isnull=False),
|
650
|
-
),
|
651
|
-
]
|
686
|
+
class Meta(BaseSQLRecord.Meta, TracksRun.Meta):
|
687
|
+
unique_together = ("feature", "hash")
|
652
688
|
|
653
689
|
@classmethod
|
654
690
|
def get_or_create(cls, feature, value):
|
655
|
-
#
|
656
|
-
if isinstance(value,
|
657
|
-
|
658
|
-
return (
|
659
|
-
cls.objects.create(feature=feature, value=value, hash=None),
|
660
|
-
False,
|
661
|
-
)
|
662
|
-
except IntegrityError:
|
663
|
-
return cls.objects.get(feature=feature, value=value), True
|
664
|
-
|
665
|
-
# Complex types: dict, list
|
691
|
+
# simple values: (int, float, str, bool, datetime)
|
692
|
+
if not isinstance(value, dict):
|
693
|
+
hash = hash_string(str(value))
|
666
694
|
else:
|
667
695
|
hash = hash_dict(value)
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
696
|
+
try:
|
697
|
+
return (
|
698
|
+
cls.objects.create(feature=feature, value=value, hash=hash),
|
699
|
+
False,
|
700
|
+
)
|
701
|
+
except IntegrityError:
|
702
|
+
return cls.objects.get(feature=feature, hash=hash), True
|
675
703
|
|
676
704
|
|
677
705
|
def suggest_categorical_for_str_iterable(
|