lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +203 -102
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/_tiledbsoma.py +29 -13
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3349
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1534 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +382 -287
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +177 -106
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/collection.py +73 -52
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +51 -17
- lamindb/models/has_parents.py +69 -14
- lamindb/models/project.py +1 -1
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +247 -172
- lamindb/models/record.py +65 -247
- lamindb/models/run.py +4 -4
- lamindb/models/save.py +8 -2
- lamindb/models/schema.py +456 -184
- lamindb/models/transform.py +2 -2
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
lamindb/models/artifact_set.py
ADDED
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING, Literal
+
+from lamin_utils import logger
+from lamindb_setup.core._docs import doc_args
+
+from ..core._mapped_collection import MappedCollection
+from ..core.storage._backed_access import _open_dataframe
+from .artifact import Artifact, _track_run_input
+from .collection import Collection, _load_concat_artifacts
+
+if TYPE_CHECKING:
+    from anndata import AnnData
+    from pandas import DataFrame
+    from polars import LazyFrame as PolarsLazyFrame
+    from pyarrow.dataset import Dataset as PyArrowDataset
+    from upath import UPath
+
+
+UNORDERED_WARNING = (
+    "this query set is unordered, consider using `.order_by()` first "
+    "to avoid opening the artifacts in an arbitrary order"
+)
+
+
+class ArtifactSet(Iterable):
+    """Abstract class representing sets of artifacts returned by queries.
+
+    This class automatically extends :class:`~lamindb.models.BasicQuerySet`
+    and :class:`~lamindb.models.QuerySet` when the base model is :class:`~lamindb.Artifact`.
+
+    Examples:
+
+        >>> artifacts = ln.Artifact.filter(otype="AnnData")
+        >>> artifacts  # an instance of ArtifactQuerySet inheriting from ArtifactSet
+    """
+
+    @doc_args(Collection.load.__doc__)
+    def load(
+        self,
+        join: Literal["inner", "outer"] = "outer",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> DataFrame | AnnData:
+        """{}"""  # noqa: D415
+        if not self.ordered:  # type: ignore
+            logger.warning(UNORDERED_WARNING)
+
+        artifacts: list[Artifact] = list(self)
+        concat_object = _load_concat_artifacts(artifacts, join, **kwargs)
+        # track only if successful
+        _track_run_input(artifacts, is_run_input)
+        return concat_object
+
+    @doc_args(Collection.open.__doc__)
+    def open(
+        self,
+        engine: Literal["pyarrow", "polars"] = "pyarrow",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+        """{}"""  # noqa: D415
+        if not self.ordered:  # type: ignore
+            logger.warning(UNORDERED_WARNING)
+
+        artifacts: list[Artifact] = list(self)
+        paths: list[UPath] = [artifact.path for artifact in artifacts]
+
+        dataframe = _open_dataframe(paths, engine=engine, **kwargs)
+        # track only if successful
+        _track_run_input(artifacts, is_run_input)
+        return dataframe
+
+    @doc_args(Collection.mapped.__doc__)
+    def mapped(
+        self,
+        layers_keys: str | list[str] | None = None,
+        obs_keys: str | list[str] | None = None,
+        obsm_keys: str | list[str] | None = None,
+        obs_filter: dict[str, str | list[str]] | None = None,
+        join: Literal["inner", "outer"] | None = "inner",
+        encode_labels: bool | list[str] = True,
+        unknown_label: str | dict[str, str] | None = None,
+        cache_categories: bool = True,
+        parallel: bool = False,
+        dtype: str | None = None,
+        stream: bool = False,
+        is_run_input: bool | None = None,
+    ) -> MappedCollection:
+        """{}"""  # noqa: D415
+        if not self.ordered:  # type: ignore
+            logger.warning(UNORDERED_WARNING)
+
+        artifacts: list[Artifact] = []
+        paths: list[UPath] = []
+        for artifact in self:
+            if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
+                logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
+                continue
+            elif not stream:
+                paths.append(artifact.cache())
+            else:
+                paths.append(artifact.path)
+            artifacts.append(artifact)
+        ds = MappedCollection(
+            paths,
+            layers_keys,
+            obs_keys,
+            obsm_keys,
+            obs_filter,
+            join,
+            encode_labels,
+            unknown_label,
+            cache_categories,
+            parallel,
+            dtype,
+        )
+        # track only if successful
+        _track_run_input(artifacts, is_run_input)
+        return ds
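The new `ArtifactSet` class gives artifact query sets the same data-access API that `Collection` exposes. A minimal usage sketch based on the signatures above; the filter values and ordering fields are illustrative, not taken from the diff:

```python
import lamindb as ln

# query sets of artifacts now behave like ad-hoc collections;
# ordering first avoids the UNORDERED_WARNING emitted above
artifacts = ln.Artifact.filter(otype="AnnData").order_by("-created_at")

# concatenate all matching AnnData artifacts into one in-memory object
adata = artifacts.load(join="outer")

# stream tabular artifacts instead of loading them
dataset = ln.Artifact.filter(suffix=".parquet").order_by("key").open(engine="pyarrow")

# build a PyTorch-compatible map-style dataset across the query set
mapped = artifacts.mapped(obs_keys=["cell_type"], stream=False)
```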
lamindb/models/collection.py
CHANGED
@@ -24,7 +24,7 @@ from lamindb.base.fields import (
 
 from ..base.ids import base62_20
 from ..core._mapped_collection import MappedCollection
-from ..core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
+from ..core.storage._backed_access import _open_dataframe
 from ..errors import FieldValidationError
 from ..models._is_versioned import process_revises
 from ._is_versioned import IsVersioned
@@ -48,8 +48,9 @@ from .record import (
 from .run import Run, TracksRun, TracksUpdates
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Iterator
 
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
 
     from ..core.storage import UPath
@@ -94,6 +95,39 @@ if TYPE_CHECKING:
     # return feature_sets_union
 
 
+def _load_concat_artifacts(
+    artifacts: list[Artifact], join: Literal["inner", "outer"] = "outer", **kwargs
+) -> pd.DataFrame | ad.AnnData:
+    suffixes = {artifact.suffix for artifact in artifacts}
+    # Why is that? - Sergei
+    if len(suffixes) != 1:
+        raise ValueError(
+            "Can only load collections where all artifacts have the same suffix"
+        )
+
+    # because we're tracking data flow on the collection-level, here, we don't
+    # want to track it on the artifact-level
+    first_object = artifacts[0].load(is_run_input=False)
+    is_dataframe = isinstance(first_object, pd.DataFrame)
+    is_anndata = isinstance(first_object, ad.AnnData)
+    if not is_dataframe and not is_anndata:
+        raise ValueError(f"Unable to concatenate {suffixes.pop()} objects.")
+
+    objects = [first_object]
+    artifact_uids = [artifacts[0].uid]
+    for artifact in artifacts[1:]:
+        objects.append(artifact.load(is_run_input=False))
+        artifact_uids.append(artifact.uid)
+
+    if is_dataframe:
+        concat_object = pd.concat(objects, join=join, **kwargs)
+    elif is_anndata:
+        label = kwargs.pop("label", "artifact_uid")
+        keys = kwargs.pop("keys", artifact_uids)
+        concat_object = ad.concat(objects, join=join, label=label, keys=keys, **kwargs)
+    return concat_object
+
+
 class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
     """Collections of artifacts.
 
@@ -342,13 +376,25 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             run=run,
         )
 
-    def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
-        """Open a dataset for streaming.
+    def open(
+        self,
+        engine: Literal["pyarrow", "polars"] = "pyarrow",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+        """Open a dataset for streaming.
 
-        Works for `pyarrow` compatible formats
+        Works for `pyarrow` and `polars` compatible formats
+        (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
+
+        Args:
+            engine: Which module to use for lazy loading of a dataframe
+                from `pyarrow` or `polars` compatible formats.
+            is_run_input: Whether to track this artifact as run input.
+            **kwargs: Keyword arguments for `pyarrow.dataset.dataset` or `polars.scan_*` functions.
 
         Notes:
-            For more info, see
+            For more info, see guide: :doc:`/arrays`.
         """
         if self._state.adding:
             artifacts = self._artifacts
@@ -356,31 +402,12 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         else:
            artifacts = self.ordered_artifacts.all()
         paths = [artifact.path for artifact in artifacts]
-
-
-        fs = paths[0].fs
-        for path in paths[1:]:
-            # this assumes that the filesystems are cached by fsspec
-            if path.fs is not fs:
-                raise ValueError(
-                    "The collection has artifacts with different filesystems, this is not supported."
-                )
-        if not _is_pyarrow_dataset(paths):
-            suffixes = {path.suffix for path in paths}
-            suffixes_str = ", ".join(suffixes)
-            err_msg = (
-                "This collection is not compatible with pyarrow.dataset.dataset(), "
-            )
-            err_msg += (
-                f"the artifacts have incompatible file types: {suffixes_str}"
-                if len(suffixes) > 1
-                else f"the file type {suffixes_str} is not supported by pyarrow."
-            )
-            raise ValueError(err_msg)
-        dataset = _open_pyarrow_dataset(paths)
+
+        dataframe = _open_dataframe(paths, engine=engine, **kwargs)
         # track only if successful
+        # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
-        return dataset
+        return dataframe
 
     def mapped(
         self,
@@ -403,8 +430,8 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
         virtually concatenating `AnnData` arrays.
 
-
-        cache first
+        By default (`stream=False`) `AnnData` arrays are moved into a local
+        cache first.
 
         `__getitem__` of the `MappedCollection` object takes a single integer index
         and returns a dictionary with the observation data sample for this index from
@@ -416,7 +443,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
 
         For a guide, see :doc:`docs:scrna-mappedcollection`.
 
-        This method currently only works for collections of `AnnData` artifacts.
+        This method currently only works for collections or query sets of `AnnData` artifacts.
 
         Args:
             layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
@@ -445,6 +472,11 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         >>> ds = ln.Collection.get(description="my collection")
         >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
         >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
+        >>> # also works for query sets of artifacts, '...' represents some filtering condition
+        >>> # additional filtering on artifacts of the collection
+        >>> mapped = collection.artifacts.all().filter(...).order_by("-created_at").mapped()
+        >>> # or directly from a query set of artifacts
+        >>> mapped = ln.Artifact.filter(..., otype="AnnData").order_by("-created_at").mapped()
         """
         path_list = []
         if self._state.adding:
@@ -474,6 +506,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             dtype,
         )
         # track only if successful
+        # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return ds
 
@@ -490,6 +523,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         path_list = []
         for artifact in self.ordered_artifacts.all():
             path_list.append(artifact.cache())
+        # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return path_list
 
@@ -498,29 +532,16 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         join: Literal["inner", "outer"] = "outer",
         is_run_input: bool | None = None,
         **kwargs,
-    ) ->
-        """
+    ) -> pd.DataFrame | ad.AnnData:
+        """Cache and load to memory.
 
-        Returns in-memory
+        Returns an in-memory concatenated `DataFrame` or `AnnData` object.
         """
         # cannot call _track_run_input here, see comment further down
-
-
-
-
-                "Can only load collections where all artifacts have the same suffix"
-            )
-        # because we're tracking data flow on the collection-level, here, we don't
-        # want to track it on the artifact-level
-        objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
-        artifact_uids = [artifact.uid for artifact in all_artifacts]
-        if isinstance(objects[0], pd.DataFrame):
-            concat_object = pd.concat(objects, join=join)
-        elif isinstance(objects[0], ad.AnnData):
-            concat_object = ad.concat(
-                objects, join=join, label="artifact_uid", keys=artifact_uids
-            )
-        # only call it here because there might be errors during concat
+        artifacts = self.ordered_artifacts.all()
+        concat_object = _load_concat_artifacts(artifacts, join, **kwargs)
+        # only call it here because there might be errors during load or concat
+        # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return concat_object
 
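Two behavioral notes follow from the refactor above: `load()` now routes through the shared `_load_concat_artifacts` helper, so keyword arguments reach `pd.concat`/`ad.concat`, and `open()` accepts an `engine` argument. A sketch, assuming a placeholder collection key and assuming the polars path is consumed as a context manager as in the lamindb arrays guide:

```python
import lamindb as ln

collection = ln.Collection.get(key="my-datasets")  # placeholder key

# all artifacts must share one suffix, else _load_concat_artifacts raises ValueError
adata = collection.load(join="outer")
# for AnnData, ad.concat labels each observation with its source artifact;
# the defaults label="artifact_uid" and keys=<artifact uids> can now be overridden
adata = collection.load(label="source_artifact")

# streaming now supports two engines; pyarrow returns a single dataset
ds = collection.open(engine="pyarrow")
# the polars path is annotated Iterator[PolarsLazyFrame]
with collection.open(engine="polars") as lazy_df:
    print(lazy_df.head().collect())
```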
lamindb/models/core.py
CHANGED
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
 
 
 class Storage(Record, TracksRun, TracksUpdates):
-    """Storage locations.
+    """Storage locations of artifacts such as S3 buckets or local directories.
 
     A storage location is either a directory/folder (local or in the cloud) or
     an entire S3/GCP bucket.
lamindb/models/feature.py
CHANGED
@@ -143,40 +143,50 @@ def parse_cat_dtype(
 
 
 def serialize_dtype(
-    dtype: Record | FieldAttr | list[Record],
+    dtype: Registry | Record | FieldAttr | list[Record] | list[Registry] | str,
+    is_itype: bool = False,
 ) -> str:
     """Converts a data type object into its string representation."""
+    from .ulabel import ULabel
+
     if (
         not isinstance(dtype, list)
         and hasattr(dtype, "__name__")
         and dtype.__name__ in FEATURE_DTYPES
     ):
         dtype_str = dtype.__name__
+    elif dtype is dict:
+        dtype_str = "dict"
+    elif is_itype and isinstance(dtype, str):
+        if dtype not in "Feature":
+            parse_cat_dtype(
+                dtype_str=dtype, is_itype=True
+            )  # throws an error if invalid
+        dtype_str = dtype
     elif isinstance(dtype, (ExtensionDtype, np.dtype)):
         dtype_str = serialize_pandas_dtype(dtype)
     else:
-        error_message = (
-
-        )
-        if isinstance(dtype, Registry):
-            dtype = [dtype]
-        elif isinstance(dtype, DeferredAttribute):
+        error_message = "dtype has to be a registry, a ulabel subtype, a registry field, or a list of registries or fields, not {}"
+        if isinstance(dtype, (Registry, DeferredAttribute, ULabel)):
             dtype = [dtype]
         elif not isinstance(dtype, list):
             raise ValueError(error_message.format(dtype))
         dtype_str = ""
-        for single_dtype in dtype:
-            if not isinstance(
-
-
-
-
-
+        for one_dtype in dtype:
+            if not isinstance(one_dtype, (Registry, DeferredAttribute, ULabel)):
+                raise ValueError(error_message.format(one_dtype))
+            if isinstance(one_dtype, Registry):
+                dtype_str += one_dtype.__get_name_with_module__() + "|"
+            elif isinstance(one_dtype, ULabel):
+                assert one_dtype.is_type, (  # noqa: S101
+                    f"ulabel has to be a type if acting as dtype, {one_dtype} has `is_type` False"
+                )
+                dtype_str += f"ULabel[{one_dtype.name}]"
             else:
+                name = one_dtype.field.name
+                field_ext = f".{name}" if name != "name" else ""
                 dtype_str += (
-                    single_dtype.field.model.__get_name_with_module__()
-                    + f".{single_dtype.field.name}"
-                    + "|"
+                    one_dtype.field.model.__get_name_with_module__() + field_ext + "|"
                 )
     dtype_str = dtype_str.rstrip("|")
     if not is_itype:
@@ -571,6 +581,30 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
         self._aux = self._aux or {}
         self._aux.setdefault("af", {})["2"] = value
 
+    # we'll enable this later
+    # @property
+    # def observational_unit(self) -> Literal["Artifact", "Observation"]:
+    #     """Default observational unit on which the feature is measured.
+
+    #     Currently, we only make a distinction between artifact-level and observation-level features.
+
+    #     For example, a feature `"ml_split"` that stores `"test"` & `"train"` labels is typically defined on the artifact level.
+    #     When accessing `artifact.features.get_values(["ml_split"])`, you expect a single value, either `"test"` or `"train"`.
+
+    #     However, when accessing an artifact annotation with a feature that's defined on the observation-level, say `"cell_type"`, you expect a set of values. So,
+    #     `artifact.features.get_values(["cell_type_from_expert"])` should return a set: `{"T cell", "B cell"}`.
+
+    #     The value of `observational_unit` is currently auto-managed: if using `artifact.featueres.add_values()`,
+    #     it will be set to `Artifact`. In a curator, the value depends on whether it's an artifact- or observation-level slot
+    #     (e.g. `.uns` is artifact-level in `AnnData` whereas `.obs` is observation-level).
+
+    #     Note: This attribute might in the future be used to distinguish different types of observational units (e.g. single cells vs. physical samples vs. study subjects etc.).
+    #     """
+    #     if self._expect_many:
+    #         return "Observation"  # this here might be replaced with the specific observational unit
+    #     else:
+    #         return "Artifact"
+
 
 class FeatureValue(Record, TracksRun):
     """Non-categorical features values.
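The `serialize_dtype` changes let a `ULabel` type record and plain `dict` act as feature dtypes. A sketch of the new branches; the exact final string depends on the `if not is_itype:` wrapping, whose body this hunk truncates, so the comments describe only what the visible code emits:

```python
import lamindb as ln
from lamindb.models.feature import serialize_dtype

# a ULabel type record (is_type=True) can now act as a categorical dtype;
# the new branch contributes "ULabel[Perturbation]" to the dtype string
perturbation = ln.ULabel(name="Perturbation", is_type=True).save()
serialize_dtype(perturbation)

# a plain (non-type) ULabel trips the new assert:
# "ulabel has to be a type if acting as dtype, ... has `is_type` False"
dmso = ln.ULabel(name="DMSO").save()
serialize_dtype(dmso)  # AssertionError

# `dict` and itype strings are now valid inputs
serialize_dtype(dict)  # contributes "dict"
serialize_dtype("ULabel", is_itype=True)  # validated via parse_cat_dtype
```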
lamindb/models/has_parents.py
CHANGED
@@ -84,10 +84,44 @@ class HasParents:
         return view_parents(
             record=self,  # type: ignore
             field=field,
+            with_parents=True,
             with_children=with_children,
             distance=distance,
         )
 
+    def view_children(
+        self,
+        field: StrField | None = None,
+        distance: int = 5,
+    ):
+        """View children in an ontology.
+
+        Args:
+            field: Field to display on graph
+            distance: Maximum distance still shown.
+
+        Ontological hierarchies: :class:`~lamindb.ULabel` (project & sub-project), :class:`~bionty.CellType` (cell type & subtype).
+
+        Examples:
+            >>> import bionty as bt
+            >>> bt.Tissue.from_source(name="subsegmental bronchus").save()
+            >>> record = bt.Tissue.get(name="respiratory tube")
+            >>> record.view_parents()
+            >>> tissue.view_parents(with_children=True)
+        """
+        if field is None:
+            field = get_name_field(self)
+        if not isinstance(field, str):
+            field = field.field.name
+
+        return view_parents(
+            record=self,  # type: ignore
+            field=field,
+            with_parents=False,
+            with_children=True,
+            distance=distance,
+        )
+
     def query_parents(self) -> QuerySet:
         """Query parents in an ontology."""
         return _query_relatives([self], "parents", self.__class__)  # type: ignore
@@ -210,6 +244,7 @@ def view_lineage(
 def view_parents(
     record: Record,
     field: str,
+    with_parents: bool = True,
     with_children: bool = False,
     distance: int = 100,
     attr_name: Literal["parents", "predecessors"] = "parents",
@@ -223,11 +258,12 @@ def view_parents(
    import pandas as pd
 
     df_edges = None
-    df_edges_parents = _df_edges_from_parents(
-
-
-
-
+    df_edges_parents = None
+    df_edges_children = None
+    if with_parents:
+        df_edges_parents = _df_edges_from_parents(
+            record=record, field=field, distance=distance, attr_name=attr_name
+        )
     if with_children:
         df_edges_children = _df_edges_from_parents(
             record=record,
@@ -236,13 +272,32 @@ def view_parents(
             children=True,
             attr_name=attr_name,
         )
-
-
-
-
-
-
-
+        # Rename the columns to swap source and target
+        df_edges_children = df_edges_children.rename(
+            columns={
+                "source": "temp_target",
+                "source_label": "temp_target_label",
+                "source_record": "temp_target_record",
+                "target": "source",
+                "target_label": "source_label",
+                "target_record": "source_record",
+            }
+        )
+        df_edges_children = df_edges_children.rename(
+            columns={
+                "temp_target": "target",
+                "temp_target_label": "target_label",
+                "temp_target_record": "target_record",
+            }
+        )
+    if df_edges_parents is not None and df_edges_children is not None:
+        df_edges = pd.concat([df_edges_parents, df_edges_children]).drop_duplicates()
+    elif df_edges_parents is not None:
+        df_edges = df_edges_parents
+    elif df_edges_children is not None:
+        df_edges = df_edges_children
+    else:
+        return None
 
     record_label = _record_label(record, field)
 
@@ -520,14 +575,14 @@ def _get_all_child_runs(data: Artifact | Collection) -> list:
         run_inputs_outputs += [(r, outputs_run)]
 
         child_runs.update(
-            Run.filter(
+            Run.filter(  # type: ignore
                 **{f"input_{name}s__uid__in": [i.uid for i in outputs_run]}
             ).list()
         )
         # for artifacts, also include collections in the lineage
         if name == "artifact":
            child_runs.update(
-                Run.filter(
+                Run.filter(  # type: ignore
                     input_collections__uid__in=[i.uid for i in outputs_run]
                 ).list()
            )
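With the new `with_parents` flag, `view_parents` can render either or both directions of the ontology graph, and `view_children` is a thin wrapper that turns parents off. A sketch mirroring the docstring example above (requires the `bionty` plugin; the `distance` value is illustrative):

```python
import bionty as bt

# populate an ontology record with its hierarchy, as in the docstring example
bt.Tissue.from_source(name="subsegmental bronchus").save()
record = bt.Tissue.get(name="respiratory tube")

# parents only: view_parents(with_parents=True) under the hood
record.view_parents()

# children only: delegates to view_parents(with_parents=False, with_children=True)
record.view_children(distance=2)

# both directions in one graph
record.view_parents(with_children=True)
```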
lamindb/models/project.py
CHANGED
@@ -36,7 +36,7 @@ if TYPE_CHECKING:
 
 
 class Person(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
-    """People.
+    """People such as authors of a study or collaborators in a project.
 
     This registry is distinct from `User` and exists for project management.
 