lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +216 -133
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3559
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1546 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/models/__init__.py +12 -2
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +384 -301
- lamindb/models/_from_values.py +1 -1
- lamindb/models/_is_versioned.py +5 -15
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +354 -177
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/can_curate.py +4 -1
- lamindb/models/collection.py +79 -56
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +78 -47
- lamindb/models/has_parents.py +24 -9
- lamindb/models/project.py +3 -3
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +251 -206
- lamindb/models/record.py +211 -344
- lamindb/models/run.py +59 -5
- lamindb/models/save.py +9 -5
- lamindb/models/schema.py +673 -196
- lamindb/models/transform.py +5 -14
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
- lamindb-1.5.0.dist-info/RECORD +108 -0
- lamindb-1.3.2.dist-info/RECORD +0 -95
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
lamindb/models/artifact_set.py
ADDED
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING, Literal
+
+from lamin_utils import logger
+from lamindb_setup.core._docs import doc_args
+
+from ..core._mapped_collection import MappedCollection
+from ..core.storage._backed_access import _open_dataframe
+from .artifact import Artifact, _track_run_input
+from .collection import Collection, _load_concat_artifacts
+
+if TYPE_CHECKING:
+    from anndata import AnnData
+    from pandas import DataFrame
+    from polars import LazyFrame as PolarsLazyFrame
+    from pyarrow.dataset import Dataset as PyArrowDataset
+    from upath import UPath
+
+
+UNORDERED_WARNING = (
+    "this query set is unordered, consider using `.order_by()` first "
+    "to avoid opening the artifacts in an arbitrary order"
+)
+
+
+class ArtifactSet(Iterable):
+    """Abstract class representing sets of artifacts returned by queries.
+
+    This class automatically extends :class:`~lamindb.models.BasicQuerySet`
+    and :class:`~lamindb.models.QuerySet` when the base model is :class:`~lamindb.Artifact`.
+
+    Examples:
+
+        >>> artifacts = ln.Artifact.filter(otype="AnnData")
+        >>> artifacts  # an instance of ArtifactQuerySet inheriting from ArtifactSet
+    """
+
+    @doc_args(Collection.load.__doc__)
+    def load(
+        self,
+        join: Literal["inner", "outer"] = "outer",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> DataFrame | AnnData:
+        """{}"""  # noqa: D415
+        if not self.ordered:  # type: ignore
+            logger.warning(UNORDERED_WARNING)
+
+        artifacts: list[Artifact] = list(self)
+        concat_object = _load_concat_artifacts(artifacts, join, **kwargs)
+        # track only if successful
+        _track_run_input(artifacts, is_run_input)
+        return concat_object
+
+    @doc_args(Collection.open.__doc__)
+    def open(
+        self,
+        engine: Literal["pyarrow", "polars"] = "pyarrow",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+        """{}"""  # noqa: D415
+        if not self.ordered:  # type: ignore
+            logger.warning(UNORDERED_WARNING)
+
+        artifacts: list[Artifact] = list(self)
+        paths: list[UPath] = [artifact.path for artifact in artifacts]
+
+        dataframe = _open_dataframe(paths, engine=engine, **kwargs)
+        # track only if successful
+        _track_run_input(artifacts, is_run_input)
+        return dataframe
+
+    @doc_args(Collection.mapped.__doc__)
+    def mapped(
+        self,
+        layers_keys: str | list[str] | None = None,
+        obs_keys: str | list[str] | None = None,
+        obsm_keys: str | list[str] | None = None,
+        obs_filter: dict[str, str | list[str]] | None = None,
+        join: Literal["inner", "outer"] | None = "inner",
+        encode_labels: bool | list[str] = True,
+        unknown_label: str | dict[str, str] | None = None,
+        cache_categories: bool = True,
+        parallel: bool = False,
+        dtype: str | None = None,
+        stream: bool = False,
+        is_run_input: bool | None = None,
+    ) -> MappedCollection:
+        """{}"""  # noqa: D415
+        if not self.ordered:  # type: ignore
+            logger.warning(UNORDERED_WARNING)
+
+        artifacts: list[Artifact] = []
+        paths: list[UPath] = []
+        for artifact in self:
+            if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
+                logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
+                continue
+            elif not stream:
+                paths.append(artifact.cache())
+            else:
+                paths.append(artifact.path)
+            artifacts.append(artifact)
+        ds = MappedCollection(
+            paths,
+            layers_keys,
+            obs_keys,
+            obsm_keys,
+            obs_filter,
+            join,
+            encode_labels,
+            unknown_label,
+            cache_categories,
+            parallel,
+            dtype,
+        )
+        # track only if successful
+        _track_run_input(artifacts, is_run_input)
+        return ds
lamindb/models/can_curate.py
CHANGED
@@ -57,6 +57,7 @@ def _inspect(
     mute: bool = False,
     organism: str | Record | None = None,
     source: Record | None = None,
+    from_source: bool = True,
     strict_source: bool = False,
 ) -> pd.DataFrame | dict[str, list[str]]:
     """{}"""  # noqa: D415
@@ -94,7 +95,7 @@ def _inspect(
     )
     nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
 
-    if len(nonval) > 0 and hasattr(registry, "source_id"):
+    if from_source and len(nonval) > 0 and hasattr(registry, "source_id"):
         try:
             public_result = registry.public(
                 organism=organism_record, source=source
@@ -463,6 +464,7 @@ class CanCurate:
         mute: bool = False,
         organism: Union[str, Record, None] = None,
         source: Record | None = None,
+        from_source: bool = True,
         strict_source: bool = False,
     ) -> InspectResult:
         """Inspect if values are mappable to a field.
@@ -506,6 +508,7 @@ class CanCurate:
             strict_source=strict_source,
             organism=organism,
             source=source,
+            from_source=from_source,
         )
 
     @classmethod
lamindb/models/collection.py
CHANGED
@@ -24,7 +24,7 @@ from lamindb.base.fields import (
 
 from ..base.ids import base62_20
 from ..core._mapped_collection import MappedCollection
-from ..core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
+from ..core.storage._backed_access import _open_dataframe
 from ..errors import FieldValidationError
 from ..models._is_versioned import process_revises
 from ._is_versioned import IsVersioned
@@ -48,8 +48,9 @@ from .record import (
 from .run import Run, TracksRun, TracksUpdates
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Iterable, Iterator
 
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
 
     from ..core.storage import UPath
@@ -94,6 +95,39 @@ if TYPE_CHECKING:
 # return feature_sets_union
 
 
+def _load_concat_artifacts(
+    artifacts: list[Artifact], join: Literal["inner", "outer"] = "outer", **kwargs
+) -> pd.DataFrame | ad.AnnData:
+    suffixes = {artifact.suffix for artifact in artifacts}
+    # Why is that? - Sergei
+    if len(suffixes) != 1:
+        raise ValueError(
+            "Can only load collections where all artifacts have the same suffix"
+        )
+
+    # because we're tracking data flow on the collection-level, here, we don't
+    # want to track it on the artifact-level
+    first_object = artifacts[0].load(is_run_input=False)
+    is_dataframe = isinstance(first_object, pd.DataFrame)
+    is_anndata = isinstance(first_object, ad.AnnData)
+    if not is_dataframe and not is_anndata:
+        raise ValueError(f"Unable to concatenate {suffixes.pop()} objects.")
+
+    objects = [first_object]
+    artifact_uids = [artifacts[0].uid]
+    for artifact in artifacts[1:]:
+        objects.append(artifact.load(is_run_input=False))
+        artifact_uids.append(artifact.uid)
+
+    if is_dataframe:
+        concat_object = pd.concat(objects, join=join, **kwargs)
+    elif is_anndata:
+        label = kwargs.pop("label", "artifact_uid")
+        keys = kwargs.pop("keys", artifact_uids)
+        concat_object = ad.concat(objects, join=join, label=label, keys=keys, **kwargs)
+    return concat_object
+
+
 class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
     """Collections of artifacts.
 
@@ -325,11 +359,13 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             artifact: An artifact to add to the collection.
             run: The run that creates the new version of the collection.
 
-        Examples
+        Examples:
+
+            ::
 
-
-
-
+                collection_v1 = ln.Collection(artifact, key="My collection").save()
+                collection_v2 = collection.append(another_artifact)  # returns a new version of the collection
+                collection_v2.save()  # save the new version
 
         """
         return Collection(  # type: ignore
@@ -340,13 +376,25 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             run=run,
         )
 
-    def open(
-
+    def open(
+        self,
+        engine: Literal["pyarrow", "polars"] = "pyarrow",
+        is_run_input: bool | None = None,
+        **kwargs,
+    ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+        """Open a dataset for streaming.
+
+        Works for `pyarrow` and `polars` compatible formats
+        (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
 
-
+        Args:
+            engine: Which module to use for lazy loading of a dataframe
+                from `pyarrow` or `polars` compatible formats.
+            is_run_input: Whether to track this artifact as run input.
+            **kwargs: Keyword arguments for `pyarrow.dataset.dataset` or `polars.scan_*` functions.
 
         Notes:
-            For more info, see
+            For more info, see guide: :doc:`/arrays`.
         """
         if self._state.adding:
             artifacts = self._artifacts
@@ -354,31 +402,12 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         else:
            artifacts = self.ordered_artifacts.all()
         paths = [artifact.path for artifact in artifacts]
-
-
-        fs = paths[0].fs
-        for path in paths[1:]:
-            # this assumes that the filesystems are cached by fsspec
-            if path.fs is not fs:
-                raise ValueError(
-                    "The collection has artifacts with different filesystems, this is not supported."
-                )
-        if not _is_pyarrow_dataset(paths):
-            suffixes = {path.suffix for path in paths}
-            suffixes_str = ", ".join(suffixes)
-            err_msg = (
-                "This collection is not compatible with pyarrow.dataset.dataset(), "
-            )
-            err_msg += (
-                f"the artifacts have incompatible file types: {suffixes_str}"
-                if len(suffixes) > 1
-                else f"the file type {suffixes_str} is not supported by pyarrow."
-            )
-            raise ValueError(err_msg)
-        dataset = _open_pyarrow_dataset(paths)
+
+        dataframe = _open_dataframe(paths, engine=engine, **kwargs)
         # track only if successful
+        # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
-        return dataset
+        return dataframe
 
     def mapped(
         self,
@@ -401,8 +430,8 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
         virtually concatenating `AnnData` arrays.
 
-
-        cache first
+        By default (`stream=False`) `AnnData` arrays are moved into a local
+        cache first.
 
         `__getitem__` of the `MappedCollection` object takes a single integer index
         and returns a dictionary with the observation data sample for this index from
@@ -414,7 +443,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
 
         For a guide, see :doc:`docs:scrna-mappedcollection`.
 
-        This method currently only works for collections of `AnnData` artifacts.
+        This method currently only works for collections or query sets of `AnnData` artifacts.
 
         Args:
             layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
@@ -443,6 +472,11 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         >>> ds = ln.Collection.get(description="my collection")
         >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
         >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
+        >>> # also works for query sets of artifacts, '...' represents some filtering condition
+        >>> # additional filtering on artifacts of the collection
+        >>> mapped = collection.artifacts.all().filter(...).order_by("-created_at").mapped()
+        >>> # or directly from a query set of artifacts
+        >>> mapped = ln.Artifact.filter(..., otype="AnnData").order_by("-created_at").mapped()
         """
         path_list = []
         if self._state.adding:
@@ -472,6 +506,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             dtype,
         )
         # track only if successful
+        # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return ds
 
@@ -488,6 +523,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         path_list = []
         for artifact in self.ordered_artifacts.all():
             path_list.append(artifact.cache())
+        # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return path_list
 
@@ -496,29 +532,16 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         join: Literal["inner", "outer"] = "outer",
         is_run_input: bool | None = None,
         **kwargs,
-    ) ->
-        """
+    ) -> pd.DataFrame | ad.AnnData:
+        """Cache and load to memory.
 
-        Returns in-memory
+        Returns an in-memory concatenated `DataFrame` or `AnnData` object.
         """
         # cannot call _track_run_input here, see comment further down
-        all_artifacts = self.ordered_artifacts.all()
-        suffixes = {artifact.suffix for artifact in all_artifacts}
-        if len(suffixes) != 1:
-            raise ValueError(
-                "Can only load collections where all artifacts have the same suffix"
-            )
-        # because we're tracking data flow on the collection-level, here, we don't
-        # want to track it on the artifact-level
-        objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
-        artifact_uids = [artifact.uid for artifact in all_artifacts]
-        if isinstance(objects[0], pd.DataFrame):
-            concat_object = pd.concat(objects, join=join)
-        elif isinstance(objects[0], ad.AnnData):
-            concat_object = ad.concat(
-                objects, join=join, label="artifact_uid", keys=artifact_uids
-            )
-        # only call it here because there might be errors during concat
+        artifacts = self.ordered_artifacts.all()
+        concat_object = _load_concat_artifacts(artifacts, join, **kwargs)
+        # only call it here because there might be errors during load or concat
+        # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return concat_object
 
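With `_load_concat_artifacts` factored out and `open` delegating to `_open_dataframe`, `Collection` and artifact query sets now share one loading path. A sketch of the reworked `Collection` API (the collection key is hypothetical):

    import lamindb as ln

    collection = ln.Collection.get(key="my-tables")  # hypothetical key

    # load() concatenates into memory; extra kwargs now pass through to
    # pd.concat / ad.concat, e.g. to override the AnnData label column
    df = collection.load(join="inner")

    # open() streams: pyarrow (default) returns a pyarrow.dataset.Dataset,
    # polars yields lazy frames per the new return annotation
    ds = collection.open()  # engine="pyarrow"
    table = ds.to_table()
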
lamindb/models/core.py
CHANGED
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
 
 
 class Storage(Record, TracksRun, TracksUpdates):
-    """Storage locations.
+    """Storage locations of artifacts such as S3 buckets or local directories.
 
     A storage location is either a directory/folder (local or in the cloud) or
     an entire S3/GCP bucket.
lamindb/models/feature.py
CHANGED
@@ -143,40 +143,50 @@ def parse_cat_dtype(
 
 
 def serialize_dtype(
-    dtype: Record | FieldAttr | list[Record]
+    dtype: Registry | Record | FieldAttr | list[Record] | list[Registry] | str,
+    is_itype: bool = False,
 ) -> str:
     """Converts a data type object into its string representation."""
+    from .ulabel import ULabel
+
     if (
         not isinstance(dtype, list)
         and hasattr(dtype, "__name__")
         and dtype.__name__ in FEATURE_DTYPES
     ):
         dtype_str = dtype.__name__
+    elif dtype is dict:
+        dtype_str = "dict"
+    elif is_itype and isinstance(dtype, str):
+        if dtype not in "Feature":
+            parse_cat_dtype(
+                dtype_str=dtype, is_itype=True
+            )  # throws an error if invalid
+        dtype_str = dtype
     elif isinstance(dtype, (ExtensionDtype, np.dtype)):
         dtype_str = serialize_pandas_dtype(dtype)
     else:
-        error_message = (
-
-        )
-        if isinstance(dtype, Registry):
-            dtype = [dtype]
-        elif isinstance(dtype, DeferredAttribute):
+        error_message = "dtype has to be a registry, a ulabel subtype, a registry field, or a list of registries or fields, not {}"
+        if isinstance(dtype, (Registry, DeferredAttribute, ULabel)):
             dtype = [dtype]
         elif not isinstance(dtype, list):
             raise ValueError(error_message.format(dtype))
         dtype_str = ""
-        for single_dtype in dtype:
-            if not isinstance(
-
-            ):
-
-
-
+        for one_dtype in dtype:
+            if not isinstance(one_dtype, (Registry, DeferredAttribute, ULabel)):
+                raise ValueError(error_message.format(one_dtype))
+            if isinstance(one_dtype, Registry):
+                dtype_str += one_dtype.__get_name_with_module__() + "|"
+            elif isinstance(one_dtype, ULabel):
+                assert one_dtype.is_type, (  # noqa: S101
+                    f"ulabel has to be a type if acting as dtype, {one_dtype} has `is_type` False"
+                )
+                dtype_str += f"ULabel[{one_dtype.name}]"
             else:
+                name = one_dtype.field.name
+                field_ext = f".{name}" if name != "name" else ""
                 dtype_str += (
-                    single_dtype.field.model.__get_name_with_module__()
-                    + f".{single_dtype.field.name}"
-                    + "|"
+                    one_dtype.field.model.__get_name_with_module__() + field_ext + "|"
                 )
     dtype_str = dtype_str.rstrip("|")
     if not is_itype:
@@ -332,7 +342,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
 
     _name_field: str = "name"
     _aux_fields: dict[str, tuple[str, type]] = {
-        "0": ("default_value",
+        "0": ("default_value", Any),  # type: ignore
         "1": ("nullable", bool),
         "2": ("coerce_dtype", bool),
     }
@@ -499,24 +509,11 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
         super().save(*args, **kwargs)
         return self
 
-    @property
-    def coerce_dtype(self) -> bool:
-        """Whether dtypes should be coerced during validation.
-
-        For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
-        """
-        if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]:  # type: ignore
-            return self._aux["af"]["2"]  # type: ignore
-        else:
-            return False
-
-    @coerce_dtype.setter
-    def coerce_dtype(self, value: bool) -> None:
-        if self._aux is None:  # type: ignore
-            self._aux = {}  # type: ignore
-        if "af" not in self._aux:
-            self._aux["af"] = {}
-        self._aux["af"]["2"] = value
+    def with_config(self, optional: bool | None = None) -> tuple[Feature, dict]:
+        """Pass addtional configurations to the schema."""
+        if optional is not None:
+            return self, {"optional": optional}
+        return self, {}
 
     @property
     def default_value(self) -> Any:
@@ -532,12 +529,9 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
         return None
 
     @default_value.setter
-    def default_value(self, value:
-        if self._aux is None:  # type: ignore
-            self._aux = {}  # type: ignore
-        if "af" not in self._aux:
-            self._aux["af"] = {}
-        self._aux["af"]["0"] = value
+    def default_value(self, value: str | None) -> None:
+        self._aux = self._aux or {}
+        self._aux.setdefault("af", {})["0"] = value
 
     @property
     def nullable(self) -> bool:
@@ -568,11 +562,48 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
     @nullable.setter
     def nullable(self, value: bool) -> None:
         assert isinstance(value, bool), value  # noqa: S101
-        if self._aux is None:  # type: ignore
-            self._aux = {}  # type: ignore
-        if "af" not in self._aux:
-            self._aux["af"] = {}
-        self._aux["af"]["1"] = value
+        self._aux = self._aux or {}
+        self._aux.setdefault("af", {})["1"] = value
+
+    @property
+    def coerce_dtype(self) -> bool:
+        """Whether dtypes should be coerced during validation.
+
+        For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
+        """
+        if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]:  # type: ignore
+            return self._aux["af"]["2"]  # type: ignore
+        else:
+            return False
+
+    @coerce_dtype.setter
+    def coerce_dtype(self, value: bool) -> None:
+        self._aux = self._aux or {}
+        self._aux.setdefault("af", {})["2"] = value
+
+    # we'll enable this later
+    # @property
+    # def observational_unit(self) -> Literal["Artifact", "Observation"]:
+    #     """Default observational unit on which the feature is measured.
+
+    #     Currently, we only make a distinction between artifact-level and observation-level features.
+
+    #     For example, a feature `"ml_split"` that stores `"test"` & `"train"` labels is typically defined on the artifact level.
+    #     When accessing `artifact.features.get_values(["ml_split"])`, you expect a single value, either `"test"` or `"train"`.
+
+    #     However, when accessing an artifact annotation with a feature that's defined on the observation-level, say `"cell_type"`, you expect a set of values. So,
+    #     `artifact.features.get_values(["cell_type_from_expert"])` should return a set: `{"T cell", "B cell"}`.
+
+    #     The value of `observational_unit` is currently auto-managed: if using `artifact.featueres.add_values()`,
+    #     it will be set to `Artifact`. In a curator, the value depends on whether it's an artifact- or observation-level slot
+    #     (e.g. `.uns` is artifact-level in `AnnData` whereas `.obs` is observation-level).
+
+    #     Note: This attribute might in the future be used to distinguish different types of observational units (e.g. single cells vs. physical samples vs. study subjects etc.).
+    #     """
+    #     if self._expect_many:
+    #         return "Observation"  # this here might be replaced with the specific observational unit
+    #     else:
+    #         return "Artifact"
 
 
 class FeatureValue(Record, TracksRun):
lamindb/models/has_parents.py
CHANGED
@@ -4,12 +4,15 @@ from __future__ import annotations
 import builtins
 from typing import TYPE_CHECKING, Literal
 
+import lamindb_setup as ln_setup
 from lamin_utils import logger
 
 from .record import format_field_value, get_name_field
 from .run import Run
 
 if TYPE_CHECKING:
+    from graphviz import Digraph
+
     from lamindb.base.types import StrField
 
     from .artifact import Artifact
@@ -78,7 +81,7 @@ class HasParents:
         if not isinstance(field, str):
             field = field.field.name
 
-        return _view_parents(
+        return view_parents(
             record=self,  # type: ignore
             field=field,
             with_children=with_children,
@@ -101,7 +104,7 @@ def _transform_emoji(transform: Transform):
         return TRANSFORM_EMOJIS["pipeline"]
 
 
-def _view(u):
+def view_digraph(u: Digraph):
     from graphviz.backend import ExecutableNotFound
 
     try:
@@ -117,7 +120,7 @@ def _view(u):
             # call to display()
             display(u._repr_mimebundle_(), raw=True)
         else:
-            return u
+            return u.view()
     except (FileNotFoundError, RuntimeError, ExecutableNotFound):  # pragma: no cover
         logger.error(
             "please install the graphviz executable on your system:\n - Ubuntu: `sudo"
@@ -126,7 +129,9 @@ def _view(u):
     )
 
 
-def view_lineage(
+def view_lineage(
+    data: Artifact | Collection, with_children: bool = True, return_graph: bool = False
+) -> Digraph | None:
     """Graph of data flow.
 
     Notes:
@@ -136,6 +141,13 @@ def view_lineage(data: Artifact | Collection, with_children: bool = True) -> Non
     >>> collection.view_lineage()
     >>> artifact.view_lineage()
     """
+    if ln_setup.settings.instance.is_on_hub:
+        instance_slug = ln_setup.settings.instance.slug
+        entity_slug = data.__class__.__name__.lower()
+        logger.important(
+            f"explore at: https://lamin.ai/{instance_slug}/{entity_slug}/{data.uid}"
+        )
+
     import graphviz
 
     df_values = _get_all_parent_runs(data)
@@ -189,10 +201,13 @@ def view_lineage(
         shape="box",
     )
 
-    _view(u)
+    if return_graph:
+        return u
+    else:
+        return view_digraph(u)
 
 
-def _view_parents(
+def view_parents(
     record: Record,
     field: str,
     with_children: bool = False,
@@ -258,7 +273,7 @@ def _view_parents(
         u.node(row["target"], label=row["target_label"])
         u.edge(row["source"], row["target"], color="dimgrey")
 
-    _view(u)
+    view_digraph(u)
 
 
 def _get_parents(
@@ -505,14 +520,14 @@ def _get_all_child_runs(data: Artifact | Collection) -> list:
         run_inputs_outputs += [(r, outputs_run)]
 
         child_runs.update(
-            Run.filter(
+            Run.filter(  # type: ignore
                 **{f"input_{name}s__uid__in": [i.uid for i in outputs_run]}
             ).list()
         )
         # for artifacts, also include collections in the lineage
        if name == "artifact":
            child_runs.update(
-                Run.filter(
+                Run.filter(  # type: ignore
                    input_collections__uid__in=[i.uid for i in outputs_run]
                ).list()
            )