lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +216 -133
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/objects.py +6 -0
  20. lamindb/core/subsettings/__init__.py +2 -0
  21. lamindb/core/subsettings/_annotation_settings.py +11 -0
  22. lamindb/curators/__init__.py +7 -3559
  23. lamindb/curators/_legacy.py +2056 -0
  24. lamindb/curators/core.py +1546 -0
  25. lamindb/errors.py +11 -0
  26. lamindb/examples/__init__.py +27 -0
  27. lamindb/examples/schemas/__init__.py +12 -0
  28. lamindb/examples/schemas/_anndata.py +25 -0
  29. lamindb/examples/schemas/_simple.py +19 -0
  30. lamindb/integrations/_vitessce.py +8 -5
  31. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  32. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  33. lamindb/models/__init__.py +12 -2
  34. lamindb/models/_describe.py +21 -4
  35. lamindb/models/_feature_manager.py +384 -301
  36. lamindb/models/_from_values.py +1 -1
  37. lamindb/models/_is_versioned.py +5 -15
  38. lamindb/models/_label_manager.py +8 -2
  39. lamindb/models/artifact.py +354 -177
  40. lamindb/models/artifact_set.py +122 -0
  41. lamindb/models/can_curate.py +4 -1
  42. lamindb/models/collection.py +79 -56
  43. lamindb/models/core.py +1 -1
  44. lamindb/models/feature.py +78 -47
  45. lamindb/models/has_parents.py +24 -9
  46. lamindb/models/project.py +3 -3
  47. lamindb/models/query_manager.py +221 -22
  48. lamindb/models/query_set.py +251 -206
  49. lamindb/models/record.py +211 -344
  50. lamindb/models/run.py +59 -5
  51. lamindb/models/save.py +9 -5
  52. lamindb/models/schema.py +673 -196
  53. lamindb/models/transform.py +5 -14
  54. lamindb/models/ulabel.py +8 -5
  55. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
  56. lamindb-1.5.0.dist-info/RECORD +108 -0
  57. lamindb-1.3.2.dist-info/RECORD +0 -95
  58. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
  59. {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
lamindb/models/artifact_set.py ADDED
@@ -0,0 +1,122 @@
+ from __future__ import annotations
+
+ from collections.abc import Iterable, Iterator
+ from typing import TYPE_CHECKING, Literal
+
+ from lamin_utils import logger
+ from lamindb_setup.core._docs import doc_args
+
+ from ..core._mapped_collection import MappedCollection
+ from ..core.storage._backed_access import _open_dataframe
+ from .artifact import Artifact, _track_run_input
+ from .collection import Collection, _load_concat_artifacts
+
+ if TYPE_CHECKING:
+     from anndata import AnnData
+     from pandas import DataFrame
+     from polars import LazyFrame as PolarsLazyFrame
+     from pyarrow.dataset import Dataset as PyArrowDataset
+     from upath import UPath
+
+
+ UNORDERED_WARNING = (
+     "this query set is unordered, consider using `.order_by()` first "
+     "to avoid opening the artifacts in an arbitrary order"
+ )
+
+
+ class ArtifactSet(Iterable):
+     """Abstract class representing sets of artifacts returned by queries.
+
+     This class automatically extends :class:`~lamindb.models.BasicQuerySet`
+     and :class:`~lamindb.models.QuerySet` when the base model is :class:`~lamindb.Artifact`.
+
+     Examples:
+
+         >>> artifacts = ln.Artifact.filter(otype="AnnData")
+         >>> artifacts  # an instance of ArtifactQuerySet inheriting from ArtifactSet
+     """
+
+     @doc_args(Collection.load.__doc__)
+     def load(
+         self,
+         join: Literal["inner", "outer"] = "outer",
+         is_run_input: bool | None = None,
+         **kwargs,
+     ) -> DataFrame | AnnData:
+         """{}"""  # noqa: D415
+         if not self.ordered:  # type: ignore
+             logger.warning(UNORDERED_WARNING)
+
+         artifacts: list[Artifact] = list(self)
+         concat_object = _load_concat_artifacts(artifacts, join, **kwargs)
+         # track only if successful
+         _track_run_input(artifacts, is_run_input)
+         return concat_object
+
+     @doc_args(Collection.open.__doc__)
+     def open(
+         self,
+         engine: Literal["pyarrow", "polars"] = "pyarrow",
+         is_run_input: bool | None = None,
+         **kwargs,
+     ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+         """{}"""  # noqa: D415
+         if not self.ordered:  # type: ignore
+             logger.warning(UNORDERED_WARNING)
+
+         artifacts: list[Artifact] = list(self)
+         paths: list[UPath] = [artifact.path for artifact in artifacts]
+
+         dataframe = _open_dataframe(paths, engine=engine, **kwargs)
+         # track only if successful
+         _track_run_input(artifacts, is_run_input)
+         return dataframe
+
+     @doc_args(Collection.mapped.__doc__)
+     def mapped(
+         self,
+         layers_keys: str | list[str] | None = None,
+         obs_keys: str | list[str] | None = None,
+         obsm_keys: str | list[str] | None = None,
+         obs_filter: dict[str, str | list[str]] | None = None,
+         join: Literal["inner", "outer"] | None = "inner",
+         encode_labels: bool | list[str] = True,
+         unknown_label: str | dict[str, str] | None = None,
+         cache_categories: bool = True,
+         parallel: bool = False,
+         dtype: str | None = None,
+         stream: bool = False,
+         is_run_input: bool | None = None,
+     ) -> MappedCollection:
+         """{}"""  # noqa: D415
+         if not self.ordered:  # type: ignore
+             logger.warning(UNORDERED_WARNING)
+
+         artifacts: list[Artifact] = []
+         paths: list[UPath] = []
+         for artifact in self:
+             if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
+                 logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
+                 continue
+             elif not stream:
+                 paths.append(artifact.cache())
+             else:
+                 paths.append(artifact.path)
+             artifacts.append(artifact)
+         ds = MappedCollection(
+             paths,
+             layers_keys,
+             obs_keys,
+             obsm_keys,
+             obs_filter,
+             join,
+             encode_labels,
+             unknown_label,
+             cache_categories,
+             parallel,
+             dtype,
+         )
+         # track only if successful
+         _track_run_input(artifacts, is_run_input)
+         return ds
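
The new ArtifactSet methods make a query set of artifacts behave like an ad-hoc collection. A minimal usage sketch, assuming a lamindb instance that already holds AnnData and parquet artifacts (the filter values are illustrative):

    import lamindb as ln

    # order explicitly to avoid the UNORDERED_WARNING defined above
    adata_artifacts = ln.Artifact.filter(otype="AnnData").order_by("-created_at")
    adata = adata_artifacts.load()  # concatenated AnnData, keyed by artifact uid
    mapped = adata_artifacts.mapped(obs_keys=["cell_type"])  # map-style dataset, e.g. for a PyTorch DataLoader

    # .open() targets pyarrow/polars-compatible artifacts such as parquet files
    tables = ln.Artifact.filter(suffix=".parquet").order_by("key").open()
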
lamindb/models/can_curate.py CHANGED
@@ -57,6 +57,7 @@ def _inspect(
      mute: bool = False,
      organism: str | Record | None = None,
      source: Record | None = None,
+     from_source: bool = True,
      strict_source: bool = False,
  ) -> pd.DataFrame | dict[str, list[str]]:
      """{}"""  # noqa: D415
@@ -94,7 +95,7 @@ def _inspect(
      )
      nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
 
-     if len(nonval) > 0 and hasattr(registry, "source_id"):
+     if from_source and len(nonval) > 0 and hasattr(registry, "source_id"):
          try:
              public_result = registry.public(
                  organism=organism_record, source=source
@@ -463,6 +464,7 @@ class CanCurate:
          mute: bool = False,
          organism: Union[str, Record, None] = None,
          source: Record | None = None,
+         from_source: bool = True,
          strict_source: bool = False,
      ) -> InspectResult:
          """Inspect if values are mappable to a field.
@@ -506,6 +508,7 @@ class CanCurate:
              strict_source=strict_source,
              organism=organism,
              source=source,
+             from_source=from_source,
          )
 
      @classmethod
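
The new from_source flag lets inspect skip the fallback lookup against public ontology sources and only check records already registered in the instance. A sketch of the call, assuming the bionty plugin is installed and the values are illustrative:

    import bionty as bt

    # validate only against the instance's own CellType records, skipping public sources
    result = bt.CellType.inspect(["T cell", "my in-house cell"], from_source=False)
    print(result.validated, result.non_validated)
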
lamindb/models/collection.py CHANGED
@@ -24,7 +24,7 @@ from lamindb.base.fields import (
 
  from ..base.ids import base62_20
  from ..core._mapped_collection import MappedCollection
- from ..core.storage._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
+ from ..core.storage._backed_access import _open_dataframe
  from ..errors import FieldValidationError
  from ..models._is_versioned import process_revises
  from ._is_versioned import IsVersioned
@@ -48,8 +48,9 @@ from .record import (
  from .run import Run, TracksRun, TracksUpdates
 
  if TYPE_CHECKING:
-     from collections.abc import Iterable
+     from collections.abc import Iterable, Iterator
 
+     from polars import LazyFrame as PolarsLazyFrame
      from pyarrow.dataset import Dataset as PyArrowDataset
 
      from ..core.storage import UPath
@@ -94,6 +95,39 @@ if TYPE_CHECKING:
      # return feature_sets_union
 
 
+ def _load_concat_artifacts(
+     artifacts: list[Artifact], join: Literal["inner", "outer"] = "outer", **kwargs
+ ) -> pd.DataFrame | ad.AnnData:
+     suffixes = {artifact.suffix for artifact in artifacts}
+     # Why is that? - Sergei
+     if len(suffixes) != 1:
+         raise ValueError(
+             "Can only load collections where all artifacts have the same suffix"
+         )
+
+     # because we're tracking data flow on the collection-level, here, we don't
+     # want to track it on the artifact-level
+     first_object = artifacts[0].load(is_run_input=False)
+     is_dataframe = isinstance(first_object, pd.DataFrame)
+     is_anndata = isinstance(first_object, ad.AnnData)
+     if not is_dataframe and not is_anndata:
+         raise ValueError(f"Unable to concatenate {suffixes.pop()} objects.")
+
+     objects = [first_object]
+     artifact_uids = [artifacts[0].uid]
+     for artifact in artifacts[1:]:
+         objects.append(artifact.load(is_run_input=False))
+         artifact_uids.append(artifact.uid)
+
+     if is_dataframe:
+         concat_object = pd.concat(objects, join=join, **kwargs)
+     elif is_anndata:
+         label = kwargs.pop("label", "artifact_uid")
+         keys = kwargs.pop("keys", artifact_uids)
+         concat_object = ad.concat(objects, join=join, label=label, keys=keys, **kwargs)
+     return concat_object
+
+
  class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
      """Collections of artifacts.
 
@@ -325,11 +359,13 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
              artifact: An artifact to add to the collection.
              run: The run that creates the new version of the collection.
 
-         Examples::
+         Examples:
+
+             ::
 
-             collection_v1 = ln.Collection(artifact, key="My collection").save()
-             collection_v2 = collection.append(another_artifact)  # returns a new version of the collection
-             collection_v2.save()  # save the new version
+                 collection_v1 = ln.Collection(artifact, key="My collection").save()
+                 collection_v2 = collection.append(another_artifact)  # returns a new version of the collection
+                 collection_v2.save()  # save the new version
 
          """
          return Collection(  # type: ignore
@@ -340,13 +376,25 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
              run=run,
          )
 
-     def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
-         """Return a cloud-backed pyarrow Dataset.
+     def open(
+         self,
+         engine: Literal["pyarrow", "polars"] = "pyarrow",
+         is_run_input: bool | None = None,
+         **kwargs,
+     ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+         """Open a dataset for streaming.
+
+         Works for `pyarrow` and `polars` compatible formats
+         (`.parquet`, `.csv`, `.ipc` etc. files or directories with such files).
 
-         Works for `pyarrow` compatible formats.
+         Args:
+             engine: Which module to use for lazy loading of a dataframe
+                 from `pyarrow` or `polars` compatible formats.
+             is_run_input: Whether to track this artifact as run input.
+             **kwargs: Keyword arguments for `pyarrow.dataset.dataset` or `polars.scan_*` functions.
 
          Notes:
-             For more info, see tutorial: :doc:`/arrays`.
+             For more info, see guide: :doc:`/arrays`.
         """
         if self._state.adding:
             artifacts = self._artifacts
@@ -354,31 +402,12 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         else:
             artifacts = self.ordered_artifacts.all()
         paths = [artifact.path for artifact in artifacts]
-         # this checks that the filesystem is the same for all paths
-         # this is a requirement of pyarrow.dataset.dataset
-         fs = paths[0].fs
-         for path in paths[1:]:
-             # this assumes that the filesystems are cached by fsspec
-             if path.fs is not fs:
-                 raise ValueError(
-                     "The collection has artifacts with different filesystems, this is not supported."
-                 )
-         if not _is_pyarrow_dataset(paths):
-             suffixes = {path.suffix for path in paths}
-             suffixes_str = ", ".join(suffixes)
-             err_msg = (
-                 "This collection is not compatible with pyarrow.dataset.dataset(), "
-             )
-             err_msg += (
-                 f"the artifacts have incompatible file types: {suffixes_str}"
-                 if len(suffixes) > 1
-                 else f"the file type {suffixes_str} is not supported by pyarrow."
-             )
-             raise ValueError(err_msg)
-         dataset = _open_pyarrow_dataset(paths)
+
+         dataframe = _open_dataframe(paths, engine=engine, **kwargs)
         # track only if successful
+         # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
-         return dataset
+         return dataframe
 
     def mapped(
         self,
@@ -401,8 +430,8 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         <https://pytorch.org/docs/stable/data.html#map-style-datasets>`__ by
         virtually concatenating `AnnData` arrays.
 
-         If your `AnnData` collection is in the cloud, move them into a local
-         cache first via :meth:`~lamindb.Collection.cache`.
+         By default (`stream=False`) `AnnData` arrays are moved into a local
+         cache first.
 
         `__getitem__` of the `MappedCollection` object takes a single integer index
         and returns a dictionary with the observation data sample for this index from
@@ -414,7 +443,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
 
         For a guide, see :doc:`docs:scrna-mappedcollection`.
 
-         This method currently only works for collections of `AnnData` artifacts.
+         This method currently only works for collections or query sets of `AnnData` artifacts.
 
         Args:
             layers_keys: Keys from the ``.layers`` slot. ``layers_keys=None`` or ``"X"`` in the list
@@ -443,6 +472,11 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             >>> ds = ln.Collection.get(description="my collection")
             >>> mapped = collection.mapped(obs_keys=["cell_type", "batch"])
             >>> dl = DataLoader(mapped, batch_size=128, shuffle=True)
+             >>> # also works for query sets of artifacts, '...' represents some filtering condition
+             >>> # additional filtering on artifacts of the collection
+             >>> mapped = collection.artifacts.all().filter(...).order_by("-created_at").mapped()
+             >>> # or directly from a query set of artifacts
+             >>> mapped = ln.Artifact.filter(..., otype="AnnData").order_by("-created_at").mapped()
         """
         path_list = []
         if self._state.adding:
@@ -472,6 +506,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
             dtype,
         )
         # track only if successful
+         # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return ds
 
@@ -488,6 +523,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         path_list = []
         for artifact in self.ordered_artifacts.all():
             path_list.append(artifact.cache())
+         # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return path_list
 
@@ -496,29 +532,16 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         join: Literal["inner", "outer"] = "outer",
         is_run_input: bool | None = None,
         **kwargs,
-     ) -> Any:
-         """Stage and load to memory.
+     ) -> pd.DataFrame | ad.AnnData:
+         """Cache and load to memory.
 
-         Returns in-memory representation if possible such as a concatenated `DataFrame` or `AnnData` object.
+         Returns an in-memory concatenated `DataFrame` or `AnnData` object.
         """
         # cannot call _track_run_input here, see comment further down
-         all_artifacts = self.ordered_artifacts.all()
-         suffixes = [artifact.suffix for artifact in all_artifacts]
-         if len(set(suffixes)) != 1:
-             raise RuntimeError(
-                 "Can only load collections where all artifacts have the same suffix"
-             )
-         # because we're tracking data flow on the collection-level, here, we don't
-         # want to track it on the artifact-level
-         objects = [artifact.load(is_run_input=False) for artifact in all_artifacts]
-         artifact_uids = [artifact.uid for artifact in all_artifacts]
-         if isinstance(objects[0], pd.DataFrame):
-             concat_object = pd.concat(objects, join=join)
-         elif isinstance(objects[0], ad.AnnData):
-             concat_object = ad.concat(
-                 objects, join=join, label="artifact_uid", keys=artifact_uids
-             )
-         # only call it here because there might be errors during concat
+         artifacts = self.ordered_artifacts.all()
+         concat_object = _load_concat_artifacts(artifacts, join, **kwargs)
+         # only call it here because there might be errors during load or concat
+         # is it really needed if tracking is done in self.ordered_artifacts.all()? - Sergei
         _track_run_input(self, is_run_input)
         return concat_object
 
lamindb/models/core.py CHANGED
@@ -24,7 +24,7 @@ if TYPE_CHECKING:
 
 
  class Storage(Record, TracksRun, TracksUpdates):
-     """Storage locations.
+     """Storage locations of artifacts such as S3 buckets or local directories.
 
      A storage location is either a directory/folder (local or in the cloud) or
      an entire S3/GCP bucket.
lamindb/models/feature.py CHANGED
@@ -143,40 +143,50 @@ def parse_cat_dtype(
 
 
  def serialize_dtype(
-     dtype: Record | FieldAttr | list[Record], is_itype: bool = False
+     dtype: Registry | Record | FieldAttr | list[Record] | list[Registry] | str,
+     is_itype: bool = False,
  ) -> str:
      """Converts a data type object into its string representation."""
+     from .ulabel import ULabel
+
      if (
          not isinstance(dtype, list)
          and hasattr(dtype, "__name__")
          and dtype.__name__ in FEATURE_DTYPES
      ):
          dtype_str = dtype.__name__
+     elif dtype is dict:
+         dtype_str = "dict"
+     elif is_itype and isinstance(dtype, str):
+         if dtype not in "Feature":
+             parse_cat_dtype(
+                 dtype_str=dtype, is_itype=True
+             )  # throws an error if invalid
+         dtype_str = dtype
      elif isinstance(dtype, (ExtensionDtype, np.dtype)):
          dtype_str = serialize_pandas_dtype(dtype)
      else:
-         error_message = (
-             "dtype has to be a record, a record field, or a list of records, not {}"
-         )
-         if isinstance(dtype, Registry):
-             dtype = [dtype]
-         elif isinstance(dtype, DeferredAttribute):
+         error_message = "dtype has to be a registry, a ulabel subtype, a registry field, or a list of registries or fields, not {}"
+         if isinstance(dtype, (Registry, DeferredAttribute, ULabel)):
             dtype = [dtype]
         elif not isinstance(dtype, list):
             raise ValueError(error_message.format(dtype))
         dtype_str = ""
-         for single_dtype in dtype:
-             if not isinstance(single_dtype, Registry) and not isinstance(
-                 single_dtype, DeferredAttribute
-             ):
-                 raise ValueError(error_message.format(single_dtype))
-             if isinstance(single_dtype, Registry):
-                 dtype_str += single_dtype.__get_name_with_module__() + "|"
+         for one_dtype in dtype:
+             if not isinstance(one_dtype, (Registry, DeferredAttribute, ULabel)):
+                 raise ValueError(error_message.format(one_dtype))
+             if isinstance(one_dtype, Registry):
+                 dtype_str += one_dtype.__get_name_with_module__() + "|"
+             elif isinstance(one_dtype, ULabel):
+                 assert one_dtype.is_type, (  # noqa: S101
+                     f"ulabel has to be a type if acting as dtype, {one_dtype} has `is_type` False"
+                 )
+                 dtype_str += f"ULabel[{one_dtype.name}]"
             else:
+                 name = one_dtype.field.name
+                 field_ext = f".{name}" if name != "name" else ""
                 dtype_str += (
-                     single_dtype.field.model.__get_name_with_module__()
-                     + f".{single_dtype.field.name}"
-                     + "|"
+                     one_dtype.field.model.__get_name_with_module__() + field_ext + "|"
                 )
     dtype_str = dtype_str.rstrip("|")
     if not is_itype:
@@ -332,7 +342,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
 
      _name_field: str = "name"
      _aux_fields: dict[str, tuple[str, type]] = {
-         "0": ("default_value", bool),
+         "0": ("default_value", Any),  # type: ignore
          "1": ("nullable", bool),
          "2": ("coerce_dtype", bool),
      }
@@ -499,24 +509,11 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
          super().save(*args, **kwargs)
          return self
 
-     @property
-     def coerce_dtype(self) -> bool:
-         """Whether dtypes should be coerced during validation.
-
-         For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
-         """
-         if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]:  # type: ignore
-             return self._aux["af"]["2"]  # type: ignore
-         else:
-             return False
-
-     @coerce_dtype.setter
-     def coerce_dtype(self, value: bool) -> None:
-         if self._aux is None:  # type: ignore
-             self._aux = {}  # type: ignore
-         if "af" not in self._aux:
-             self._aux["af"] = {}
-         self._aux["af"]["2"] = value
+     def with_config(self, optional: bool | None = None) -> tuple[Feature, dict]:
+         """Pass addtional configurations to the schema."""
+         if optional is not None:
+             return self, {"optional": optional}
+         return self, {}
 
      @property
      def default_value(self) -> Any:
@@ -532,12 +529,9 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
          return None
 
      @default_value.setter
-     def default_value(self, value: bool) -> None:
-         if self._aux is None:  # type: ignore
-             self._aux = {}  # type: ignore
-         if "af" not in self._aux:
-             self._aux["af"] = {}
-         self._aux["af"]["0"] = value
+     def default_value(self, value: str | None) -> None:
+         self._aux = self._aux or {}
+         self._aux.setdefault("af", {})["0"] = value
 
      @property
      def nullable(self) -> bool:
@@ -568,11 +562,48 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
      @nullable.setter
      def nullable(self, value: bool) -> None:
          assert isinstance(value, bool), value  # noqa: S101
-         if self._aux is None:
-             self._aux = {}
-         if "af" not in self._aux:
-             self._aux["af"] = {}
-         self._aux["af"]["1"] = value
+         self._aux = self._aux or {}
+         self._aux.setdefault("af", {})["1"] = value
+
+     @property
+     def coerce_dtype(self) -> bool:
+         """Whether dtypes should be coerced during validation.
+
+         For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
+         """
+         if self._aux is not None and "af" in self._aux and "2" in self._aux["af"]:  # type: ignore
+             return self._aux["af"]["2"]  # type: ignore
+         else:
+             return False
+
+     @coerce_dtype.setter
+     def coerce_dtype(self, value: bool) -> None:
+         self._aux = self._aux or {}
+         self._aux.setdefault("af", {})["2"] = value
+
+     # we'll enable this later
+     # @property
+     # def observational_unit(self) -> Literal["Artifact", "Observation"]:
+     #     """Default observational unit on which the feature is measured.
+
+     #     Currently, we only make a distinction between artifact-level and observation-level features.
+
+     #     For example, a feature `"ml_split"` that stores `"test"` & `"train"` labels is typically defined on the artifact level.
+     #     When accessing `artifact.features.get_values(["ml_split"])`, you expect a single value, either `"test"` or `"train"`.
+
+     #     However, when accessing an artifact annotation with a feature that's defined on the observation-level, say `"cell_type"`, you expect a set of values. So,
+     #     `artifact.features.get_values(["cell_type_from_expert"])` should return a set: `{"T cell", "B cell"}`.
+
+     #     The value of `observational_unit` is currently auto-managed: if using `artifact.featueres.add_values()`,
+     #     it will be set to `Artifact`. In a curator, the value depends on whether it's an artifact- or observation-level slot
+     #     (e.g. `.uns` is artifact-level in `AnnData` whereas `.obs` is observation-level).
+
+     #     Note: This attribute might in the future be used to distinguish different types of observational units (e.g. single cells vs. physical samples vs. study subjects etc.).
+     #     """
+     #     if self._expect_many:
+     #         return "Observation"  # this here might be replaced with the specific observational unit
+     #     else:
+     #         return "Artifact"
 
 
  class FeatureValue(Record, TracksRun):
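
The simplified aux-field setters and the new with_config method suggest usage like this; a minimal sketch with illustrative feature names (the schema machinery that consumes with_config tuples is not shown in this hunk):

    import lamindb as ln

    feature = ln.Feature(name="perturbation", dtype="cat[ULabel]").save()
    feature.nullable = False        # stored under ._aux["af"]["1"]
    feature.coerce_dtype = True     # stored under ._aux["af"]["2"]
    feature.default_value = "DMSO"  # stored under ._aux["af"]["0"]
    feature.save()

    # bundle a per-schema option; returns (feature, {"optional": True})
    feature_and_config = feature.with_config(optional=True)
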
lamindb/models/has_parents.py CHANGED
@@ -4,12 +4,15 @@ from __future__ import annotations
  import builtins
  from typing import TYPE_CHECKING, Literal
 
+ import lamindb_setup as ln_setup
  from lamin_utils import logger
 
  from .record import format_field_value, get_name_field
  from .run import Run
 
  if TYPE_CHECKING:
+     from graphviz import Digraph
+
      from lamindb.base.types import StrField
 
      from .artifact import Artifact
@@ -78,7 +81,7 @@ class HasParents:
          if not isinstance(field, str):
              field = field.field.name
 
-         return _view_parents(
+         return view_parents(
              record=self,  # type: ignore
              field=field,
              with_children=with_children,
@@ -101,7 +104,7 @@ def _transform_emoji(transform: Transform):
          return TRANSFORM_EMOJIS["pipeline"]
 
 
- def _view(u):
+ def view_digraph(u: Digraph):
      from graphviz.backend import ExecutableNotFound
 
      try:
@@ -117,7 +120,7 @@ def _view(u):
              # call to display()
              display(u._repr_mimebundle_(), raw=True)
          else:
-             return u
+             return u.view()
      except (FileNotFoundError, RuntimeError, ExecutableNotFound):  # pragma: no cover
          logger.error(
              "please install the graphviz executable on your system:\n - Ubuntu: `sudo"
@@ -126,7 +129,9 @@ def _view(u):
      )
 
 
- def view_lineage(data: Artifact | Collection, with_children: bool = True) -> None:
+ def view_lineage(
+     data: Artifact | Collection, with_children: bool = True, return_graph: bool = False
+ ) -> Digraph | None:
      """Graph of data flow.
 
      Notes:
@@ -136,6 +141,13 @@ def view_lineage(data: Artifact | Collection, with_children: bool = True) -> Non
      >>> collection.view_lineage()
      >>> artifact.view_lineage()
      """
+     if ln_setup.settings.instance.is_on_hub:
+         instance_slug = ln_setup.settings.instance.slug
+         entity_slug = data.__class__.__name__.lower()
+         logger.important(
+             f"explore at: https://lamin.ai/{instance_slug}/{entity_slug}/{data.uid}"
+         )
+
      import graphviz
 
      df_values = _get_all_parent_runs(data)
@@ -189,10 +201,13 @@ def view_lineage(data: Artifact | Collection, with_children: bool = True) -> Non
          shape="box",
      )
 
-     _view(u)
+     if return_graph:
+         return u
+     else:
+         return view_digraph(u)
 
 
- def _view_parents(
+ def view_parents(
      record: Record,
      field: str,
      with_children: bool = False,
@@ -258,7 +273,7 @@ def _view_parents(
          u.node(row["target"], label=row["target_label"])
          u.edge(row["source"], row["target"], color="dimgrey")
 
-     _view(u)
+     view_digraph(u)
 
 
  def _get_parents(
@@ -505,14 +520,14 @@ def _get_all_child_runs(data: Artifact | Collection) -> list:
          run_inputs_outputs += [(r, outputs_run)]
 
          child_runs.update(
-             Run.filter(
+             Run.filter(  # type: ignore
                  **{f"input_{name}s__uid__in": [i.uid for i in outputs_run]}
              ).list()
          )
          # for artifacts, also include collections in the lineage
          if name == "artifact":
              child_runs.update(
-                 Run.filter(
+                 Run.filter(  # type: ignore
                      input_collections__uid__in=[i.uid for i in outputs_run]
                  ).list()
              )
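
With the new return_graph flag, lineage graphs can be post-processed instead of only displayed. A minimal sketch with a hypothetical artifact lookup:

    import lamindb as ln

    artifact = ln.Artifact.get(key="datasets/my_data.h5ad")  # hypothetical key
    artifact.view_lineage()  # renders inline; on hub instances also logs a lamin.ai URL
    graph = artifact.view_lineage(return_graph=True)  # graphviz.Digraph for further processing
    graph.render("lineage", format="svg")  # standard graphviz API
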