lamindb 1.3.2__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any, Union, overload
 
 import fsspec
 import lamindb_setup as ln_setup
+import numpy as np
 import pandas as pd
 from anndata import AnnData
 from django.db import connections, models
@@ -38,7 +39,6 @@ from lamindb.errors import FieldValidationError
 from lamindb.models.query_set import QuerySet
 
 from ..base.users import current_user_id
-from ..core._compat import is_package_installed
 from ..core.loaders import load_to_memory
 from ..core.storage import (
     LocalPathClasses,
@@ -61,7 +61,6 @@ from ..core.storage.paths import (
 from ..errors import IntegrityError, InvalidArgument, ValidationError
 from ..models._is_versioned import (
     create_uid,
-    message_update_key_in_version_family,
 )
 from ._django import get_artifact_with_related
 from ._feature_manager import (
@@ -69,6 +68,7 @@ from ._feature_manager import (
     ParamManager,
     ParamManagerArtifact,
     add_label_feature_links,
+    filter_base,
     get_label_links,
 )
 from ._is_versioned import IsVersioned
@@ -86,7 +86,7 @@ from .record import (
     _get_record_kwargs,
     record_repr,
 )
-from .run import ParamValue, Run, TracksRun, TracksUpdates, User
+from .run import Param, ParamValue, Run, TracksRun, TracksUpdates, User
 from .schema import Schema
 from .ulabel import ULabel
 
@@ -210,17 +210,6 @@ def process_data(
 
     if not overwritten, data gets stored in default storage
     """
-    supported_data_types = [pd.DataFrame, AnnData]
-    if is_package_installed("mudata"):
-        from mudata import MuData
-
-        supported_data_types.append(MuData)
-    if is_package_installed("spatialdata"):
-        from spatialdata import SpatialData
-
-        supported_data_types.append(SpatialData)
-    supported_data_types = tuple(supported_data_types)  # type: ignore
-
     if key is not None:
         key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
         # use suffix as the (adata) format if the format is not provided
@@ -228,7 +217,8 @@ def process_data(
             format = key_suffix[1:]
     else:
         key_suffix = None
-    if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
+
+    if isinstance(data, (str, Path, UPath)):
         access_token = (
             default_storage._access_token
             if hasattr(default_storage, "_access_token")
@@ -239,6 +229,7 @@ def process_data(
         # for example into a temporary url
         if path.protocol not in {"http", "https"}:
             path = path.resolve()
+
         storage, use_existing_storage_key = process_pathlike(
             path,
             default_storage=default_storage,
@@ -247,28 +238,37 @@ def process_data(
         )
         suffix = extract_suffix_from_path(path)
         memory_rep = None
-    elif isinstance(data, supported_data_types):
+    elif (
+        isinstance(data, pd.DataFrame)
+        or isinstance(data, AnnData)
+        or data_is_mudata(data)
+        or data_is_spatialdata(data)
+    ):
        storage = default_storage
        memory_rep = data
        suffix = infer_suffix(data, format)
    else:
        raise NotImplementedError(
-            f"Do not know how to create a artifact object from {data}, pass a path instead!"
+            f"Do not know how to create an Artifact from {data}, pass a path instead."
        )
+
+    # Check for suffix consistency
    if key_suffix is not None and key_suffix != suffix and not is_replace:
        # consciously omitting a trailing period
-        if isinstance(data, (str, Path, UPath)):
+        if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
            message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
        else:
            message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
        raise InvalidArgument(message)
+
    # in case we have an in-memory representation, we need to write it to disk
-    from lamindb import settings
+    if memory_rep is not None:
+        from lamindb import settings
 
-    if isinstance(data, supported_data_types):
        path = settings.cache_dir / f"{provisional_uid}{suffix}"
        write_to_disk(data, path)
        use_existing_storage_key = False
+
    return memory_rep, path, suffix, storage, use_existing_storage_key
 
 
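
The branch above also rejects a `key` whose suffix disagrees with the format inferred for the data before anything is written. A hedged usage sketch (hypothetical key names; assumes a configured lamindb instance):

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    # a DataFrame serializes to ".parquet" by default, so a ".csv" key is inconsistent:
    # ln.Artifact.from_df(df, key="examples/table.csv")   # raises InvalidArgument
    ln.Artifact.from_df(df, key="examples/table.parquet")  # suffixes agree
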
@@ -533,28 +533,24 @@ def data_is_anndata(data: AnnData | UPathStr) -> bool:
 
 
 def data_is_mudata(data: MuData | UPathStr) -> bool:
-    if is_package_installed("mudata"):
-        from mudata import MuData
-
-        if isinstance(data, MuData):
-            return True
+    # We are not importing MuData here to keep loaded modules minimal
+    if hasattr(data, "__class__") and data.__class__.__name__ == "MuData":
+        return True
     if isinstance(data, (str, Path)):
         return UPath(data).suffix == ".h5mu"
     return False
 
 
 def data_is_spatialdata(data: SpatialData | UPathStr) -> bool:
-    if is_package_installed("spatialdata"):
-        from spatialdata import SpatialData
-
-        if isinstance(data, SpatialData):
-            return True
-    if isinstance(data, (str, Path)):
-        if UPath(data).suffix == ".zarr":
-            # TODO: inconsistent with anndata, where we run the storage
-            # check only for local, expensive for cloud
-            return identify_zarr_type(data, check=False) == "spatialdata"
-    return False
+    # We are not importing SpatialData here to keep loaded modules minimal
+    if hasattr(data, "__class__") and data.__class__.__name__ == "SpatialData":
+        return True
+    if isinstance(data, (str, Path)):
+        if UPath(data).suffix == ".zarr":
+            # TODO: inconsistent with anndata, where we run the storage
+            # check only for local, expensive for cloud
+            return identify_zarr_type(data, check=False) == "spatialdata"
+    return False
 
 
 def _check_otype_artifact(
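
The two helpers above no longer import the optional `mudata` and `spatialdata` packages; they look at the class name and, for paths, at the suffix. A standalone sketch of the same pattern, with illustrative names that are not part of the lamindb API:

    from pathlib import Path

    def looks_like_mudata(data) -> bool:
        # duck-typing check: no import of the optional dependency needed
        if type(data).__name__ == "MuData":
            return True
        if isinstance(data, (str, Path)):
            return Path(data).suffix == ".h5mu"
        return False

    print(looks_like_mudata("pbmc.h5mu"))    # True, decided by the suffix alone
    print(looks_like_mudata({"not": "md"}))  # False
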
@@ -962,53 +958,27 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
         run: `Run | None = None` The run that creates the artifact.
 
-    .. dropdown:: Typical storage formats & their API accessors
-
-        Arrays:
-
-        - Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
-        - Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
-        - Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
-
-        Non-arrays:
-
-        - Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
-        - Fastq: `.fastq` ⟷ /
-        - VCF: `.vcf` ⟷ /
-        - QC: `.html` ⟷ /
-
-        You'll find these values in the `suffix` & `accessor` fields.
-
-        LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).
-
-    See Also:
-        :class:`~lamindb.Storage`
-            Storage locations for artifacts.
-        :class:`~lamindb.Collection`
-            Collections of artifacts.
-        :meth:`~lamindb.Artifact.from_df`
-            Create an artifact from a `DataFrame`.
-        :meth:`~lamindb.Artifact.from_anndata`
-            Create an artifact from an `AnnData`.
-
     Examples:
 
-        Create an artifact by passing `key`:
+        Create an artifact **from a local file or folder**::
 
-        >>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
-        >>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
+            artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
+            artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
 
-        Calling `.save()` uploads the file to the default storage location of your lamindb instance.
-        (If it's a local instance, the "upload" is a mere copy operation.)
+        Calling `.save()` copies or uploads the file to the default storage location of your lamindb instance.
+        If you create an artifact **from a remote file or folder**, lamindb merely registers the S3 `key` and avoids copying the data::
 
-        If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:
+            artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
 
-        >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
+        If you want to **validate & annotate** an array, pass a `schema` to one of the `.from_df()`, `.from_anndata()`, ... constructors::
 
-        You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`
+            schema = ln.Schema(itype=ln.Feature)  # a schema that merely enforces that feature names exist in the Feature registry
+            artifact = ln.Artifact.from_df("./my_file.parquet", key="my_dataset.parquet", schema=schema).save()  # validated and annotated
 
-        >>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
-        >>> artifact_v2.versions.df()  # see all versions
+        You can make a **new version** of an artifact by passing an existing `key`::
+
+            artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
+            artifact_v2.versions.df()  # see all versions
 
     .. dropdown:: Why does the API look this way?
 
@@ -1031,18 +1001,48 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            bucket = quilt3.Bucket('mybucket')
            bucket.put_file('hello.txt', '/tmp/hello.txt')
 
-        Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:
+        Sometimes you want to **avoid mapping the artifact into a path hierarchy**, and you only pass `description`::
+
+            artifact = ln.Artifact("./my_folder", description="My folder").save()
+            artifact_v2 = ln.Artifact("./my_folder", revises=old_artifact).save()  # need to version based on `revises`, a shared description does not trigger a new version
+
+    Notes:
+
+        .. dropdown:: Typical storage formats & their API accessors
+
+            Arrays:
+
+            - Table: `.csv`, `.tsv`, `.parquet`, `.ipc` ⟷ `DataFrame`, `pyarrow.Table`
+            - Annotated matrix: `.h5ad`, `.h5mu`, `.zrad` ⟷ `AnnData`, `MuData`
+            - Generic array: HDF5 group, zarr group, TileDB store ⟷ HDF5, zarr, TileDB loaders
 
-        >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
-        >>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()
+            Non-arrays:
 
-        Because you can then not use `key`-based versioning you have to pass `revises` to make a new artifact version:
+            - Image: `.jpg`, `.png` ⟷ `np.ndarray`, ...
+            - Fastq: `.fastq` ⟷ /
+            - VCF: `.vcf` ⟷ /
+            - QC: `.html` ⟷ /
 
-        >>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()
+            You'll find these values in the `suffix` & `accessor` fields.
 
-        If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
-        the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
-        detects the duplication and will return the existing artifact.
+            LaminDB makes some default choices (e.g., serialize a `DataFrame` as a `.parquet` file).
+
+        .. dropdown:: Will artifacts get duplicated?
+
+            If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact.
+
+            In concurrent workloads where the same artifact is created repeatedly at the exact same time, `.save()`
+            detects the duplication and will return the existing artifact.
+
+    See Also:
+        :class:`~lamindb.Storage`
+            Storage locations for artifacts.
+        :class:`~lamindb.Collection`
+            Collections of artifacts.
+        :meth:`~lamindb.Artifact.from_df`
+            Create an artifact from a `DataFrame`.
+        :meth:`~lamindb.Artifact.from_anndata`
+            Create an artifact from an `AnnData`.
 
     """
 
@@ -1055,6 +1055,8 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     params: ParamManager = ParamManagerArtifact  # type: ignore
     """Param manager.
 
+    What features are for dataset-like artifacts, parameters are for model-like artifacts & runs.
+
     Example::
 
         artifact.params.add_values({
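
Parameters complement features, as the added sentence states. A hedged sketch of annotating a model-like artifact with params (assumes a configured instance and a local file `model.pt`; names are illustrative):

    import lamindb as ln

    ln.Param(name="learning_rate", dtype="float").save()
    ln.Param(name="optimizer", dtype="str").save()
    artifact = ln.Artifact("./model.pt", key="models/model.pt", kind="model").save()
    artifact.params.add_values({"learning_rate": 0.01, "optimizer": "adam"})
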
@@ -1071,20 +1073,20 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     features: FeatureManager = FeatureManager  # type: ignore
     """Feature manager.
 
-    Features denote dataset dimensions, i.e., the variables that measure labels & numbers.
+    Typically, you annotate a dataset with features by defining a `Schema` and passing it to the `Artifact` constructor.
 
-    Annotate with features & values::
+    Here is how to annotate an artifact ad hoc::
 
        artifact.features.add_values({
            "species": organism,  # here, organism is an Organism record
            "scientist": ['Barbara McClintock', 'Edgar Anderson'],
            "temperature": 27.6,
-            "study": "Candidate marker study"
+            "experiment": "Experiment 1"
        })
 
-    Query for features & values::
+    Query artifacts by features::
 
-        ln.Artifact.features.filter(scientist="Barbara McClintock")
+        ln.Artifact.filter(scientist="Barbara McClintock")
 
     Features may or may not be part of the artifact content in storage. For
     instance, the :class:`~lamindb.Curator` flow validates the columns of a
@@ -1100,22 +1102,22 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
    To annotate with labels, you typically use the registry-specific accessors,
    for instance :attr:`~lamindb.Artifact.ulabels`::
 
-        candidate_marker_study = ln.ULabel(name="Candidate marker study").save()
-        artifact.ulabels.add(candidate_marker_study)
+        experiment = ln.ULabel(name="Experiment 1").save()
+        artifact.ulabels.add(experiment)
 
    Similarly, you query based on these accessors::
 
-        ln.Artifact.filter(ulabels__name="Candidate marker study").all()
+        ln.Artifact.filter(ulabels__name="Experiment 1").all()
 
    Unlike the registry-specific accessors, the `.labels` accessor provides
    a way of associating labels with features::
 
-        study = ln.Feature(name="study", dtype="cat").save()
-        artifact.labels.add(candidate_marker_study, feature=study)
+        experiment_feature = ln.Feature(name="experiment", dtype="cat").save()
+        artifact.labels.add(experiment, feature=experiment_feature)
 
    Note that the above is equivalent to::
 
-        artifact.features.add_values({"study": candidate_marker_study})
+        artifact.features.add_values({"experiment": experiment})
    """
    from ._label_manager import LabelManager
 
@@ -1343,15 +1345,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                f"Only {valid_keywords} can be passed, you passed: {kwargs}"
            )
        if revises is not None and key is not None and revises.key != key:
-            note = message_update_key_in_version_family(
-                suid=revises.stem_uid,
-                existing_key=revises.key,
-                new_key=key,
-                registry="Artifact",
-            )
-            raise ValueError(
-                f"`key` is {key}, but `revises.key` is '{revises.key}'\n\n Either do *not* pass `key`.\n\n{note}"
-            )
+            logger.warning(f"renaming artifact from '{revises.key}' to {key}")
        if revises is not None:
            if not isinstance(revises, Artifact):
                raise TypeError("`revises` has to be of type `Artifact`")
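
With the change above, passing a `key` that differs from `revises.key` no longer raises `ValueError`; the rename is logged and construction proceeds. A hedged sketch (hypothetical local files, configured instance assumed):

    import lamindb as ln

    v1 = ln.Artifact("./report_draft.parquet", key="reports/report.parquet").save()
    v2 = ln.Artifact(
        "./report_final.parquet", key="reports/report_final.parquet", revises=v1
    ).save()
    # logs: renaming artifact from 'reports/report.parquet' to reports/report_final.parquet
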
@@ -1431,11 +1425,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            kwargs["uid"] = uid
 
        # only set key now so that we don't do a look-up on it in case revises is passed
-        if revises is not None and revises.key is not None:
-            assert revises.key.endswith(kwargs["suffix"]), (  # noqa: S101
-                revises.key,
-                kwargs["suffix"],
-            )
+        if revises is not None and revises.key is not None and kwargs["key"] is None:
            kwargs["key"] = revises.key
 
        kwargs["kind"] = kind
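
Conversely, when no `key` is passed, the new version now simply inherits `revises.key`, and the previous suffix assertion is dropped. A minimal sketch (hypothetical files, configured instance assumed):

    import lamindb as ln

    v1 = ln.Artifact("./table.parquet", key="tables/table.parquet").save()
    v2 = ln.Artifact("./table_v2.parquet", revises=v1).save()
    assert v2.key == "tables/table.parquet"  # inherited from v1
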
@@ -1530,15 +1520,84 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            - Guide: :doc:`docs:registries`
            - Method in `Record` base class: :meth:`~lamindb.models.Record.get`
 
-        Examples::
+        Examples:
 
-            artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
-            artifact = ln.Artifact.get(key="my_datasets/my_file.parquet")
+            ::
+
+                artifact = ln.Artifact.get("tCUkRcaEjTjhtozp0000")
+                artifact = ln.Artifact.get(key="my_datasets/my_file.parquet")
        """
        from .query_set import QuerySet
 
        return QuerySet(model=cls).get(idlike, **expressions)
 
+    @classmethod
+    def filter(
+        cls,
+        *queries,
+        **expressions,
+    ) -> QuerySet:
+        """Query a set of artifacts.
+
+        Args:
+            *queries: `Q` expressions.
+            **expressions: Features, params, fields via the Django query syntax.
+
+        See Also:
+            - Guide: :doc:`docs:registries`
+
+        Examples:
+
+            Query by fields::
+
+                ln.Artifact.filter(key="my_datasets/my_file.parquet")
+
+            Query by features::
+
+                ln.Artifact.filter(cell_type_by_model__name="T cell")
+
+            Query by params::
+
+                ln.Artifact.filter(hyperparam_x=100)
+        """
+        from .query_set import QuerySet
+
+        if expressions:
+            keys_normalized = [key.split("__")[0] for key in expressions]
+            field_or_feature_or_param = keys_normalized[0].split("__")[0]
+            if field_or_feature_or_param in Artifact.__get_available_fields__():
+                return QuerySet(model=cls).filter(*queries, **expressions)
+            elif all(
+                features_validated := Feature.validate(
+                    keys_normalized, field="name", mute=True
+                )
+            ):
+                return filter_base(FeatureManager, **expressions)
+            elif all(
+                params_validated := Param.validate(
+                    keys_normalized, field="name", mute=True
+                )
+            ):
+                return filter_base(ParamManagerArtifact, **expressions)
+            else:
+                if sum(features_validated) < sum(params_validated):
+                    params = ", ".join(
+                        sorted(np.array(keys_normalized)[~params_validated])
+                    )
+                    message = f"param names: {params}"
+                else:
+                    features = ", ".join(
+                        sorted(np.array(keys_normalized)[~features_validated])
+                    )
+                    message = f"feature names: {features}"
+                fields = ", ".join(sorted(cls.__get_available_fields__()))
+                raise InvalidArgument(
+                    f"You can query either by available fields: {fields}\n"
+                    f"Or fix invalid {message}"
+                )
+        else:
+            return QuerySet(model=cls).filter(*queries, **expressions)
+
    @classmethod
    def from_df(
        cls,
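
A short sketch of how the new `filter` dispatch behaves in practice (assumes a configured instance; the feature name is illustrative):

    import lamindb as ln

    ln.Feature(name="temperature", dtype="float").save()
    ln.Artifact.filter(key__startswith="examples/")  # plain field-based query
    ln.Artifact.filter(temperature=27.6)             # routed through the feature-based filter
    # ln.Artifact.filter(tempratuer=27.6)            # unknown name -> InvalidArgument listing the available fields
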
@@ -1548,6 +1607,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
        description: str | None = None,
        run: Run | None = None,
        revises: Artifact | None = None,
+        schema: Schema | None = None,
        **kwargs,
    ) -> Artifact:
        """Create from `DataFrame`, validate & link features.
@@ -1559,6 +1619,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            description: A description.
            revises: An old version of the artifact.
            run: The run that creates the artifact.
+            schema: A schema to validate & annotate.
 
        See Also:
            :meth:`~lamindb.Collection`
@@ -1591,6 +1652,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            **kwargs,
        )
        artifact.n_observations = len(df)
+        if schema is not None:
+            from ..curators import DataFrameCurator
+
+            curator = DataFrameCurator(artifact, schema)
+            curator.validate()
+            artifact.schema = schema
+            artifact._curator = curator
        return artifact
 
    @classmethod
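
End-to-end, the new `schema` argument wires validation into construction. A minimal sketch, assuming a configured instance and features that are already registered:

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"temperature": [27.6, 22.1]})
    ln.Feature(name="temperature", dtype="float").save()
    schema = ln.Schema(itype=ln.Feature)  # column names must be registered Features
    artifact = ln.Artifact.from_df(df, key="examples/measurements.parquet", schema=schema)
    artifact.save()
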
@@ -1602,6 +1670,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
        description: str | None = None,
        run: Run | None = None,
        revises: Artifact | None = None,
+        schema: Schema | None = None,
        **kwargs,
    ) -> Artifact:
        """Create from ``AnnData``, validate & link features.
@@ -1613,6 +1682,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            description: A description.
            revises: An old version of the artifact.
            run: The run that creates the artifact.
+            schema: A schema to validate & annotate.
 
        See Also:
 
@@ -1654,6 +1724,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
        # and the proper path through create_path for cloud paths
        obj_for_obs = artifact.path
        artifact.n_observations = _anndata_n_observations(obj_for_obs)
+        if schema is not None:
+            from ..curators import AnnDataCurator
+
+            curator = AnnDataCurator(artifact, schema)
+            curator.validate()
+            artifact.schema = schema
+            artifact._curator = curator
        return artifact
 
    @classmethod
@@ -1665,6 +1742,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
        description: str | None = None,
        run: Run | None = None,
        revises: Artifact | None = None,
+        schema: Schema | None = None,
        **kwargs,
    ) -> Artifact:
        """Create from ``MuData``, validate & link features.
@@ -1676,6 +1754,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            description: A description.
            revises: An old version of the artifact.
            run: The run that creates the artifact.
+            schema: A schema to validate & annotate.
 
        See Also:
            :meth:`~lamindb.Collection`
@@ -1704,6 +1783,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
        )
        if not isinstance(mdata, UPathStr):
            artifact.n_observations = mdata.n_obs
+        if schema is not None:
+            from ..curators import MuDataCurator
+
+            curator = MuDataCurator(artifact, schema)
+            curator.validate()
+            artifact.schema = schema
+            artifact._curator = curator
        return artifact
 
    @classmethod
@@ -1715,6 +1801,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
        description: str | None = None,
        run: Run | None = None,
        revises: Artifact | None = None,
+        schema: Schema | None = None,
        **kwargs,
    ) -> Artifact:
        """Create from ``SpatialData``, validate & link features.
@@ -1726,6 +1813,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            description: A description.
            revises: An old version of the artifact.
            run: The run that creates the artifact.
+            schema: A schema to validate & annotate.
 
        See Also:
            :meth:`~lamindb.Collection`
@@ -1755,6 +1843,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
        )
        # ill-defined https://scverse.zulipchat.com/#narrow/channel/315824-spatial/topic/How.20to.20calculate.20the.20number.20of.20observations.3F
        # artifact.n_observations = ...
+        if schema is not None:
+            from ..curators import SpatialDataCurator
+
+            curator = SpatialDataCurator(artifact, schema)
+            curator.validate()
+            artifact.schema = schema
+            artifact._curator = curator
        return artifact
 
    @classmethod
@@ -2466,6 +2561,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                local_path_cache,
            )
            logger.important(f"moved local artifact to cache: {local_path_cache}")
+        if hasattr(self, "_curator"):
+            curator = self._curator
+            delattr(self, "_curator")
+            curator.save_artifact()
        return self
 
    def restore(self) -> None:
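
This hook completes the `schema` flow: the curator stashed by a `from_*` constructor runs its `save_artifact()` once the artifact itself is saved, so annotations are written only after the data reaches storage. A hedged sketch, continuing the `from_df(..., schema=...)` example above:

    # `df` and `schema` as in the earlier from_df sketch
    artifact = ln.Artifact.from_df(df, key="examples/measurements.parquet", schema=schema)
    assert hasattr(artifact, "_curator")  # validation already ran in the constructor
    artifact.save()                       # save_artifact() writes the annotations here
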
@@ -57,6 +57,7 @@ def _inspect(
    mute: bool = False,
    organism: str | Record | None = None,
    source: Record | None = None,
+    from_source: bool = True,
    strict_source: bool = False,
 ) -> pd.DataFrame | dict[str, list[str]]:
    """{}"""  # noqa: D415
@@ -94,7 +95,7 @@ def _inspect(
    )
    nonval = set(result_db.non_validated).difference(result_db.synonyms_mapper.keys())
 
-    if len(nonval) > 0 and hasattr(registry, "source_id"):
+    if from_source and len(nonval) > 0 and hasattr(registry, "source_id"):
        try:
            public_result = registry.public(
                organism=organism_record, source=source
@@ -463,6 +464,7 @@ class CanCurate:
        mute: bool = False,
        organism: Union[str, Record, None] = None,
        source: Record | None = None,
+        from_source: bool = True,
        strict_source: bool = False,
    ) -> InspectResult:
        """Inspect if values are mappable to a field.
@@ -506,6 +508,7 @@ class CanCurate:
            strict_source=strict_source,
            organism=organism,
            source=source,
+            from_source=from_source,
        )
 
    @classmethod
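
The new `from_source` flag propagates from `CanCurate.inspect` into `_inspect`: by default, non-validated values are additionally checked against the public ontology source, while `from_source=False` restricts inspection to records already in the database. A hedged sketch (the bionty registry is illustrative):

    import bionty as bt

    # only check against records in the current instance, skip the public-source lookup
    bt.CellType.inspect(["T cell", "my in-house cell state"], from_source=False)
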
@@ -325,11 +325,13 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
            artifact: An artifact to add to the collection.
            run: The run that creates the new version of the collection.
 
-        Examples::
+        Examples:
+
+            ::
 
-            collection_v1 = ln.Collection(artifact, key="My collection").save()
-            collection_v2 = collection.append(another_artifact)  # returns a new version of the collection
-            collection_v2.save()  # save the new version
+                collection_v1 = ln.Collection(artifact, key="My collection").save()
+                collection_v2 = collection.append(another_artifact)  # returns a new version of the collection
+                collection_v2.save()  # save the new version
 
        """
        return Collection(  # type: ignore