lamindb 1.0.5__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. lamindb/__init__.py +17 -6
  2. lamindb/_artifact.py +202 -87
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +86 -52
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +21 -7
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +78 -18
  10. lamindb/_record.py +170 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +42 -11
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +129 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/users.py +1 -4
  19. lamindb/base/validation.py +2 -6
  20. lamindb/core/__init__.py +13 -14
  21. lamindb/core/_context.py +14 -9
  22. lamindb/core/_data.py +29 -25
  23. lamindb/core/_describe.py +1 -1
  24. lamindb/core/_django.py +1 -1
  25. lamindb/core/_feature_manager.py +53 -43
  26. lamindb/core/_label_manager.py +4 -4
  27. lamindb/core/_mapped_collection.py +24 -9
  28. lamindb/core/_track_environment.py +2 -1
  29. lamindb/core/datasets/__init__.py +6 -1
  30. lamindb/core/datasets/_core.py +12 -11
  31. lamindb/core/datasets/_small.py +67 -21
  32. lamindb/core/exceptions.py +1 -90
  33. lamindb/core/loaders.py +21 -15
  34. lamindb/core/relations.py +6 -4
  35. lamindb/core/storage/_anndata_accessor.py +49 -3
  36. lamindb/core/storage/_backed_access.py +12 -7
  37. lamindb/core/storage/_pyarrow_dataset.py +40 -15
  38. lamindb/core/storage/_tiledbsoma.py +56 -12
  39. lamindb/core/storage/paths.py +30 -24
  40. lamindb/core/subsettings/_creation_settings.py +4 -16
  41. lamindb/curators/__init__.py +2193 -846
  42. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  43. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  44. lamindb/errors.py +96 -0
  45. lamindb/integrations/_vitessce.py +3 -3
  46. lamindb/migrations/0069_squashed.py +76 -75
  47. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  48. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  49. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  50. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  51. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  52. lamindb/migrations/0086_various.py +95 -0
  53. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  54. lamindb/migrations/0088_schema_components.py +273 -0
  55. lamindb/migrations/0088_squashed.py +4372 -0
  56. lamindb/models.py +475 -168
  57. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/METADATA +9 -7
  58. lamindb-1.1.1.dist-info/RECORD +95 -0
  59. lamindb/curators/_spatial.py +0 -528
  60. lamindb/migrations/0052_squashed.py +0 -1261
  61. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  62. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  63. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  64. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  65. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  66. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  67. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  68. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  69. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  70. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  71. lamindb/migrations/0063_populate_latest_field.py +0 -45
  72. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  73. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  74. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  75. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  76. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  77. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  78. lamindb-1.0.5.dist-info/RECORD +0 -102
  79. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/LICENSE +0 -0
  80. {lamindb-1.0.5.dist-info → lamindb-1.1.1.dist-info}/WHEEL +0 -0
lamindb/models.py CHANGED
@@ -65,6 +65,7 @@ if TYPE_CHECKING:
65
65
  from pyarrow.dataset import Dataset as PyArrowDataset
66
66
  from tiledbsoma import Collection as SOMACollection
67
67
  from tiledbsoma import Experiment as SOMAExperiment
68
+ from tiledbsoma import Measurement as SOMAMeasurement
68
69
  from upath import UPath
69
70
 
70
71
  from lamindb.core import LabelManager, MappedCollection, QuerySet, RecordList
@@ -152,9 +153,13 @@ def current_run() -> Run | None:
152
153
  if not _TRACKING_READY:
153
154
  _TRACKING_READY = _check_instance_setup()
154
155
  if _TRACKING_READY:
155
- import lamindb.core
156
+ import lamindb
156
157
 
157
- return lamindb.context.run
158
+ # also see get_run() in core._data
159
+ run = lamindb._tracked.get_current_tracked_run()
160
+ if run is None:
161
+ run = lamindb.context.run
162
+ return run
158
163
  else:
159
164
  return None
160
165
 
@@ -239,6 +244,7 @@ class CanCurate:
239
244
  mute: bool = False,
240
245
  organism: str | Record | None = None,
241
246
  source: Record | None = None,
247
+ strict_source: bool = False,
242
248
  ) -> InspectResult:
243
249
  """Inspect if values are mappable to a field.
244
250
 
@@ -252,6 +258,10 @@ class CanCurate:
252
258
  mute: Whether to mute logging.
253
259
  organism: An Organism name or record.
254
260
  source: A `bionty.Source` record that specifies the version to inspect against.
261
+ strict_source: Determines the validation behavior against records in the registry.
262
+ - If `False`, validation will include all records in the registry, ignoring the specified source.
263
+ - If `True`, validation will only include records in the registry that are linked to the specified source.
264
+ Note: this parameter won't affect validation against bionty/public sources.
255
265
 
256
266
  See Also:
257
267
  :meth:`~lamindb.core.CanCurate.validate`
@@ -278,10 +288,11 @@ class CanCurate:
278
288
  mute: bool = False,
279
289
  organism: str | Record | None = None,
280
290
  source: Record | None = None,
291
+ strict_source: bool = False,
281
292
  ) -> np.ndarray:
282
293
  """Validate values against existing values of a string field.
283
294
 
284
- Note this is strict validation, only asserts exact matches.
295
+ Note this is strict validation, only asserts exact matches.
285
296
 
286
297
  Args:
287
298
  values: Values that will be validated against the field.
@@ -291,6 +302,10 @@ class CanCurate:
291
302
  mute: Whether to mute logging.
292
303
  organism: An Organism name or record.
293
304
  source: A `bionty.Source` record that specifies the version to validate against.
305
+ strict_source: Determines the validation behavior against records in the registry.
306
+ - If `False`, validation will include all records in the registry, ignoring the specified source.
307
+ - If `True`, validation will only include records in the registry that are linked to the specified source.
308
+ Note: this parameter won't affect validation against bionty/public sources.
294
309
 
295
310
  Returns:
296
311
  A vector of booleans indicating if an element is validated.
@@ -370,6 +385,7 @@ class CanCurate:
370
385
  synonyms_field: str = "synonyms",
371
386
  organism: str | Record | None = None,
372
387
  source: Record | None = None,
388
+ strict_source: bool = False,
373
389
  ) -> list[str] | dict[str, str]:
374
390
  """Maps input synonyms to standardized names.
375
391
 
@@ -392,6 +408,10 @@ class CanCurate:
392
408
  synonyms_field: A field containing the concatenated synonyms.
393
409
  organism: An Organism name or record.
394
410
  source: A `bionty.Source` record that specifies the version to validate against.
411
+ strict_source: Determines the validation behavior against records in the registry.
412
+ - If `False`, validation will include all records in the registry, ignoring the specified source.
413
+ - If `True`, validation will only include records in the registry that are linked to the specified source.
414
+ Note: this parameter won't affect validation against bionty/public sources.
395
415
 
396
416
  Returns:
397
417
  If `return_mapper` is `False`: a list of standardized names. Otherwise,
@@ -679,7 +699,7 @@ class Registry(ModelBase):
679
699
  A record.
680
700
 
681
701
  Raises:
682
- :exc:`docs:lamindb.core.exceptions.DoesNotExist`: In case no matching record is found.
702
+ :exc:`docs:lamindb.errors.DoesNotExist`: In case no matching record is found.
683
703
 
684
704
  See Also:
685
705
  - Guide: :doc:`docs:registries`
@@ -1187,7 +1207,7 @@ class Transform(Record, IsVersioned):
1187
1207
 
1188
1208
  Create a transform for a pipeline:
1189
1209
 
1190
- >>> transform = ln.Transform(name="Cell Ranger", version="7.2.0", type="pipeline").save()
1210
+ >>> transform = ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
1191
1211
 
1192
1212
  Create a transform from a notebook:
1193
1213
 
@@ -1230,7 +1250,11 @@ class Transform(Record, IsVersioned):
1230
1250
  .. versionchanged:: 0.75
1231
1251
  The `source_code` field is no longer an artifact, but a text field.
1232
1252
  """
1233
- hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
1253
+ # we have a unique constraint here but not on artifact because on artifact, we haven't yet
1254
+ # settled how we model the same artifact in different storage locations
1255
+ hash: str | None = CharField(
1256
+ max_length=HASH_LENGTH, db_index=True, null=True, unique=True
1257
+ )
1234
1258
  """Hash of the source code."""
1235
1259
  reference: str | None = CharField(max_length=255, db_index=True, null=True)
1236
1260
  """Reference for the transform, e.g., a URL."""
@@ -1340,7 +1364,7 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
1340
1364
  _name_field: str = "name"
1341
1365
 
1342
1366
  name: str = CharField(max_length=100, db_index=True)
1343
- dtype: str = CharField(max_length=64, db_index=True)
1367
+ dtype: str | None = CharField(db_index=True, null=True)
1344
1368
  """Data type ("num", "cat", "int", "float", "bool", "datetime").
1345
1369
 
1346
1370
  For categorical types, can define from which registry values are
@@ -1353,7 +1377,7 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
1353
1377
  """
1354
1378
  records: Param
1355
1379
  """Records of this type."""
1356
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
1380
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
1357
1381
  """Distinguish types from instances of the type."""
1358
1382
  _expect_many: bool = models.BooleanField(default=False, db_default=False)
1359
1383
  """Indicates whether values for this param are expected to occur a single or multiple times for an artifact/run (default `False`).
@@ -1369,6 +1393,28 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
1369
1393
  values: ParamValue
1370
1394
  """Values for this parameter."""
1371
1395
 
1396
+ def __init__(self, *args, **kwargs):
1397
+ from ._feature import process_init_feature_param
1398
+ from .errors import ValidationError
1399
+
1400
+ if len(args) == len(self._meta.concrete_fields):
1401
+ super().__init__(*args, **kwargs)
1402
+ return None
1403
+
1404
+ dtype = kwargs.get("dtype", None)
1405
+ kwargs = process_init_feature_param(args, kwargs, is_param=True)
1406
+ super().__init__(*args, **kwargs)
1407
+ dtype_str = kwargs.pop("dtype", None)
1408
+ if not self._state.adding:
1409
+ if not (
1410
+ self.dtype.startswith("cat")
1411
+ if dtype == "cat"
1412
+ else self.dtype == dtype_str
1413
+ ):
1414
+ raise ValidationError(
1415
+ f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
1416
+ )
1417
+
1372
1418
 
1373
1419
  # FeatureValue behaves in many ways like a link in a LinkORM
1374
1420
  # in particular, we don't want a _public field on it
@@ -1460,8 +1506,8 @@ class Run(Record):
1460
1506
 
1461
1507
  Create a run record:
1462
1508
 
1463
- >>> ln.Transform(name="Cell Ranger", version="7.2.0", type="pipeline").save()
1464
- >>> transform = ln.Transform.get(name="Cell Ranger", version="7.2.0")
1509
+ >>> ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
1510
+ >>> transform = ln.Transform.get(key="Cell Ranger", version="7.2.0")
1465
1511
  >>> run = ln.Run(transform)
1466
1512
 
1467
1513
  Create a global run context for a custom transform:
@@ -1687,7 +1733,7 @@ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
1687
1733
  """
1688
1734
  records: ULabel
1689
1735
  """Records of this type."""
1690
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
1736
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
1691
1737
  """Distinguish types from instances of the type.
1692
1738
 
1693
1739
  For example, a ulabel "Project" would be a type, and the actual projects "Project 1", "Project 2", would be records of that `type`.
@@ -1727,6 +1773,8 @@ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
1727
1773
  def __init__(
1728
1774
  self,
1729
1775
  name: str,
1776
+ type: ULabel | None = None,
1777
+ is_type: bool = False,
1730
1778
  description: str | None = None,
1731
1779
  reference: str | None = None,
1732
1780
  reference_type: str | None = None,
@@ -1765,12 +1813,15 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1765
1813
 
1766
1814
  Args:
1767
1815
  name: `str` Name of the feature, typically a column name.
1768
- dtype: `FeatureDtype | Registry | list[Registry]` See :class:`~lamindb.base.types.FeatureDtype`.
1816
+ dtype: `FeatureDtype | Registry | list[Registry] | FieldAttr` See :class:`~lamindb.base.types.FeatureDtype`.
1769
1817
  For categorical types, can define from which registry values are
1770
1818
  sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`.
1771
1819
  unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc.
1772
1820
  description: `str | None = None` A description.
1773
1821
  synonyms: `str | None = None` Bar-separated synonyms.
1822
+ nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`.
1823
+ default_value: `Any | None = None` Default value for the feature.
1824
+ cat_filters: `dict[str, str] | None = None` Subset a registry by additional filters to define valid categories.
1774
1825
 
1775
1826
  Note:
1776
1827
 
@@ -1835,6 +1886,10 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1835
1886
  abstract = False
1836
1887
 
1837
1888
  _name_field: str = "name"
1889
+ _aux_fields: dict[str, tuple[str, type]] = {
1890
+ "0": ("default_value", bool),
1891
+ "1": ("nullable", bool),
1892
+ }
1838
1893
 
1839
1894
  id: int = models.AutoField(primary_key=True)
1840
1895
  """Internal id, valid only in one DB instance."""
@@ -1844,7 +1899,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1844
1899
  """Universal id, valid across DB instances."""
1845
1900
  name: str = CharField(max_length=150, db_index=True, unique=True)
1846
1901
  """Name of feature (hard unique constraint `unique=True`)."""
1847
- dtype: FeatureDtype = CharField(db_index=True)
1902
+ dtype: FeatureDtype | None = CharField(db_index=True, null=True)
1848
1903
  """Data type (:class:`~lamindb.base.types.FeatureDtype`).
1849
1904
 
1850
1905
  For categorical types, can define from which registry values are
@@ -1860,7 +1915,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1860
1915
  """
1861
1916
  records: Feature
1862
1917
  """Records of this type."""
1863
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
1918
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
1864
1919
  """Distinguish types from instances of the type."""
1865
1920
  unit: str | None = CharField(max_length=30, db_index=True, null=True)
1866
1921
  """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional)."""
@@ -1922,10 +1977,15 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1922
1977
  def __init__(
1923
1978
  self,
1924
1979
  name: str,
1925
- dtype: FeatureDtype | Registry | list[Registry],
1926
- unit: str | None,
1927
- description: str | None,
1928
- synonyms: str | None,
1980
+ dtype: FeatureDtype | Registry | list[Registry] | FieldAttr,
1981
+ type: Feature | None = None,
1982
+ is_type: bool = False,
1983
+ unit: str | None = None,
1984
+ description: str | None = None,
1985
+ synonyms: str | None = None,
1986
+ nullable: bool = True,
1987
+ default_value: str | None = None,
1988
+ cat_filters: dict[str, str] | None = None,
1929
1989
  ): ...
1930
1990
 
1931
1991
  @overload
@@ -1950,6 +2010,62 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1950
2010
  """Save."""
1951
2011
  pass
1952
2012
 
2013
+ @property
2014
+ def default_value(self) -> Any:
2015
+ """A default value that overwrites missing values (default `None`).
2016
+
2017
+ This takes effect when you call `Curator.standardize()`.
2018
+
2019
+ If `default_value = None`, missing values like `pd.NA` or `np.nan` are kept.
2020
+ """
2021
+ if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
2022
+ return self._aux["af"]["0"]
2023
+ else:
2024
+ return None
2025
+
2026
+ @default_value.setter
2027
+ def default_value(self, value: bool) -> None:
2028
+ if self._aux is None:
2029
+ self._aux = {}
2030
+ if "af" not in self._aux:
2031
+ self._aux["af"] = {}
2032
+ self._aux["af"]["0"] = value
2033
+
2034
+ @property
2035
+ def nullable(self) -> bool:
2036
+ """Indicates whether the feature can have nullable values (default `True`).
2037
+
2038
+ Example::
2039
+
2040
+ import lamindb as ln
2041
+ import pandas as pd
2042
+
2043
+ disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save()
2044
+ schema = ln.Schema(features=[disease]).save()
2045
+ dataset = {"disease": pd.Categorical([pd.NA, "asthma"])}
2046
+ df = pd.DataFrame(dataset)
2047
+ curator = ln.curators.DataFrameCurator(df, schema)
2048
+ try:
2049
+ curator.validate()
2050
+ except ln.errors.ValidationError as e:
2051
+ assert str(e).startswith("non-nullable series 'disease' contains null values")
2052
+
2053
+ """
2054
+ if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
2055
+ value = self._aux["af"]["1"]
2056
+ return True if value is None else value
2057
+ else:
2058
+ return True
2059
+
2060
+ @nullable.setter
2061
+ def nullable(self, value: bool) -> None:
2062
+ assert isinstance(value, bool), value # noqa: S101
2063
+ if self._aux is None:
2064
+ self._aux = {}
2065
+ if "af" not in self._aux:
2066
+ self._aux["af"] = {}
2067
+ self._aux["af"]["1"] = value
2068
+
1953
2069
 
1954
2070
  class FeatureValue(Record, TracksRun):
1955
2071
  """Non-categorical features values.
@@ -2000,9 +2116,10 @@ class FeatureValue(Record, TracksRun):
2000
2116
  # Simple types: int, float, str, bool
2001
2117
  if isinstance(value, (int, float, str, bool)):
2002
2118
  try:
2003
- return cls.objects.create(
2004
- feature=feature, value=value, hash=None
2005
- ), False
2119
+ return (
2120
+ cls.objects.create(feature=feature, value=value, hash=None),
2121
+ False,
2122
+ )
2006
2123
  except IntegrityError:
2007
2124
  return cls.objects.get(feature=feature, value=value), True
2008
2125
 
@@ -2010,49 +2127,64 @@ class FeatureValue(Record, TracksRun):
2010
2127
  else:
2011
2128
  hash = hash_dict(value)
2012
2129
  try:
2013
- return cls.objects.create(
2014
- feature=feature, value=value, hash=hash
2015
- ), False
2130
+ return (
2131
+ cls.objects.create(feature=feature, value=value, hash=hash),
2132
+ False,
2133
+ )
2016
2134
  except IntegrityError:
2017
2135
  return cls.objects.get(feature=feature, hash=hash), True
2018
2136
 
2019
2137
 
2020
2138
  class Schema(Record, CanCurate, TracksRun):
2021
- """Feature sets (dataset schemas).
2139
+ """Schemas / feature sets.
2022
2140
 
2023
- Stores references to dataset schemas: these are the sets of columns in a dataset
2024
- that correspond to :class:`~lamindb.Feature`, :class:`~bionty.Gene`, :class:`~bionty.Protein` or other
2025
- entities.
2141
+ A simple schema is just a set of columns in a `DataFrame`, a "feature set".
2026
2142
 
2027
- .. dropdown:: Why does LaminDB model feature sets, not just features?
2028
-
2029
- 1. Performance: Imagine you measure the same panel of 20k transcripts in
2030
- 1M samples. By modeling the panel as a feature set, you can link all
2031
- your artifacts against one feature set and only need to store 1M
2032
- instead of 1M x 20k = 20B links.
2033
- 2. Interpretation: Model protein panels, gene panels, etc.
2034
- 3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.
2035
-
2036
- These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
2143
+ A composite schema has multiple components, e.g. for an `AnnData`, each a feature set for `obs` and `var`.
2037
2144
 
2038
2145
  Args:
2039
- features: `Iterable[Record]` An iterable of :class:`~lamindb.Feature`
2146
+ features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
2040
2147
  records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
2041
2148
  a set upon instantiation. If you'd like to pass values, use
2042
2149
  :meth:`~lamindb.Schema.from_values` or
2043
2150
  :meth:`~lamindb.Schema.from_df`.
2151
+ components: `dict[str, Schema] | None = None` A dictionary mapping component names to
2152
+ their corresponding :class:`~lamindb.Schema` objects for composite schemas.
2153
+ name: `str | None = None` A name.
2154
+ description: `str | None = None` A description.
2044
2155
  dtype: `str | None = None` The simple type. Defaults to
2045
2156
  `None` for sets of :class:`~lamindb.Feature` records.
2046
2157
  Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
2047
- name: `str | None = None` A name.
2158
+ itype: `str | None = None` The feature identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
2159
+ type: `Schema | None = None` A type.
2160
+ is_type: `bool = False` Distinguish types from instances of the type.
2161
+ otype: `str | None = None` An object type to define the structure of a composite schema.
2162
+ minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
2163
+ ordered_set: `bool = False` Whether features are required to be ordered.
2164
+ maximal_set: `bool = False` If `True`, no additional features are allowed.
2165
+ slot: `str | None = None` The slot name when this schema is used as a component in a
2166
+ composite schema.
2167
+ coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
2168
+ during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
2169
+
2170
+ .. dropdown:: Why does LaminDB model schemas, not just features?
2171
+
2172
+ 1. Performance: Imagine you measure the same panel of 20k transcripts in
2173
+ 1M samples. By modeling the panel as a feature set, you can link all
2174
+ your artifacts against one feature set and only need to store 1M
2175
+ instead of 1M x 20k = 20B links.
2176
+ 2. Interpretation: Model protein panels, gene panels, etc.
2177
+ 3. Data integration: Feature sets provide the information that determines whether two datasets can be meaningfully concatenated.
2178
+
2179
+ These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
2048
2180
 
2049
2181
  Note:
2050
2182
 
2051
- A feature set can be identified by the `hash` its feature uids.
2183
+ A feature set can be identified by the `hash` of its feature uids.
2052
2184
  It's stored in the `.hash` field.
2053
2185
 
2054
- A `slot` provides a string key to access feature sets.
2055
- It's typically the accessor within the registered data object, here `pd.DataFrame.columns`.
2186
+ A `slot` provides a string key to access feature sets. For instance, for the schema of an
2187
+ `AnnData` object, it would be `'obs'` for `adata.obs`.
2056
2188
 
2057
2189
  See Also:
2058
2190
  :meth:`~lamindb.Schema.from_values`
@@ -2062,24 +2194,20 @@ class Schema(Record, CanCurate, TracksRun):
2062
2194
 
2063
2195
  Examples:
2064
2196
 
2065
- Create a feature set / schema from df with types:
2197
+ Create a schema (feature set) from df with types:
2066
2198
 
2067
2199
  >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
2068
- >>> feature_set = ln.FeatureSet.from_df(df)
2200
+ >>> schema = ln.Schema.from_df(df)
2069
2201
 
2070
- Create a feature set / schema from features:
2202
+ Create a schema (feature set) from features:
2071
2203
 
2072
2204
  >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
2073
- >>> feature_set = ln.FeatureSet(features)
2205
+ >>> schema = ln.Schema(features)
2074
2206
 
2075
- Create a feature set / schema from feature values:
2207
+ Create a schema (feature set) from identifier values:
2076
2208
 
2077
2209
  >>> import bionty as bt
2078
- >>> feature_set = ln.FeatureSet.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()
2079
-
2080
- Link a feature set to an artifact:
2081
-
2082
- >>> artifact.features.add_feature_set(feature_set, slot="var")
2210
+ >>> schema = ln.Schema.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()
2083
2211
 
2084
2212
  """
2085
2213
 
@@ -2087,6 +2215,10 @@ class Schema(Record, CanCurate, TracksRun):
2087
2215
  abstract = False
2088
2216
 
2089
2217
  _name_field: str = "name"
2218
+ _aux_fields: dict[str, tuple[str, type]] = {
2219
+ "0": ("coerce_dtype", bool),
2220
+ "1": ("_index_feature_uid", str),
2221
+ }
2090
2222
 
2091
2223
  id: int = models.AutoField(primary_key=True)
2092
2224
  """Internal id, valid only in one DB instance."""
@@ -2098,89 +2230,116 @@ class Schema(Record, CanCurate, TracksRun):
2098
2230
  """A description."""
2099
2231
  n = IntegerField()
2100
2232
  """Number of features in the set."""
2101
- dtype: str | None = CharField(max_length=64, null=True)
2233
+ dtype: str | None = CharField(max_length=64, null=True, editable=False)
2102
2234
  """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
2103
2235
 
2104
2236
  For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
2105
2237
  """
2106
- # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
2107
- # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
2108
- itype: str | None = CharField(max_length=120, db_index=True, null=True)
2238
+ itype: str | None = CharField(
2239
+ max_length=120, db_index=True, null=True, editable=False
2240
+ )
2109
2241
  """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
2110
2242
 
2111
2243
  Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
2112
2244
 
2113
2245
  .. versionchanged:: 1.0.0
2114
- Was called `itype` before.
2246
+ Was called `registry` before.
2115
2247
  """
2116
- type: Feature | None = ForeignKey(
2117
- "self", PROTECT, null=True, related_name="records"
2118
- )
2119
- """Type of feature set (e.g., 'ExpressionPanel', 'ProteinPanel', 'Multimodal', 'Metadata', 'Embedding').
2248
+ type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
2249
+ """Type of schema.
2120
2250
 
2121
- Allows to group feature sets by type, e.g., all meassurements evaluating gene expression vs. protein expression vs. multi modal.
2251
+ Allows to group schemas by type, e.g., all measurements evaluating gene expression vs. protein expression vs. multi modal.
2252
+
2253
+ You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`.
2254
+
2255
+ Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`.
2122
2256
  """
2123
- records: Feature
2257
+ records: Schema
2124
2258
  """Records of this type."""
2125
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
2259
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
2126
2260
  """Distinguish types from instances of the type."""
2127
2261
  otype: str | None = CharField(max_length=64, db_index=True, null=True)
2128
2262
  """Default Python object type, e.g., DataFrame, AnnData."""
2129
- hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
2263
+ hash: str | None = CharField(
2264
+ max_length=HASH_LENGTH, db_index=True, null=True, editable=False
2265
+ )
2130
2266
  """A hash of the set of feature identifiers.
2131
2267
 
2132
2268
  For a composite schema, the hash of hashes.
2133
2269
  """
2134
- minimal_set: bool = BooleanField(default=True, db_index=True)
2270
+ minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
2135
2271
  """Whether the schema contains a minimal set of linked features (default `True`).
2136
2272
 
2137
2273
  If `False`, no features are linked to this schema.
2138
2274
 
2139
2275
  If `True`, features are linked and considered as a minimally required set in validation.
2140
2276
  """
2141
- ordered_set: bool = BooleanField(default=False, db_index=True)
2142
- """Whether the linked features are ordered (default `False`)."""
2143
- maximal_set: bool = BooleanField(default=False, db_index=True)
2277
+ ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
2278
+ """Whether features are required to be ordered (default `False`)."""
2279
+ maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
2144
2280
  """If `False`, additional features are allowed (default `False`).
2145
2281
 
2146
2282
  If `True`, the minimal set is a maximal set and no additional features are allowed.
2147
2283
  """
2148
- composite: Schema | None = ForeignKey(
2149
- "self", PROTECT, related_name="components", default=None, null=True
2150
- )
2151
- """The composite schema that contains this schema as a component.
2152
-
2153
- The composite schema composes multiple simpler schemas into one object.
2154
-
2155
- For example, an AnnData composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
2156
- """
2157
- slot: str | None = CharField(max_length=100, db_index=True, null=True)
2158
- """The slot in which the schema is stored in the composite schema."""
2159
- validated_by: Schema | None = ForeignKey(
2160
- "self", PROTECT, related_name="validated_schemas", default=None, null=True
2284
+ components: Schema = ManyToManyField(
2285
+ "self", through="SchemaComponent", symmetrical=False, related_name="composites"
2161
2286
  )
2162
- """The schema that validated this schema during curation.
2163
-
2164
- When performing validation, the schema that enforced validation is often less concrete than what is validated.
2165
-
2166
- For instance, the set of measured features might be a superset of the minimally required set of features.
2287
+ """Components of this schema."""
2288
+ composites: Schema
2289
+ """The composite schemas that contain this schema as a component.
2167
2290
 
2168
- Often, the curating schema does not specficy any concrete features at all
2291
+ For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
2169
2292
  """
2170
2293
  features: Feature
2171
2294
  """The features contained in the schema."""
2172
2295
  params: Param
2173
2296
  """The params contained in the schema."""
2174
2297
  artifacts: Artifact
2175
- """The artifacts that observe this schema."""
2298
+ """The artifacts that measure a feature set that matches this schema."""
2299
+ validated_artifacts: Artifact
2300
+ """The artifacts that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
2301
+ projects: Project
2302
+ """Associated projects."""
2176
2303
  _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
2304
+ # lamindb v2
2305
+ # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
2306
+ # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
2307
+ # -- the following two fields are dynamically removed from the API for now
2308
+ validated_by: Schema | None = ForeignKey(
2309
+ "self", PROTECT, related_name="validated_schemas", default=None, null=True
2310
+ )
2311
+ # """The schema that validated this schema during curation.
2312
+
2313
+ # When performing validation, the schema that enforced validation is often less concrete than what is validated.
2314
+
2315
+ # For instance, the set of measured features might be a superset of the minimally required set of features.
2316
+ # """
2317
+ # validated_schemas: Schema
2318
+ # """The schemas that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
2319
+ composite: Schema | None = ForeignKey(
2320
+ "self", PROTECT, related_name="+", default=None, null=True
2321
+ )
2322
+ # The legacy foreign key
2323
+ slot: str | None = CharField(max_length=100, db_index=True, null=True)
2324
+ # The legacy slot
2177
2325
 
2178
2326
  @overload
2179
2327
  def __init__(
2180
2328
  self,
2181
- features: Iterable[Record],
2182
- dtype: str | None = None,
2329
+ features: Iterable[Record] | None = None,
2330
+ components: dict[str, Schema] | None = None,
2183
2331
  name: str | None = None,
2332
+ description: str | None = None,
2333
+ dtype: str | None = None,
2334
+ itype: str | Registry | FieldAttr | None = None,
2335
+ type: Schema | None = None,
2336
+ is_type: bool = False,
2337
+ otype: str | None = None,
2338
+ minimal_set: bool = True,
2339
+ ordered_set: bool = False,
2340
+ maximal_set: bool = False,
2341
+ slot: str | None = None,
2342
+ coerce_dtype: bool = False,
2184
2343
  ): ...
2185
2344
 
2186
2345
  @overload
@@ -2256,6 +2415,58 @@ class Schema(Record, CanCurate, TracksRun):
2256
2415
  """A queryset for the individual records of the set."""
2257
2416
  pass
2258
2417
 
2418
+ @property
2419
+ def coerce_dtype(self) -> bool:
2420
+ """Whether dtypes should be coerced during validation.
2421
+
2422
+ For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
2423
+ """
2424
+ if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
2425
+ return self._aux["af"]["0"]
2426
+ else:
2427
+ return False
2428
+
2429
+ @coerce_dtype.setter
2430
+ def coerce_dtype(self, value: bool) -> None:
2431
+ if self._aux is None:
2432
+ self._aux = {}
2433
+ if "af" not in self._aux:
2434
+ self._aux["af"] = {}
2435
+ self._aux["af"]["0"] = value
2436
+
2437
+ @coerce_dtype.setter
2438
+ def coerce_dtype(self, value: bool) -> None:
2439
+ if self._aux is None:
2440
+ self._aux = {}
2441
+ if "af" not in self._aux:
2442
+ self._aux["af"] = {}
2443
+ self._aux["af"]["0"] = value
2444
+
2445
+ # @property
2446
+ # def index_feature(self) -> None | Feature:
2447
+ # # index_feature: `Record | None = None` A :class:`~lamindb.Feature` to validate the index of a `DataFrame`.
2448
+ # """The uid of the index feature, if `index_feature` was set."""
2449
+ # if self._index_feature_uid is None:
2450
+ # return None
2451
+ # else:
2452
+ # return self.features.get(uid=self._index_feature_uid)
2453
+
2454
+ # @property
2455
+ # def _index_feature_uid(self) -> None | str:
2456
+ # """The uid of the index feature, if `index_feature` was set."""
2457
+ # if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
2458
+ # return self._aux["af"]["1"]
2459
+ # else:
2460
+ # return None
2461
+
2462
+ # @_index_feature_uid.setter
2463
+ # def _index_feature_uid(self, value: str) -> None:
2464
+ # if self._aux is None:
2465
+ # self._aux = {}
2466
+ # if "af" not in self._aux:
2467
+ # self._aux["af"] = {}
2468
+ # self._aux["af"]["1"] = value
2469
+
2259
2470
  @property
2260
2471
  @deprecated("itype")
2261
2472
  def registry(self) -> str:
@@ -2265,8 +2476,23 @@ class Schema(Record, CanCurate, TracksRun):
2265
2476
  def registry(self, value) -> None:
2266
2477
  self.itype = value
2267
2478
 
2479
+ def describe(self, return_str=False) -> None | str:
2480
+ """Describe schema."""
2481
+ message = str(self) + "\ncomponents:"
2482
+ for component in self.components.all():
2483
+ message += "\n " + str(component)
2484
+ if return_str:
2485
+ return message
2486
+ else:
2487
+ print(message)
2488
+ return None
2489
+
2490
+ def _get_component(self, slot: str) -> Schema:
2491
+ return self.components.get(links_component__slot=slot)
2492
+
2268
2493
 
2269
2494
  class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2495
+ # Note that this docstring has to be consistent with Curator.save_artifact()
2270
2496
  """Datasets & models stored as files, folders, or arrays.
2271
2497
 
2272
2498
  Artifacts manage data in local or remote storage.
@@ -2276,10 +2502,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2276
2502
 
2277
2503
  Args:
2278
2504
  data: `UPathStr` A path to a local or remote folder or file.
2279
- type: `Literal["dataset", "model"] | None = None` The artifact type.
2280
- key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
2505
+ kind: `Literal["dataset", "model"] | None = None` Distinguish models from datasets from other files & folders.
2506
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
2281
2507
  description: `str | None = None` A description.
2282
- revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
2508
+ revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
2283
2509
  run: `Run | None = None` The run that creates the artifact.
2284
2510
 
2285
2511
  .. dropdown:: Typical storage formats & their API accessors
@@ -2313,26 +2539,28 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2313
2539
 
2314
2540
  Examples:
2315
2541
 
2316
- Create an artifact from a file path and pass `description`:
2542
+ Create an artifact by passing `key`:
2543
+
2544
+ >>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
2545
+ >>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
2317
2546
 
2318
- >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv", description="My file")
2319
- >>> artifact = ln.Artifact("./my_local_file.jpg", description="My image")
2547
+ Calling `.save()` uploads the file to the default storage location of your lamindb instance.
2548
+ (If it's a local instance, the "upload" is a mere copy operation.)
2320
2549
 
2321
- You can also pass `key` to create a virtual filepath hierarchy:
2550
+ If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:
2322
2551
 
2323
- >>> artifact = ln.Artifact("./my_local_file.jpg", key="example_datasets/dataset1.jpg")
2552
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
2324
2553
 
2325
- What works for files also works for folders:
2554
+ You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`
2326
2555
 
2327
- >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder")
2328
- >>> artifact = ln.Artifact("./my_local_folder", description="My local folder")
2329
- >>> artifact = ln.Artifact("./my_local_folder", key="project1/my_target_folder")
2556
+ >>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
2557
+ >>> artifact_v2.versions.df() # see all versions
2330
2558
 
2331
2559
  .. dropdown:: Why does the API look this way?
2332
2560
 
2333
2561
  It's inspired by APIs building on AWS S3.
2334
2562
 
2335
- Both boto3 and quilt select a bucket (akin to default storage in LaminDB) and define a target path through a `key` argument.
2563
+ Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
2336
2564
 
2337
2565
  In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
2338
2566
 
@@ -2349,16 +2577,18 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2349
2577
  bucket = quilt3.Bucket('mybucket')
2350
2578
  bucket.put_file('hello.txt', '/tmp/hello.txt')
2351
2579
 
2580
+ Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:
2352
2581
 
2353
- Make a new version of an artifact:
2582
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
2583
+ >>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()
2354
2584
 
2355
- >>> artifact = ln.Artifact.from_df(df, key="example_datasets/dataset1.parquet").save()
2356
- >>> artifact_v2 = ln.Artifact(df_updated, key="example_datasets/dataset1.parquet").save()
2585
+ Because you then can't use `key`-based versioning, you have to pass `revises` to make a new artifact version:
2357
2586
 
2358
- Alternatively, if you don't want to provide a value for `key`, you can use `revises`:
2587
+ >>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()
2359
2588
 
2360
- >>> artifact = ln.Artifact.from_df(df, description="My dataframe").save()
2361
- >>> artifact_v2 = ln.Artifact(df_updated, revises=artifact).save()
2589
+ If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
2590
+ the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
2591
+ detects the duplication and will return the existing artifact.
2362
2592
 
2363
2593
  """
2364
2594
 
@@ -2455,9 +2685,11 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2455
2685
  """
2456
2686
  description: str | None = CharField(db_index=True, null=True)
2457
2687
  """A description."""
2458
- storage: Storage = ForeignKey(Storage, PROTECT, related_name="artifacts")
2688
+ storage: Storage = ForeignKey(
2689
+ Storage, PROTECT, related_name="artifacts", editable=False
2690
+ )
2459
2691
  """Storage location, e.g. an S3 or GCP bucket or a local directory."""
2460
- suffix: str = CharField(max_length=30, db_index=True)
2692
+ suffix: str = CharField(max_length=30, db_index=True, editable=False)
2461
2693
  # Initially, we thought about having this be nullable to indicate folders
2462
2694
  # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
2463
2695
  """Path suffix or empty string if no canonical suffix exists.
@@ -2470,19 +2702,27 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2470
2702
  null=True,
2471
2703
  )
2472
2704
  """:class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
2473
- otype: str | None = CharField(max_length=64, db_index=True, null=True)
2705
+ otype: str | None = CharField(
2706
+ max_length=64, db_index=True, null=True, editable=False
2707
+ )
2474
2708
  """Default Python object type, e.g., DataFrame, AnnData."""
2475
- size: int | None = BigIntegerField(null=True, db_index=True, default=None)
2709
+ size: int | None = BigIntegerField(
2710
+ null=True, db_index=True, default=None, editable=False
2711
+ )
2476
2712
  """Size in bytes.
2477
2713
 
2478
2714
  Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
2479
2715
  """
2480
- hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
2716
+ hash: str | None = CharField(
2717
+ max_length=HASH_LENGTH, db_index=True, null=True, unique=True, editable=False
2718
+ )
2481
2719
  """Hash or pseudo-hash of artifact content.
2482
2720
 
2483
2721
  Useful to ascertain integrity and avoid duplication.
2484
2722
  """
2485
- n_files: int | None = BigIntegerField(null=True, db_index=True, default=None)
2723
+ n_files: int | None = BigIntegerField(
2724
+ null=True, db_index=True, default=None, editable=False
2725
+ )
2486
2726
  """Number of files for folder-like artifacts, `None` for file-like artifacts.
2487
2727
 
2488
2728
  Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
@@ -2490,19 +2730,28 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2490
2730
  .. versionchanged:: 1.0
2491
2731
  Renamed from `n_objects` to `n_files`.
2492
2732
  """
2493
- n_observations: int | None = BigIntegerField(null=True, db_index=True, default=None)
2733
+ n_observations: int | None = BigIntegerField(
2734
+ null=True, db_index=True, default=None, editable=False
2735
+ )
2494
2736
  """Number of observations.
2495
2737
 
2496
2738
  Typically, this denotes the first array dimension.
2497
2739
  """
2498
- _hash_type: str | None = CharField(max_length=30, db_index=True, null=True)
2740
+ _hash_type: str | None = CharField(
2741
+ max_length=30, db_index=True, null=True, editable=False
2742
+ )
2499
2743
  """Type of hash."""
2500
2744
  ulabels: ULabel = models.ManyToManyField(
2501
2745
  ULabel, through="ArtifactULabel", related_name="artifacts"
2502
2746
  )
2503
2747
  """The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
2504
2748
  run: Run | None = ForeignKey(
2505
- Run, PROTECT, related_name="output_artifacts", null=True, default=None
2749
+ Run,
2750
+ PROTECT,
2751
+ related_name="output_artifacts",
2752
+ null=True,
2753
+ default=None,
2754
+ editable=False,
2506
2755
  )
2507
2756
  """Run that created the artifact."""
2508
2757
  input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
@@ -2516,13 +2765,17 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2516
2765
  collections: Collection
2517
2766
  """The collections that this artifact is part of."""
2518
2767
  schema: Schema | None = ForeignKey(
2519
- Schema, PROTECT, null=True, default=None, related_name="artifacts"
2768
+ Schema,
2769
+ PROTECT,
2770
+ null=True,
2771
+ default=None,
2772
+ related_name="validated_artifacts",
2520
2773
  )
2521
- """The schema of the artifact (to be populated in lamindb 1.1)."""
2522
- _schemas_m2m: Schema = models.ManyToManyField(
2523
- Schema, related_name="_artifacts_m2m", through="ArtifactSchema"
2774
+ """The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
2775
+ feature_sets: Schema = models.ManyToManyField(
2776
+ Schema, related_name="artifacts", through="ArtifactSchema"
2524
2777
  )
2525
- """[For backward compatibility] The feature sets measured in the artifact."""
2778
+ """The feature sets measured by the artifact."""
2526
2779
  _feature_values: FeatureValue = models.ManyToManyField(
2527
2780
  FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
2528
2781
  )
@@ -2543,6 +2796,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2543
2796
  PROTECT,
2544
2797
  default=current_user_id,
2545
2798
  related_name="created_artifacts",
2799
+ editable=False,
2546
2800
  )
2547
2801
  """Creator of record."""
2548
2802
  _overwrite_versions: bool = BooleanField(default=None)
@@ -2566,7 +2820,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2566
2820
  # here; and we might refactor this but we might also keep that internal
2567
2821
  # usage
2568
2822
  data: UPathStr,
2569
- type: ArtifactKind | None = None,
2823
+ kind: ArtifactKind | None = None,
2570
2824
  key: str | None = None,
2571
2825
  description: str | None = None,
2572
2826
  revises: Artifact | None = None,
@@ -2606,11 +2860,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2606
2860
  def n_objects(self) -> int:
2607
2861
  return self.n_files
2608
2862
 
2609
- @property
2610
- def feature_sets(self) -> QuerySet[Schema]:
2611
- """Feature sets linked to this artifact."""
2612
- return self._schemas_m2m
2613
-
2614
2863
  # add the below because this is what people will have in their code
2615
2864
  # if they implement the recommended migration strategy
2616
2865
  # - FeatureSet -> Schema
@@ -2620,14 +2869,14 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2620
2869
  # def schemas(self) -> QuerySet[Schema]:
2621
2870
  # """Schemas linked to artifact via many-to-many relationship.
2622
2871
 
2623
- # Is now mediating the private `._schemas_m2m` relationship during
2872
+ # Is now mediating the private `.feature_sets` relationship during
2624
2873
  # a transition period to better schema management.
2625
2874
 
2626
2875
  # .. versionchanged: 1.0
2627
2876
  # Was previously called `.feature_sets`.
2628
2877
 
2629
2878
  # """
2630
- # return self._schemas_m2m
2879
+ # return self.feature_sets
2631
2880
 
2632
2881
  @property
2633
2882
  def path(self) -> Path:
@@ -2637,7 +2886,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2637
2886
 
2638
2887
  >>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
2639
2888
  >>> artifact.path
2640
- S3Path('s3://my-bucket/my-file.csv')
2889
+ S3QueryPath('s3://my-bucket/my-file.csv')
2641
2890
 
2642
2891
  File in local storage:
2643
2892
 
@@ -2652,6 +2901,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2652
2901
  def from_df(
2653
2902
  cls,
2654
2903
  df: pd.DataFrame,
2904
+ *,
2655
2905
  key: str | None = None,
2656
2906
  description: str | None = None,
2657
2907
  run: Run | None = None,
@@ -2692,6 +2942,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2692
2942
  def from_anndata(
2693
2943
  cls,
2694
2944
  adata: AnnData | UPathStr,
2945
+ *,
2695
2946
  key: str | None = None,
2696
2947
  description: str | None = None,
2697
2948
  run: Run | None = None,
@@ -2728,6 +2979,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2728
2979
  def from_mudata(
2729
2980
  cls,
2730
2981
  mdata: MuData,
2982
+ *,
2731
2983
  key: str | None = None,
2732
2984
  description: str | None = None,
2733
2985
  run: Run | None = None,
@@ -2760,11 +3012,38 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2760
3012
  pass
2761
3013
 
2762
3014
  @classmethod
2763
- def from_dir(
3015
+ def from_tiledbsoma(
2764
3016
  cls,
2765
3017
  path: UPathStr,
3018
+ *,
2766
3019
  key: str | None = None,
3020
+ description: str | None = None,
3021
+ run: Run | None = None,
3022
+ revises: Artifact | None = None,
3023
+ **kwargs,
3024
+ ) -> Artifact:
3025
+ """Create from a tiledbsoma store.
3026
+
3027
+ Args:
3028
+ path: A tiledbsoma store with .tiledbsoma suffix.
3029
+ key: A relative path within default storage,
3030
+ e.g., `"myfolder/mystore.tiledbsoma"`.
3031
+ description: A description.
3032
+ revises: An old version of the artifact.
3033
+ run: The run that creates the artifact.
3034
+
3035
+ Examples:
3036
+ >>> artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store")
3037
+ >>> artifact.save()
3038
+ """
3039
+ pass
3040
+
3041
+ @classmethod
3042
+ def from_dir(
3043
+ cls,
3044
+ path: UPathStr,
2767
3045
  *,
3046
+ key: str | None = None,
2768
3047
  run: Run | None = None,
2769
3048
  ) -> list[Artifact]:
2770
3049
  """Create a list of artifact objects from a directory.
@@ -2818,12 +3097,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2818
3097
  pass
2819
3098
 
2820
3099
  def open(
2821
- self, mode: str = "r", is_run_input: bool | None = None
3100
+ self, mode: str = "r", is_run_input: bool | None = None, **kwargs
2822
3101
  ) -> (
2823
3102
  AnnDataAccessor
2824
3103
  | BackedAccessor
2825
3104
  | SOMACollection
2826
3105
  | SOMAExperiment
3106
+ | SOMAMeasurement
2827
3107
  | PyArrowDataset
2828
3108
  ):
2829
3109
  """Return a cloud-backed data object.
@@ -2966,13 +3246,13 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
2966
3246
 
2967
3247
  Args:
2968
3248
  artifacts: `list[Artifact]` A list of artifacts.
2969
- name: `str` A name.
3249
+ key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`.
2970
3250
  description: `str | None = None` A description.
2971
3251
  revises: `Collection | None = None` An old version of the collection.
2972
3252
  run: `Run | None = None` The run that creates the collection.
2973
3253
  meta: `Artifact | None = None` An artifact that defines metadata for the collection.
2974
- reference: `str | None = None` For instance, an external ID or a URL.
2975
- reference_type: `str | None = None` For instance, `"url"`.
3254
+ reference: `str | None = None` A simple reference, e.g. an external ID or a URL.
3255
+ reference_type: `str | None = None` A way to indicate the type of the simple reference, e.g., `"url"`.
2976
3256
 
2977
3257
  See Also:
2978
3258
  :class:`~lamindb.Artifact`
@@ -2981,11 +3261,11 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
2981
3261
 
2982
3262
  Create a collection from a list of :class:`~lamindb.Artifact` objects:
2983
3263
 
2984
- >>> collection = ln.Collection([artifact1, artifact2], name="My collection")
3264
+ >>> collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection")
2985
3265
 
2986
3266
  Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):
2987
3267
 
2988
- >>> collection = ln.Collection(data_artifact, name="My collection", meta=metadata_artifact)
3268
+ >>> collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact)
2989
3269
 
2990
3270
  """
2991
3271
 
@@ -3008,13 +3288,15 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3008
3288
  """Universal id, valid across DB instances."""
3009
3289
  key: str = CharField(db_index=True)
3010
3290
  """Name or path-like key."""
3011
- # these here is the only case in which we use a TextField
3291
+ # below is the only case in which we use a TextField
3012
3292
  # for description; we do so because users had descriptions exceeding 255 chars
3013
3293
  # in their instances
3014
3294
  description: str | None = TextField(null=True, db_index=True)
3015
3295
  """A description or title."""
3016
- hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
3017
- """Hash of collection content. 86 base64 chars allow to store 64 bytes, 512 bits."""
3296
+ hash: str | None = CharField(
3297
+ max_length=HASH_LENGTH, db_index=True, null=True, unique=True
3298
+ )
3299
+ """Hash of collection content."""
3018
3300
  reference: str | None = CharField(max_length=255, db_index=True, null=True)
3019
3301
  """A reference like URL or external ID."""
3020
3302
  # also for reference_type here, we allow an extra long max_length
@@ -3058,7 +3340,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3058
3340
  def __init__(
3059
3341
  self,
3060
3342
  artifacts: list[Artifact],
3061
- name: str,
3343
+ key: str,
3062
3344
  description: str | None = None,
3063
3345
  meta: Any | None = None,
3064
3346
  reference: str | None = None,
@@ -3084,21 +3366,39 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3084
3366
  """Add an artifact to the collection.
3085
3367
 
3086
3368
  Creates a new version of the collection.
3369
+ This does not modify the original collection in-place, but returns a new version
3370
+ of the original collection with the added artifact.
3087
3371
 
3088
3372
  Args:
3089
3373
  artifact: An artifact to add to the collection.
3090
3374
  run: The run that creates the new version of the collection.
3091
3375
 
3376
+ Examples:
3377
+ >>> collection = ln.Collection(artifact, key="new collection")
3378
+ >>> collection.save()
3379
+ >>> collection = collection.append(another_artifact) # returns a new version
3380
+ >>> collection.save() # save the new version
3381
+
3092
3382
  .. versionadded:: 0.76.14
3093
3383
  """
3094
3384
  pass
3095
3385
 
3386
+ def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
3387
+ """Return a cloud-backed pyarrow Dataset.
3388
+
3389
+ Works for `pyarrow` compatible formats.
3390
+
3391
+ Notes:
3392
+ For more info, see tutorial: :doc:`/arrays`.
3393
+ """
3394
+ pass
3395
+
3096
3396
  def mapped(
3097
3397
  self,
3098
3398
  layers_keys: str | list[str] | None = None,
3099
3399
  obs_keys: str | list[str] | None = None,
3100
3400
  obsm_keys: str | list[str] | None = None,
3101
- obs_filter: dict[str, str | tuple[str, ...]] | None = None,
3401
+ obs_filter: dict[str, str | list[str]] | None = None,
3102
3402
  join: Literal["inner", "outer"] | None = "inner",
3103
3403
  encode_labels: bool | list[str] = True,
3104
3404
  unknown_label: str | dict[str, str] | None = None,
@@ -3136,7 +3436,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3136
3436
  obsm_keys: Keys from the ``.obsm`` slots.
3137
3437
  obs_filter: Select only observations with these values for the given obs columns.
3138
3438
  Should be a dictionary with obs column names as keys
3139
- and filtering values (a string or a tuple of strings) as values.
3439
+ and filtering values (a string or a list of strings) as values.
3140
3440
  join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
3141
3441
  does not join.
3142
3442
  encode_labels: Encode labels into integers.
@@ -3330,7 +3630,7 @@ class Project(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3330
3630
  """Type of project (e.g., 'Program', 'Project', 'GithubIssue', 'Task')."""
3331
3631
  records: Project
3332
3632
  """Records of this type."""
3333
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
3633
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
3334
3634
  """Distinguish types from instances of the type."""
3335
3635
  abbr: str | None = CharField(max_length=32, db_index=True, null=True)
3336
3636
  """An abbreviation."""
@@ -3434,7 +3734,7 @@ class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3434
3734
  """
3435
3735
  records: Reference
3436
3736
  """Records of this type."""
3437
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
3737
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
3438
3738
  """Distinguish types from instances of the type."""
3439
3739
  url: str | None = URLField(null=True)
3440
3740
  """URL linking to the reference."""
@@ -3476,7 +3776,7 @@ class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3476
3776
  # -------------------------------------------------------------------------------------
3477
3777
  # Data models
3478
3778
 
3479
- from django.contrib.postgres.fields import JSONField
3779
+ from django.contrib.postgres.fields import JSONField # type: ignore
3480
3780
  from django.core.exceptions import ValidationError
3481
3781
  from django.db import models
3482
3782
 
@@ -3543,7 +3843,7 @@ class RunData(BasicRecord, DataMixin):
3543
3843
  class Meta:
3544
3844
  constraints = [
3545
3845
  models.CheckConstraint(
3546
- check=(
3846
+ condition=(
3547
3847
  models.Q(feature__isnull=False, param__isnull=True)
3548
3848
  | models.Q(feature__isnull=True, param__isnull=False)
3549
3849
  ),
@@ -3574,7 +3874,7 @@ class FlexTable(Record, TracksRun, TracksUpdates):
3574
3874
  """Type of tidy table, e.g., `Cell`, `SampleSheet`, etc."""
3575
3875
  records: ULabel
3576
3876
  """Records of this type."""
3577
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
3877
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
3578
3878
  """Distinguish types from instances of the type."""
3579
3879
  description: str = CharField(null=True, db_index=True)
3580
3880
  """A description."""
@@ -3593,7 +3893,7 @@ class FlexTableData(BasicRecord, DataMixin):
3593
3893
  class Meta:
3594
3894
  constraints = [
3595
3895
  models.CheckConstraint(
3596
- check=(
3896
+ condition=(
3597
3897
  models.Q(feature__isnull=False, param__isnull=True)
3598
3898
  | models.Q(feature__isnull=True, param__isnull=False)
3599
3899
  ),
@@ -3621,8 +3921,8 @@ class LinkORM:
3621
3921
 
3622
3922
  class SchemaFeature(BasicRecord, LinkORM):
3623
3923
  id: int = models.BigAutoField(primary_key=True)
3624
- schema: Schema = ForeignKey(Schema, CASCADE, related_name="+")
3625
- feature: Feature = ForeignKey(Feature, PROTECT, related_name="+")
3924
+ schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature")
3925
+ feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema")
3626
3926
 
3627
3927
  class Meta:
3628
3928
  unique_together = ("schema", "feature")
@@ -3640,15 +3940,22 @@ class SchemaParam(BasicRecord, LinkORM):
3640
3940
  class ArtifactSchema(BasicRecord, LinkORM, TracksRun):
3641
3941
  id: int = models.BigAutoField(primary_key=True)
3642
3942
  artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="_links_schema")
3643
- # we follow the lower() case convention rather than snake case for link models
3644
3943
  schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact")
3645
- slot: str | None = CharField(max_length=40, null=True)
3646
- feature_ref_is_semantic: bool | None = BooleanField(
3647
- null=True
3648
- ) # like Feature name or Gene symbol or CellMarker name
3944
+ slot: str | None = CharField(null=True)
3945
+ feature_ref_is_semantic: bool | None = BooleanField(null=True)
3946
+
3947
+ class Meta:
3948
+ unique_together = (("artifact", "schema"), ("artifact", "slot"))
3949
+
3950
+
3951
+ class SchemaComponent(BasicRecord, LinkORM, TracksRun):
3952
+ id: int = models.BigAutoField(primary_key=True)
3953
+ composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_composite")
3954
+ component: Schema = ForeignKey(Schema, PROTECT, related_name="links_component")
3955
+ slot: str | None = CharField(null=True)
3649
3956
 
3650
3957
  class Meta:
3651
- unique_together = ("artifact", "schema")
3958
+ unique_together = (("composite", "component"), ("composite", "slot"))
3652
3959
 
3653
3960
 
3654
3961
  class CollectionArtifact(BasicRecord, LinkORM, TracksRun):
@@ -3883,14 +4190,14 @@ class CollectionReference(BasicRecord, LinkORM, TracksRun):
3883
4190
  unique_together = ("collection", "reference")
3884
4191
 
3885
4192
 
3886
- # class Migration(Record):
3887
- # app = CharField(max_length=255)
3888
- # name = CharField(max_length=255)
3889
- # applied: datetime = DateTimeField()
4193
+ class Migration(BasicRecord):
4194
+ app = CharField(max_length=255)
4195
+ name = CharField(max_length=255)
4196
+ applied: datetime = DateTimeField()
3890
4197
 
3891
- # class Meta:
3892
- # db_table = "django_migrations"
3893
- # managed = False
4198
+ class Meta:
4199
+ db_table = "django_migrations"
4200
+ managed = False
3894
4201
 
3895
4202
 
3896
4203
  # -------------------------------------------------------------------------------------