lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. lamindb/__init__.py +14 -5
  2. lamindb/_artifact.py +174 -57
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +85 -51
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +222 -81
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +59 -17
  10. lamindb/_record.py +171 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +33 -10
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +106 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/validation.py +2 -6
  19. lamindb/core/__init__.py +13 -14
  20. lamindb/core/_context.py +39 -36
  21. lamindb/core/_data.py +29 -25
  22. lamindb/core/_describe.py +1 -1
  23. lamindb/core/_django.py +1 -1
  24. lamindb/core/_feature_manager.py +54 -44
  25. lamindb/core/_label_manager.py +4 -4
  26. lamindb/core/_mapped_collection.py +20 -7
  27. lamindb/core/datasets/__init__.py +6 -1
  28. lamindb/core/datasets/_core.py +12 -11
  29. lamindb/core/datasets/_small.py +66 -20
  30. lamindb/core/exceptions.py +1 -90
  31. lamindb/core/loaders.py +7 -13
  32. lamindb/core/relations.py +6 -4
  33. lamindb/core/storage/_anndata_accessor.py +41 -0
  34. lamindb/core/storage/_backed_access.py +2 -2
  35. lamindb/core/storage/_pyarrow_dataset.py +25 -15
  36. lamindb/core/storage/_tiledbsoma.py +56 -12
  37. lamindb/core/storage/paths.py +41 -22
  38. lamindb/core/subsettings/_creation_settings.py +4 -16
  39. lamindb/curators/__init__.py +2168 -833
  40. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  41. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  42. lamindb/errors.py +96 -0
  43. lamindb/integrations/_vitessce.py +3 -3
  44. lamindb/migrations/0069_squashed.py +76 -75
  45. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  46. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  47. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  48. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  49. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  50. lamindb/migrations/0086_various.py +95 -0
  51. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  52. lamindb/migrations/0088_schema_components.py +273 -0
  53. lamindb/migrations/0088_squashed.py +4372 -0
  54. lamindb/models.py +423 -156
  55. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
  56. lamindb-1.1.0.dist-info/RECORD +95 -0
  57. lamindb/curators/_spatial.py +0 -528
  58. lamindb/migrations/0052_squashed.py +0 -1261
  59. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  60. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  61. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  62. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  63. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  64. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  65. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  66. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  67. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  68. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  69. lamindb/migrations/0063_populate_latest_field.py +0 -45
  70. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  71. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  72. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  73. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  74. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  75. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  76. lamindb-1.0.4.dist-info/RECORD +0 -102
  77. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
  78. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/models.py CHANGED
@@ -65,6 +65,7 @@ if TYPE_CHECKING:
65
65
  from pyarrow.dataset import Dataset as PyArrowDataset
66
66
  from tiledbsoma import Collection as SOMACollection
67
67
  from tiledbsoma import Experiment as SOMAExperiment
68
+ from tiledbsoma import Measurement as SOMAMeasurement
68
69
  from upath import UPath
69
70
 
70
71
  from lamindb.core import LabelManager, MappedCollection, QuerySet, RecordList
@@ -152,9 +153,13 @@ def current_run() -> Run | None:
152
153
  if not _TRACKING_READY:
153
154
  _TRACKING_READY = _check_instance_setup()
154
155
  if _TRACKING_READY:
155
- import lamindb.core
156
+ import lamindb
156
157
 
157
- return lamindb.context.run
158
+ # also see get_run() in core._data
159
+ run = lamindb._tracked.get_current_tracked_run()
160
+ if run is None:
161
+ run = lamindb.context.run
162
+ return run
158
163
  else:
159
164
  return None
160
165
 
@@ -239,6 +244,7 @@ class CanCurate:
239
244
  mute: bool = False,
240
245
  organism: str | Record | None = None,
241
246
  source: Record | None = None,
247
+ strict_source: bool = False,
242
248
  ) -> InspectResult:
243
249
  """Inspect if values are mappable to a field.
244
250
 
@@ -252,6 +258,10 @@ class CanCurate:
252
258
  mute: Whether to mute logging.
253
259
  organism: An Organism name or record.
254
260
  source: A `bionty.Source` record that specifies the version to inspect against.
261
+ strict_source: Determines the validation behavior against records in the registry.
262
+ - If `False`, validation will include all records in the registry, ignoring the specified source.
263
+ - If `True`, validation will only include records in the registry that are linked to the specified source.
264
+ Note: this parameter won't affect validation against bionty/public sources.
255
265
 
256
266
  See Also:
257
267
  :meth:`~lamindb.core.CanCurate.validate`
@@ -278,10 +288,11 @@ class CanCurate:
278
288
  mute: bool = False,
279
289
  organism: str | Record | None = None,
280
290
  source: Record | None = None,
291
+ strict_source: bool = False,
281
292
  ) -> np.ndarray:
282
293
  """Validate values against existing values of a string field.
283
294
 
284
- Note this is strict validation, only asserts exact matches.
295
+ Note this is strict validation, only asserts exact matches.
285
296
 
286
297
  Args:
287
298
  values: Values that will be validated against the field.
@@ -291,6 +302,10 @@ class CanCurate:
291
302
  mute: Whether to mute logging.
292
303
  organism: An Organism name or record.
293
304
  source: A `bionty.Source` record that specifies the version to validate against.
305
+ strict_source: Determines the validation behavior against records in the registry.
306
+ - If `False`, validation will include all records in the registry, ignoring the specified source.
307
+ - If `True`, validation will only include records in the registry that are linked to the specified source.
308
+ Note: this parameter won't affect validation against bionty/public sources.
294
309
 
295
310
  Returns:
296
311
  A vector of booleans indicating if an element is validated.
@@ -370,6 +385,7 @@ class CanCurate:
370
385
  synonyms_field: str = "synonyms",
371
386
  organism: str | Record | None = None,
372
387
  source: Record | None = None,
388
+ strict_source: bool = False,
373
389
  ) -> list[str] | dict[str, str]:
374
390
  """Maps input synonyms to standardized names.
375
391
 
@@ -392,6 +408,10 @@ class CanCurate:
392
408
  synonyms_field: A field containing the concatenated synonyms.
393
409
  organism: An Organism name or record.
394
410
  source: A `bionty.Source` record that specifies the version to validate against.
411
+ strict_source: Determines the validation behavior against records in the registry.
412
+ - If `False`, validation will include all records in the registry, ignoring the specified source.
413
+ - If `True`, validation will only include records in the registry that are linked to the specified source.
414
+ Note: this parameter won't affect validation against bionty/public sources.
395
415
 
396
416
  Returns:
397
417
  If `return_mapper` is `False`: a list of standardized names. Otherwise,
@@ -1187,7 +1207,7 @@ class Transform(Record, IsVersioned):
1187
1207
 
1188
1208
  Create a transform for a pipeline:
1189
1209
 
1190
- >>> transform = ln.Transform(name="Cell Ranger", version="7.2.0", type="pipeline").save()
1210
+ >>> transform = ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
1191
1211
 
1192
1212
  Create a transform from a notebook:
1193
1213
 
@@ -1230,7 +1250,11 @@ class Transform(Record, IsVersioned):
1230
1250
  .. versionchanged:: 0.75
1231
1251
  The `source_code` field is no longer an artifact, but a text field.
1232
1252
  """
1233
- hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
1253
+ # we have a unique constraint here but not on artifact because on artifact, we haven't yet
1254
+ # settled how we model the same artifact in different storage locations
1255
+ hash: str | None = CharField(
1256
+ max_length=HASH_LENGTH, db_index=True, null=True, unique=True
1257
+ )
1234
1258
  """Hash of the source code."""
1235
1259
  reference: str | None = CharField(max_length=255, db_index=True, null=True)
1236
1260
  """Reference for the transform, e.g., a URL."""
@@ -1340,7 +1364,7 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
1340
1364
  _name_field: str = "name"
1341
1365
 
1342
1366
  name: str = CharField(max_length=100, db_index=True)
1343
- dtype: str = CharField(max_length=64, db_index=True)
1367
+ dtype: str | None = CharField(db_index=True, null=True)
1344
1368
  """Data type ("num", "cat", "int", "float", "bool", "datetime").
1345
1369
 
1346
1370
  For categorical types, can define from which registry values are
@@ -1353,7 +1377,7 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
1353
1377
  """
1354
1378
  records: Param
1355
1379
  """Records of this type."""
1356
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
1380
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
1357
1381
  """Distinguish types from instances of the type."""
1358
1382
  _expect_many: bool = models.BooleanField(default=False, db_default=False)
1359
1383
  """Indicates whether values for this param are expected to occur a single or multiple times for an artifact/run (default `False`).
@@ -1369,6 +1393,28 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
1369
1393
  values: ParamValue
1370
1394
  """Values for this parameter."""
1371
1395
 
1396
+ def __init__(self, *args, **kwargs):
1397
+ from ._feature import process_init_feature_param
1398
+ from .errors import ValidationError
1399
+
1400
+ if len(args) == len(self._meta.concrete_fields):
1401
+ super().__init__(*args, **kwargs)
1402
+ return None
1403
+
1404
+ dtype = kwargs.get("dtype", None)
1405
+ kwargs = process_init_feature_param(args, kwargs, is_param=True)
1406
+ super().__init__(*args, **kwargs)
1407
+ dtype_str = kwargs.pop("dtype", None)
1408
+ if not self._state.adding:
1409
+ if not (
1410
+ self.dtype.startswith("cat")
1411
+ if dtype == "cat"
1412
+ else self.dtype == dtype_str
1413
+ ):
1414
+ raise ValidationError(
1415
+ f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
1416
+ )
1417
+
1372
1418
 
1373
1419
  # FeatureValue behaves in many ways like a link in a LinkORM
1374
1420
  # in particular, we don't want a _public field on it
@@ -1460,8 +1506,8 @@ class Run(Record):
1460
1506
 
1461
1507
  Create a run record:
1462
1508
 
1463
- >>> ln.Transform(name="Cell Ranger", version="7.2.0", type="pipeline").save()
1464
- >>> transform = ln.Transform.get(name="Cell Ranger", version="7.2.0")
1509
+ >>> ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
1510
+ >>> transform = ln.Transform.get(key="Cell Ranger", version="7.2.0")
1465
1511
  >>> run = ln.Run(transform)
1466
1512
 
1467
1513
  Create a global run context for a custom transform:
@@ -1679,7 +1725,7 @@ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
1679
1725
  )
1680
1726
  """A universal random id, valid across DB instances."""
1681
1727
  name: str = CharField(max_length=150, db_index=True)
1682
- """Name or title of ulabel (`unique=True`)."""
1728
+ """Name or title of ulabel."""
1683
1729
  type: ULabel | None = ForeignKey("self", PROTECT, null=True, related_name="records")
1684
1730
  """Type of ulabel, e.g., `"donor"`, `"split"`, etc.
1685
1731
 
@@ -1687,7 +1733,7 @@ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
1687
1733
  """
1688
1734
  records: ULabel
1689
1735
  """Records of this type."""
1690
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
1736
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
1691
1737
  """Distinguish types from instances of the type.
1692
1738
 
1693
1739
  For example, a ulabel "Project" would be a type, and the actual projects "Project 1", "Project 2", would be records of that `type`.
@@ -1727,6 +1773,8 @@ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
1727
1773
  def __init__(
1728
1774
  self,
1729
1775
  name: str,
1776
+ type: ULabel | None = None,
1777
+ is_type: bool = False,
1730
1778
  description: str | None = None,
1731
1779
  reference: str | None = None,
1732
1780
  reference_type: str | None = None,
@@ -1765,12 +1813,15 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1765
1813
 
1766
1814
  Args:
1767
1815
  name: `str` Name of the feature, typically the column name.
1768
- dtype: `FeatureDtype | Registry | list[Registry]` See :class:`~lamindb.base.types.FeatureDtype`.
1816
+ dtype: `FeatureDtype | Registry | list[Registry] | FieldAttr` See :class:`~lamindb.base.types.FeatureDtype`.
1769
1817
  For categorical types, can define from which registry values are
1770
1818
  sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`.
1771
1819
  unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc.
1772
1820
  description: `str | None = None` A description.
1773
1821
  synonyms: `str | None = None` Bar-separated synonyms.
1822
+ nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`.
1823
+ default_value: `Any | None = None` Default value for the feature.
1824
+ cat_filters: `dict[str, str] | None = None` Subset a registry by additional filters to define valid categories.
1774
1825
 
1775
1826
  Note:
1776
1827
 
@@ -1835,6 +1886,10 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1835
1886
  abstract = False
1836
1887
 
1837
1888
  _name_field: str = "name"
1889
+ _aux_fields: dict[str, tuple[str, type]] = {
1890
+ "0": ("default_value", bool),
1891
+ "1": ("nullable", bool),
1892
+ }
1838
1893
 
1839
1894
  id: int = models.AutoField(primary_key=True)
1840
1895
  """Internal id, valid only in one DB instance."""
@@ -1843,8 +1898,8 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1843
1898
  )
1844
1899
  """Universal id, valid across DB instances."""
1845
1900
  name: str = CharField(max_length=150, db_index=True, unique=True)
1846
- """Name of feature (`unique=True`)."""
1847
- dtype: FeatureDtype = CharField(db_index=True)
1901
+ """Name of feature (hard unique constraint `unique=True`)."""
1902
+ dtype: FeatureDtype | None = CharField(db_index=True, null=True)
1848
1903
  """Data type (:class:`~lamindb.base.types.FeatureDtype`).
1849
1904
 
1850
1905
  For categorical types, can define from which registry values are
@@ -1860,7 +1915,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1860
1915
  """
1861
1916
  records: Feature
1862
1917
  """Records of this type."""
1863
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
1918
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
1864
1919
  """Distinguish types from instances of the type."""
1865
1920
  unit: str | None = CharField(max_length=30, db_index=True, null=True)
1866
1921
  """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional)."""
@@ -1922,10 +1977,15 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1922
1977
  def __init__(
1923
1978
  self,
1924
1979
  name: str,
1925
- dtype: FeatureDtype | Registry | list[Registry],
1926
- unit: str | None,
1927
- description: str | None,
1928
- synonyms: str | None,
1980
+ dtype: FeatureDtype | Registry | list[Registry] | FieldAttr,
1981
+ type: Feature | None = None,
1982
+ is_type: bool = False,
1983
+ unit: str | None = None,
1984
+ description: str | None = None,
1985
+ synonyms: str | None = None,
1986
+ nullable: bool = True,
1987
+ default_value: str | None = None,
1988
+ cat_filters: dict[str, str] | None = None,
1929
1989
  ): ...
1930
1990
 
1931
1991
  @overload
@@ -1950,6 +2010,58 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
1950
2010
  """Save."""
1951
2011
  pass
1952
2012
 
2013
+ @property
2014
+ def default_value(self) -> Any:
2015
+ """A default value that overwrites missing values (default `None`).
2016
+
2017
+ This takes effect when you call `Curator.standardize()`.
2018
+ """
2019
+ if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
2020
+ return self._aux["af"]["0"]
2021
+ else:
2022
+ return None
2023
+
2024
+ @default_value.setter
2025
+ def default_value(self, value: bool) -> None:
2026
+ if self._aux is None:
2027
+ self._aux = {}
2028
+ if "af" not in self._aux:
2029
+ self._aux["af"] = {}
2030
+ self._aux["af"]["0"] = value
2031
+
2032
+ @property
2033
+ def nullable(self) -> bool:
2034
+ """Indicates whether the feature can have nullable values (default `True`).
2035
+
2036
+ Example::
2037
+
2038
+ import lamindb as ln
2039
+ import pandas as pd
2040
+
2041
+ disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save()
2042
+ schema = ln.Schema(features=[disease]).save()
2043
+ dataset = {"disease": pd.Categorical([pd.NA, "asthma"])}
2044
+ df = pd.DataFrame(dataset)
2045
+ curator = ln.curators.DataFrameCurator(df, schema)
2046
+ try:
2047
+ curator.validate()
2048
+ except ln.errors.ValidationError as e:
2049
+ assert str(e).startswith("non-nullable series 'disease' contains null values")
2050
+
2051
+ """
2052
+ if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
2053
+ return self._aux["af"]["1"]
2054
+ else:
2055
+ return True
2056
+
2057
+ @nullable.setter
2058
+ def nullable(self, value: bool) -> None:
2059
+ if self._aux is None:
2060
+ self._aux = {}
2061
+ if "af" not in self._aux:
2062
+ self._aux["af"] = {}
2063
+ self._aux["af"]["1"] = value
2064
+
1953
2065
 
1954
2066
  class FeatureValue(Record, TracksRun):
1955
2067
  """Non-categorical features values.
@@ -2000,9 +2112,10 @@ class FeatureValue(Record, TracksRun):
2000
2112
  # Simple types: int, float, str, bool
2001
2113
  if isinstance(value, (int, float, str, bool)):
2002
2114
  try:
2003
- return cls.objects.create(
2004
- feature=feature, value=value, hash=None
2005
- ), False
2115
+ return (
2116
+ cls.objects.create(feature=feature, value=value, hash=None),
2117
+ False,
2118
+ )
2006
2119
  except IntegrityError:
2007
2120
  return cls.objects.get(feature=feature, value=value), True
2008
2121
 
@@ -2010,15 +2123,16 @@ class FeatureValue(Record, TracksRun):
2010
2123
  else:
2011
2124
  hash = hash_dict(value)
2012
2125
  try:
2013
- return cls.objects.create(
2014
- feature=feature, value=value, hash=hash
2015
- ), False
2126
+ return (
2127
+ cls.objects.create(feature=feature, value=value, hash=hash),
2128
+ False,
2129
+ )
2016
2130
  except IntegrityError:
2017
2131
  return cls.objects.get(feature=feature, hash=hash), True
2018
2132
 
2019
2133
 
2020
2134
  class Schema(Record, CanCurate, TracksRun):
2021
- """Feature sets (dataset schemas).
2135
+ """Schemas / feature sets.
2022
2136
 
2023
2137
  Stores references to dataset schemas: these are the sets of columns in a dataset
2024
2138
  that correspond to :class:`~lamindb.Feature`, :class:`~bionty.Gene`, :class:`~bionty.Protein` or other
@@ -2036,23 +2150,37 @@ class Schema(Record, CanCurate, TracksRun):
2036
2150
  These reasons do not hold for label sets. Hence, LaminDB does not model label sets.
2037
2151
 
2038
2152
  Args:
2039
- features: `Iterable[Record]` An iterable of :class:`~lamindb.Feature`
2153
+ features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
2040
2154
  records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
2041
2155
  a set upon instantiation. If you'd like to pass values, use
2042
2156
  :meth:`~lamindb.Schema.from_values` or
2043
2157
  :meth:`~lamindb.Schema.from_df`.
2158
+ components: `dict[str, Schema] | None = None` A dictionary mapping component names to
2159
+ their corresponding :class:`~lamindb.Schema` objects for composite schemas.
2160
+ name: `str | None = None` A name.
2161
+ description: `str | None = None` A description.
2044
2162
  dtype: `str | None = None` The simple type. Defaults to
2045
2163
  `None` for sets of :class:`~lamindb.Feature` records.
2046
2164
  Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
2047
- name: `str | None = None` A name.
2165
+ itype: `str | None = None` The schema identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
2166
+ type: `Schema | None = None` A type.
2167
+ is_type: `bool = False` Distinguish types from instances of the type.
2168
+ otype: `str | None = None` An object type to define the structure of a composite schema.
2169
+ minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
2170
+ ordered_set: `bool = False` Whether features are required to be ordered.
2171
+ maximal_set: `bool = False` If `True`, no additional features are allowed.
2172
+ slot: `str | None = None` The slot name when this schema is used as a component in a
2173
+ composite schema.
2174
+ coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
2175
+ during validation, see :attr:`~lamindb.Schema.coerce_dtype`.
2048
2176
 
2049
2177
  Note:
2050
2178
 
2051
- A feature set can be identified by the `hash` its feature uids.
2179
+ A feature set can be identified by the `hash` of its feature uids.
2052
2180
  It's stored in the `.hash` field.
2053
2181
 
2054
- A `slot` provides a string key to access feature sets.
2055
- It's typically the accessor within the registered data object, here `pd.DataFrame.columns`.
2182
+ A `slot` provides a string key to access feature sets. For instance, for the schema of an
2183
+ `AnnData` object, it would be `'obs'` for `adata.obs`.
2056
2184
 
2057
2185
  See Also:
2058
2186
  :meth:`~lamindb.Schema.from_values`
@@ -2062,24 +2190,20 @@ class Schema(Record, CanCurate, TracksRun):
2062
2190
 
2063
2191
  Examples:
2064
2192
 
2065
- Create a feature set / schema from df with types:
2193
+ Create a schema (feature set) from df with types:
2066
2194
 
2067
2195
  >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
2068
- >>> feature_set = ln.FeatureSet.from_df(df)
2196
+ >>> schema = ln.Schema.from_df(df)
2069
2197
 
2070
- Create a feature set / schema from features:
2198
+ Create a schema (feature set) from features:
2071
2199
 
2072
2200
  >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
2073
- >>> feature_set = ln.FeatureSet(features)
2201
+ >>> schema = ln.Schema(features)
2074
2202
 
2075
- Create a feature set / schema from feature values:
2203
+ Create a schema (feature set) from identifier values:
2076
2204
 
2077
2205
  >>> import bionty as bt
2078
- >>> feature_set = ln.FeatureSet.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()
2079
-
2080
- Link a feature set to an artifact:
2081
-
2082
- >>> artifact.features.add_feature_set(feature_set, slot="var")
2206
+ >>> schema = ln.Schema.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()
2083
2207
 
2084
2208
  """
2085
2209
 
@@ -2087,6 +2211,7 @@ class Schema(Record, CanCurate, TracksRun):
2087
2211
  abstract = False
2088
2212
 
2089
2213
  _name_field: str = "name"
2214
+ _aux_fields: dict[str, tuple[str, type]] = {"0": ("coerce_dtype", bool)}
2090
2215
 
2091
2216
  id: int = models.AutoField(primary_key=True)
2092
2217
  """Internal id, valid only in one DB instance."""
@@ -2098,89 +2223,116 @@ class Schema(Record, CanCurate, TracksRun):
2098
2223
  """A description."""
2099
2224
  n = IntegerField()
2100
2225
  """Number of features in the set."""
2101
- dtype: str | None = CharField(max_length=64, null=True)
2226
+ dtype: str | None = CharField(max_length=64, null=True, editable=False)
2102
2227
  """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.
2103
2228
 
2104
2229
  For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
2105
2230
  """
2106
- # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
2107
- # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
2108
- itype: str | None = CharField(max_length=120, db_index=True, null=True)
2231
+ itype: str | None = CharField(
2232
+ max_length=120, db_index=True, null=True, editable=False
2233
+ )
2109
2234
  """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.
2110
2235
 
2111
2236
  Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.
2112
2237
 
2113
2238
  .. versionchanged:: 1.0.0
2114
- Was called `itype` before.
2239
+ Was called `registry` before.
2115
2240
  """
2116
- type: Feature | None = ForeignKey(
2117
- "self", PROTECT, null=True, related_name="records"
2118
- )
2119
- """Type of feature set (e.g., 'ExpressionPanel', 'ProteinPanel', 'Multimodal', 'Metadata', 'Embedding').
2241
+ type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
2242
+ """Type of schema.
2243
+
2244
+ Allows to group schemas by type, e.g., all measurements evaluating gene expression vs. protein expression vs. multi modal.
2120
2245
 
2121
- Allows to group feature sets by type, e.g., all measurements evaluating gene expression vs. protein expression vs. multi modal.
2246
+ You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`.
2247
+
2248
+ Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`.
2122
2249
  """
2123
- records: Feature
2250
+ records: Schema
2124
2251
  """Records of this type."""
2125
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
2252
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
2126
2253
  """Distinguish types from instances of the type."""
2127
2254
  otype: str | None = CharField(max_length=64, db_index=True, null=True)
2128
2255
  """Default Python object type, e.g., DataFrame, AnnData."""
2129
- hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
2256
+ hash: str | None = CharField(
2257
+ max_length=HASH_LENGTH, db_index=True, null=True, editable=False
2258
+ )
2130
2259
  """A hash of the set of feature identifiers.
2131
2260
 
2132
2261
  For a composite schema, the hash of hashes.
2133
2262
  """
2134
- minimal_set: bool = BooleanField(default=True, db_index=True)
2263
+ minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
2135
2264
  """Whether the schema contains a minimal set of linked features (default `True`).
2136
2265
 
2137
2266
  If `False`, no features are linked to this schema.
2138
2267
 
2139
2268
  If `True`, features are linked and considered as a minimally required set in validation.
2140
2269
  """
2141
- ordered_set: bool = BooleanField(default=False, db_index=True)
2142
- """Whether the linked features are ordered (default `False`)."""
2143
- maximal_set: bool = BooleanField(default=False, db_index=True)
2270
+ ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
2271
+ """Whether features are required to be ordered (default `False`)."""
2272
+ maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
2144
2273
  """If `False`, additional features are allowed (default `False`).
2145
2274
 
2146
2275
  If `True`, the minimal set is a maximal set and no additional features are allowed.
2147
2276
  """
2148
- composite: Schema | None = ForeignKey(
2149
- "self", PROTECT, related_name="components", default=None, null=True
2150
- )
2151
- """The composite schema that contains this schema as a component.
2152
-
2153
- The composite schema composes multiple simpler schemas into one object.
2154
-
2155
- For example, an AnnData composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
2156
- """
2157
- slot: str | None = CharField(max_length=100, db_index=True, null=True)
2158
- """The slot in which the schema is stored in the composite schema."""
2159
- validated_by: Schema | None = ForeignKey(
2160
- "self", PROTECT, related_name="validated_schemas", default=None, null=True
2277
+ components: Schema = ManyToManyField(
2278
+ "self", through="SchemaComponent", symmetrical=False, related_name="composites"
2161
2279
  )
2162
- """The schema that validated this schema during curation.
2280
+ """Components of this schema."""
2281
+ composites: Schema
2282
+ """The composite schemas that contains this schema as a component.
2163
2283
 
2164
- When performing validation, the schema that enforced validation is often less concrete than what is validated.
2165
-
2166
- For instance, the set of measured features might be a superset of the minimally required set of features.
2167
-
2168
- Often, the curating schema does not specify any concrete features at all
2284
+ For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
2169
2285
  """
2170
2286
  features: Feature
2171
2287
  """The features contained in the schema."""
2172
2288
  params: Param
2173
2289
  """The params contained in the schema."""
2174
2290
  artifacts: Artifact
2175
- """The artifacts that observe this schema."""
2291
+ """The artifacts that measure a feature set that matches this schema."""
2292
+ validated_artifacts: Artifact
2293
+ """The artifacts that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
2294
+ projects: Project
2295
+ """Associated projects."""
2176
2296
  _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
2297
+ # lamindb v2
2298
+ # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
2299
+ # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
2300
+ # -- the following two fields are dynamically removed from the API for now
2301
+ validated_by: Schema | None = ForeignKey(
2302
+ "self", PROTECT, related_name="validated_schemas", default=None, null=True
2303
+ )
2304
+ # """The schema that validated this schema during curation.
2305
+
2306
+ # When performing validation, the schema that enforced validation is often less concrete than what is validated.
2307
+
2308
+ # For instance, the set of measured features might be a superset of the minimally required set of features.
2309
+ # """
2310
+ # validated_schemas: Schema
2311
+ # """The schemas that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
2312
+ composite: Schema | None = ForeignKey(
2313
+ "self", PROTECT, related_name="+", default=None, null=True
2314
+ )
2315
+ # The legacy foreign key
2316
+ slot: str | None = CharField(max_length=100, db_index=True, null=True)
2317
+ # The legacy slot
2177
2318
 
2178
2319
  @overload
2179
2320
  def __init__(
2180
2321
  self,
2181
- features: Iterable[Record],
2182
- dtype: str | None = None,
2322
+ features: Iterable[Record] | None = None,
2323
+ components: dict[str, Schema] | None = None,
2183
2324
  name: str | None = None,
2325
+ description: str | None = None,
2326
+ dtype: str | None = None,
2327
+ itype: str | Registry | FieldAttr | None = None,
2328
+ type: Schema | None = None,
2329
+ is_type: bool = False,
2330
+ otype: str | None = None,
2331
+ minimal_set: bool = True,
2332
+ ordered_set: bool = False,
2333
+ maximal_set: bool = False,
2334
+ slot: str | None = None,
2335
+ coerce_dtype: bool = False,
2184
2336
  ): ...
2185
2337
 
2186
2338
  @overload
@@ -2256,6 +2408,25 @@ class Schema(Record, CanCurate, TracksRun):
2256
2408
  """A queryset for the individual records of the set."""
2257
2409
  pass
2258
2410
 
2411
+ @property
2412
+ def coerce_dtype(self) -> bool:
2413
+ """Whether dtypes should be coerced during validation.
2414
+
2415
+ For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
2416
+ """
2417
+ if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
2418
+ return self._aux["af"]["0"]
2419
+ else:
2420
+ return False
2421
+
2422
+ @coerce_dtype.setter
2423
+ def coerce_dtype(self, value: bool) -> None:
2424
+ if self._aux is None:
2425
+ self._aux = {}
2426
+ if "af" not in self._aux:
2427
+ self._aux["af"] = {}
2428
+ self._aux["af"]["0"] = value
2429
+
2259
2430
  @property
2260
2431
  @deprecated("itype")
2261
2432
  def registry(self) -> str:
@@ -2265,8 +2436,23 @@ class Schema(Record, CanCurate, TracksRun):
2265
2436
  def registry(self, value) -> None:
2266
2437
  self.itype = value
2267
2438
 
2439
+ def describe(self, return_str=False) -> None | str:
2440
+ """Describe schema."""
2441
+ message = str(self) + "\ncomponents:"
2442
+ for component in self.components.all():
2443
+ message += "\n " + str(component)
2444
+ if return_str:
2445
+ return message
2446
+ else:
2447
+ print(message)
2448
+ return None
2449
+
2450
+ def _get_component(self, slot: str) -> Schema:
2451
+ return self.components.get(links_component__slot=slot)
2452
+
2268
2453
 
2269
2454
  class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2455
+ # Note that this docstring has to be consistent with Curator.save_artifact()
2270
2456
  """Datasets & models stored as files, folders, or arrays.
2271
2457
 
2272
2458
  Artifacts manage data in local or remote storage.
@@ -2276,10 +2462,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2276
2462
 
2277
2463
  Args:
2278
2464
  data: `UPathStr` A path to a local or remote folder or file.
2279
- type: `Literal["dataset", "model"] | None = None` The artifact type.
2280
- key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a revision family.
2465
+ kind: `Literal["dataset", "model"] | None = None` Distinguish models from datasets from other files & folders.
2466
+ key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
2281
2467
  description: `str | None = None` A description.
2282
- revises: `Artifact | None = None` Previous version of the artifact. Triggers a revision.
2468
+ revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
2283
2469
  run: `Run | None = None` The run that creates the artifact.
2284
2470
 
2285
2471
  .. dropdown:: Typical storage formats & their API accessors
@@ -2313,26 +2499,28 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2313
2499
 
2314
2500
  Examples:
2315
2501
 
2316
- Create an artifact from a file path and pass `description`:
2502
+ Create an artifact by passing `key`:
2503
+
2504
+ >>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
2505
+ >>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()
2317
2506
 
2318
- >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv", description="My file")
2319
- >>> artifact = ln.Artifact("./my_local_file.jpg", description="My image")
2507
+ Calling `.save()` uploads the file to the default storage location of your lamindb instance.
2508
+ (If it's a local instance, the "upload" is a mere copy operation.)
2320
2509
 
2321
- You can also pass `key` to create a virtual filepath hierarchy:
2510
+ If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:
2322
2511
 
2323
- >>> artifact = ln.Artifact("./my_local_file.jpg", key="example_datasets/dataset1.jpg")
2512
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()
2324
2513
 
2325
- What works for files also works for folders:
2514
+ You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`
2326
2515
 
2327
- >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder")
2328
- >>> artifact = ln.Artifact("./my_local_folder", description="My local folder")
2329
- >>> artifact = ln.Artifact("./my_local_folder", key="project1/my_target_folder")
2516
+ >>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
2517
+ >>> artifact_v2.versions.df() # see all versions
2330
2518
 
2331
2519
  .. dropdown:: Why does the API look this way?
2332
2520
 
2333
2521
  It's inspired by APIs building on AWS S3.
2334
2522
 
2335
- Both boto3 and quilt select a bucket (akin to default storage in LaminDB) and define a target path through a `key` argument.
2523
+ Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.
2336
2524
 
2337
2525
  In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::
2338
2526
 
@@ -2349,16 +2537,18 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2349
2537
  bucket = quilt3.Bucket('mybucket')
2350
2538
  bucket.put_file('hello.txt', '/tmp/hello.txt')
2351
2539
 
2540
+ Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:
2352
2541
 
2353
- Make a new version of an artifact:
2542
+ >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
2543
+ >>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()
2354
2544
 
2355
- >>> artifact = ln.Artifact.from_df(df, key="example_datasets/dataset1.parquet").save()
2356
- >>> artifact_v2 = ln.Artifact(df_updated, key="example_datasets/dataset1.parquet").save()
2545
+ Because you can then not use `key`-based versioning you have to pass `revises` to make a new artifact version:
2357
2546
 
2358
- Alternatively, if you don't want to provide a value for `key`, you can use `revises`:
2547
+ >>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()
2359
2548
 
2360
- >>> artifact = ln.Artifact.from_df(df, description="My dataframe").save()
2361
- >>> artifact_v2 = ln.Artifact(df_updated, revises=artifact).save()
2549
+ If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
2550
+ the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
2551
+ detects the duplication and will return the existing artifact.
2362
2552
 
2363
2553
  """
2364
2554
 
@@ -2455,9 +2645,11 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2455
2645
  """
2456
2646
  description: str | None = CharField(db_index=True, null=True)
2457
2647
  """A description."""
2458
- storage: Storage = ForeignKey(Storage, PROTECT, related_name="artifacts")
2648
+ storage: Storage = ForeignKey(
2649
+ Storage, PROTECT, related_name="artifacts", editable=False
2650
+ )
2459
2651
  """Storage location, e.g. an S3 or GCP bucket or a local directory."""
2460
- suffix: str = CharField(max_length=30, db_index=True)
2652
+ suffix: str = CharField(max_length=30, db_index=True, editable=False)
2461
2653
  # Initially, we thought about having this be nullable to indicate folders
2462
2654
  # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
2463
2655
  """Path suffix or empty string if no canonical suffix exists.
@@ -2470,19 +2662,27 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2470
2662
  null=True,
2471
2663
  )
2472
2664
  """:class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
2473
- otype: str | None = CharField(max_length=64, db_index=True, null=True)
2665
+ otype: str | None = CharField(
2666
+ max_length=64, db_index=True, null=True, editable=False
2667
+ )
2474
2668
  """Default Python object type, e.g., DataFrame, AnnData."""
2475
- size: int | None = BigIntegerField(null=True, db_index=True, default=None)
2669
+ size: int | None = BigIntegerField(
2670
+ null=True, db_index=True, default=None, editable=False
2671
+ )
2476
2672
  """Size in bytes.
2477
2673
 
2478
2674
  Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
2479
2675
  """
2480
- hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
2676
+ hash: str | None = CharField(
2677
+ max_length=HASH_LENGTH, db_index=True, null=True, unique=True, editable=False
2678
+ )
2481
2679
  """Hash or pseudo-hash of artifact content.
2482
2680
 
2483
2681
  Useful to ascertain integrity and avoid duplication.
2484
2682
  """
2485
- n_files: int | None = BigIntegerField(null=True, db_index=True, default=None)
2683
+ n_files: int | None = BigIntegerField(
2684
+ null=True, db_index=True, default=None, editable=False
2685
+ )
2486
2686
  """Number of files for folder-like artifacts, `None` for file-like artifacts.
2487
2687
 
2488
2688
  Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
@@ -2490,19 +2690,28 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2490
2690
  .. versionchanged:: 1.0
2491
2691
  Renamed from `n_objects` to `n_files`.
2492
2692
  """
2493
- n_observations: int | None = BigIntegerField(null=True, db_index=True, default=None)
2693
+ n_observations: int | None = BigIntegerField(
2694
+ null=True, db_index=True, default=None, editable=False
2695
+ )
2494
2696
  """Number of observations.
2495
2697
 
2496
2698
  Typically, this denotes the first array dimension.
2497
2699
  """
2498
- _hash_type: str | None = CharField(max_length=30, db_index=True, null=True)
2700
+ _hash_type: str | None = CharField(
2701
+ max_length=30, db_index=True, null=True, editable=False
2702
+ )
2499
2703
  """Type of hash."""
2500
2704
  ulabels: ULabel = models.ManyToManyField(
2501
2705
  ULabel, through="ArtifactULabel", related_name="artifacts"
2502
2706
  )
2503
2707
  """The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
2504
2708
  run: Run | None = ForeignKey(
2505
- Run, PROTECT, related_name="output_artifacts", null=True, default=None
2709
+ Run,
2710
+ PROTECT,
2711
+ related_name="output_artifacts",
2712
+ null=True,
2713
+ default=None,
2714
+ editable=False,
2506
2715
  )
2507
2716
  """Run that created the artifact."""
2508
2717
  input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
@@ -2516,13 +2725,17 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2516
2725
  collections: Collection
2517
2726
  """The collections that this artifact is part of."""
2518
2727
  schema: Schema | None = ForeignKey(
2519
- Schema, PROTECT, null=True, default=None, related_name="artifacts"
2728
+ Schema,
2729
+ PROTECT,
2730
+ null=True,
2731
+ default=None,
2732
+ related_name="validated_artifacts",
2520
2733
  )
2521
- """The schema of the artifact (to be populated in lamindb 1.1)."""
2522
- _schemas_m2m: Schema = models.ManyToManyField(
2523
- Schema, related_name="_artifacts_m2m", through="ArtifactSchema"
2734
+ """The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
2735
+ feature_sets: Schema = models.ManyToManyField(
2736
+ Schema, related_name="artifacts", through="ArtifactSchema"
2524
2737
  )
2525
- """[For backward compatibility] The feature sets measured in the artifact."""
2738
+ """The feature sets measured by the artifact."""
2526
2739
  _feature_values: FeatureValue = models.ManyToManyField(
2527
2740
  FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
2528
2741
  )
@@ -2543,6 +2756,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2543
2756
  PROTECT,
2544
2757
  default=current_user_id,
2545
2758
  related_name="created_artifacts",
2759
+ editable=False,
2546
2760
  )
2547
2761
  """Creator of record."""
2548
2762
  _overwrite_versions: bool = BooleanField(default=None)
@@ -2566,7 +2780,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2566
2780
  # here; and we might refactor this but we might also keep that internal
2567
2781
  # usage
2568
2782
  data: UPathStr,
2569
- type: ArtifactKind | None = None,
2783
+ kind: ArtifactKind | None = None,
2570
2784
  key: str | None = None,
2571
2785
  description: str | None = None,
2572
2786
  revises: Artifact | None = None,
@@ -2606,11 +2820,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2606
2820
  def n_objects(self) -> int:
2607
2821
  return self.n_files
2608
2822
 
2609
- @property
2610
- def feature_sets(self) -> QuerySet[Schema]:
2611
- """Feature sets linked to this artifact."""
2612
- return self._schemas_m2m
2613
-
2614
2823
  # add the below because this is what people will have in their code
2615
2824
  # if they implement the recommended migration strategy
2616
2825
  # - FeatureSet -> Schema
@@ -2620,14 +2829,14 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2620
2829
  # def schemas(self) -> QuerySet[Schema]:
2621
2830
  # """Schemas linked to artifact via many-to-many relationship.
2622
2831
 
2623
- # Is now mediating the private `._schemas_m2m` relationship during
2832
+ # Is now mediating the private `.feature_sets` relationship during
2624
2833
  # a transition period to better schema management.
2625
2834
 
2626
2835
  # .. versionchanged: 1.0
2627
2836
  # Was previously called `.feature_sets`.
2628
2837
 
2629
2838
  # """
2630
- # return self._schemas_m2m
2839
+ # return self.feature_sets
2631
2840
 
2632
2841
  @property
2633
2842
  def path(self) -> Path:
@@ -2637,7 +2846,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2637
2846
 
2638
2847
  >>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
2639
2848
  >>> artifact.path
2640
- S3Path('s3://my-bucket/my-file.csv')
2849
+ S3QueryPath('s3://my-bucket/my-file.csv')
2641
2850
 
2642
2851
  File in local storage:
2643
2852
 
@@ -2652,6 +2861,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2652
2861
  def from_df(
2653
2862
  cls,
2654
2863
  df: pd.DataFrame,
2864
+ *,
2655
2865
  key: str | None = None,
2656
2866
  description: str | None = None,
2657
2867
  run: Run | None = None,
@@ -2692,6 +2902,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2692
2902
  def from_anndata(
2693
2903
  cls,
2694
2904
  adata: AnnData | UPathStr,
2905
+ *,
2695
2906
  key: str | None = None,
2696
2907
  description: str | None = None,
2697
2908
  run: Run | None = None,
@@ -2728,6 +2939,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2728
2939
  def from_mudata(
2729
2940
  cls,
2730
2941
  mdata: MuData,
2942
+ *,
2731
2943
  key: str | None = None,
2732
2944
  description: str | None = None,
2733
2945
  run: Run | None = None,
@@ -2760,11 +2972,38 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2760
2972
  pass
2761
2973
 
2762
2974
  @classmethod
2763
- def from_dir(
2975
+ def from_tiledbsoma(
2764
2976
  cls,
2765
2977
  path: UPathStr,
2978
+ *,
2766
2979
  key: str | None = None,
2980
+ description: str | None = None,
2981
+ run: Run | None = None,
2982
+ revises: Artifact | None = None,
2983
+ **kwargs,
2984
+ ) -> Artifact:
2985
+ """Create from a tiledbsoma store.
2986
+
2987
+ Args:
2988
+ path: A tiledbsoma store with .tiledbsoma suffix.
2989
+ key: A relative path within default storage,
2990
+ e.g., `"myfolder/mystore.tiledbsoma"`.
2991
+ description: A description.
2992
+ revises: An old version of the artifact.
2993
+ run: The run that creates the artifact.
2994
+
2995
+ Examples:
2996
+ >>> artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store")
2997
+ >>> artifact.save()
2998
+ """
2999
+ pass
3000
+
3001
+ @classmethod
3002
+ def from_dir(
3003
+ cls,
3004
+ path: UPathStr,
2767
3005
  *,
3006
+ key: str | None = None,
2768
3007
  run: Run | None = None,
2769
3008
  ) -> list[Artifact]:
2770
3009
  """Create a list of artifact objects from a directory.
@@ -2791,7 +3030,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2791
3030
 
2792
3031
  def replace(
2793
3032
  self,
2794
- data: UPathStr,
3033
+ data: UPathStr | pd.DataFrame | AnnData | MuData,
2795
3034
  run: Run | None = None,
2796
3035
  format: str | None = None,
2797
3036
  ) -> None:
@@ -2824,6 +3063,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
2824
3063
  | BackedAccessor
2825
3064
  | SOMACollection
2826
3065
  | SOMAExperiment
3066
+ | SOMAMeasurement
2827
3067
  | PyArrowDataset
2828
3068
  ):
2829
3069
  """Return a cloud-backed data object.
@@ -2966,13 +3206,13 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
2966
3206
 
2967
3207
  Args:
2968
3208
  artifacts: `list[Artifact]` A list of artifacts.
2969
- name: `str` A name.
3209
+ key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`.
2970
3210
  description: `str | None = None` A description.
2971
3211
  revises: `Collection | None = None` An old version of the collection.
2972
3212
  run: `Run | None = None` The run that creates the collection.
2973
3213
  meta: `Artifact | None = None` An artifact that defines metadata for the collection.
2974
- reference: `str | None = None` For instance, an external ID or a URL.
2975
- reference_type: `str | None = None` For instance, `"url"`.
3214
+ reference: `str | None = None` A simple reference, e.g. an external ID or a URL.
3215
+ reference_type: `str | None = None` A way to indicate to indicate the type of the simple reference `"url"`.
2976
3216
 
2977
3217
  See Also:
2978
3218
  :class:`~lamindb.Artifact`
@@ -2981,11 +3221,11 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
2981
3221
 
2982
3222
  Create a collection from a list of :class:`~lamindb.Artifact` objects:
2983
3223
 
2984
- >>> collection = ln.Collection([artifact1, artifact2], name="My collection")
3224
+ >>> collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection")
2985
3225
 
2986
3226
  Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):
2987
3227
 
2988
- >>> collection = ln.Collection(data_artifact, name="My collection", meta=metadata_artifact)
3228
+ >>> collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact)
2989
3229
 
2990
3230
  """
2991
3231
 
@@ -3008,13 +3248,15 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3008
3248
  """Universal id, valid across DB instances."""
3009
3249
  key: str = CharField(db_index=True)
3010
3250
  """Name or path-like key."""
3011
- # these here is the only case in which we use a TextField
3251
+ # below is the only case in which we use a TextField
3012
3252
  # for description; we do so because users had descriptions exceeding 255 chars
3013
3253
  # in their instances
3014
3254
  description: str | None = TextField(null=True, db_index=True)
3015
3255
  """A description or title."""
3016
- hash: str | None = CharField(max_length=HASH_LENGTH, db_index=True, null=True)
3017
- """Hash of collection content. 86 base64 chars allow to store 64 bytes, 512 bits."""
3256
+ hash: str | None = CharField(
3257
+ max_length=HASH_LENGTH, db_index=True, null=True, unique=True
3258
+ )
3259
+ """Hash of collection content."""
3018
3260
  reference: str | None = CharField(max_length=255, db_index=True, null=True)
3019
3261
  """A reference like URL or external ID."""
3020
3262
  # also for reference_type here, we allow an extra long max_length
@@ -3058,7 +3300,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3058
3300
  def __init__(
3059
3301
  self,
3060
3302
  artifacts: list[Artifact],
3061
- name: str,
3303
+ key: str,
3062
3304
  description: str | None = None,
3063
3305
  meta: Any | None = None,
3064
3306
  reference: str | None = None,
@@ -3084,21 +3326,39 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3084
3326
  """Add an artifact to the collection.
3085
3327
 
3086
3328
  Creates a new version of the collection.
3329
+ This does not modify the original collection in-place, but returns a new version
3330
+ of the original collection with the added artifact.
3087
3331
 
3088
3332
  Args:
3089
3333
  artifact: An artifact to add to the collection.
3090
3334
  run: The run that creates the new version of the collection.
3091
3335
 
3336
+ Examples:
3337
+ >>> collection = ln.Collection(artifact, key="new collection")
3338
+ >>> collecton.save()
3339
+ >>> collection = collection.append(another_artifact) # returns a new version
3340
+ >>> collection.save() # save the new version
3341
+
3092
3342
  .. versionadded:: 0.76.14
3093
3343
  """
3094
3344
  pass
3095
3345
 
3346
+ def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
3347
+ """Return a cloud-backed pyarrow Dataset.
3348
+
3349
+ Works for `pyarrow` compatible formats.
3350
+
3351
+ Notes:
3352
+ For more info, see tutorial: :doc:`/arrays`.
3353
+ """
3354
+ pass
3355
+
3096
3356
  def mapped(
3097
3357
  self,
3098
3358
  layers_keys: str | list[str] | None = None,
3099
3359
  obs_keys: str | list[str] | None = None,
3100
3360
  obsm_keys: str | list[str] | None = None,
3101
- obs_filter: dict[str, str | tuple[str, ...]] | None = None,
3361
+ obs_filter: dict[str, str | list[str]] | None = None,
3102
3362
  join: Literal["inner", "outer"] | None = "inner",
3103
3363
  encode_labels: bool | list[str] = True,
3104
3364
  unknown_label: str | dict[str, str] | None = None,
@@ -3136,7 +3396,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
3136
3396
  obsm_keys: Keys from the ``.obsm`` slots.
3137
3397
  obs_filter: Select only observations with these values for the given obs columns.
3138
3398
  Should be a dictionary with obs column names as keys
3139
- and filtering values (a string or a tuple of strings) as values.
3399
+ and filtering values (a string or a list of strings) as values.
3140
3400
  join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
3141
3401
  does not join.
3142
3402
  encode_labels: Encode labels into integers.
@@ -3330,7 +3590,7 @@ class Project(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3330
3590
  """Type of project (e.g., 'Program', 'Project', 'GithubIssue', 'Task')."""
3331
3591
  records: Project
3332
3592
  """Records of this type."""
3333
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
3593
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
3334
3594
  """Distinguish types from instances of the type."""
3335
3595
  abbr: str | None = CharField(max_length=32, db_index=True, null=True)
3336
3596
  """An abbreviation."""
@@ -3434,7 +3694,7 @@ class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3434
3694
  """
3435
3695
  records: Reference
3436
3696
  """Records of this type."""
3437
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
3697
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
3438
3698
  """Distinguish types from instances of the type."""
3439
3699
  url: str | None = URLField(null=True)
3440
3700
  """URL linking to the reference."""
@@ -3476,7 +3736,7 @@ class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
3476
3736
  # -------------------------------------------------------------------------------------
3477
3737
  # Data models
3478
3738
 
3479
- from django.contrib.postgres.fields import JSONField
3739
+ from django.contrib.postgres.fields import JSONField # type: ignore
3480
3740
  from django.core.exceptions import ValidationError
3481
3741
  from django.db import models
3482
3742
 
@@ -3543,7 +3803,7 @@ class RunData(BasicRecord, DataMixin):
3543
3803
  class Meta:
3544
3804
  constraints = [
3545
3805
  models.CheckConstraint(
3546
- check=(
3806
+ condition=(
3547
3807
  models.Q(feature__isnull=False, param__isnull=True)
3548
3808
  | models.Q(feature__isnull=True, param__isnull=False)
3549
3809
  ),
@@ -3574,7 +3834,7 @@ class FlexTable(Record, TracksRun, TracksUpdates):
3574
3834
  """Type of tidy table, e.g., `Cell`, `SampleSheet`, etc."""
3575
3835
  records: ULabel
3576
3836
  """Records of this type."""
3577
- is_type: bool = BooleanField(default=None, db_index=True, null=True)
3837
+ is_type: bool = BooleanField(default=False, db_index=True, null=True)
3578
3838
  """Distinguish types from instances of the type."""
3579
3839
  description: str = CharField(null=True, db_index=True)
3580
3840
  """A description."""
@@ -3593,7 +3853,7 @@ class FlexTableData(BasicRecord, DataMixin):
3593
3853
  class Meta:
3594
3854
  constraints = [
3595
3855
  models.CheckConstraint(
3596
- check=(
3856
+ condition=(
3597
3857
  models.Q(feature__isnull=False, param__isnull=True)
3598
3858
  | models.Q(feature__isnull=True, param__isnull=False)
3599
3859
  ),
@@ -3621,8 +3881,8 @@ class LinkORM:
3621
3881
 
3622
3882
  class SchemaFeature(BasicRecord, LinkORM):
3623
3883
  id: int = models.BigAutoField(primary_key=True)
3624
- schema: Schema = ForeignKey(Schema, CASCADE, related_name="+")
3625
- feature: Feature = ForeignKey(Feature, PROTECT, related_name="+")
3884
+ schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature")
3885
+ feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema")
3626
3886
 
3627
3887
  class Meta:
3628
3888
  unique_together = ("schema", "feature")
@@ -3640,15 +3900,22 @@ class SchemaParam(BasicRecord, LinkORM):
3640
3900
  class ArtifactSchema(BasicRecord, LinkORM, TracksRun):
3641
3901
  id: int = models.BigAutoField(primary_key=True)
3642
3902
  artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="_links_schema")
3643
- # we follow the lower() case convention rather than snake case for link models
3644
3903
  schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact")
3645
- slot: str | None = CharField(max_length=40, null=True)
3646
- feature_ref_is_semantic: bool | None = BooleanField(
3647
- null=True
3648
- ) # like Feature name or Gene symbol or CellMarker name
3904
+ slot: str | None = CharField(null=True)
3905
+ feature_ref_is_semantic: bool | None = BooleanField(null=True)
3649
3906
 
3650
3907
  class Meta:
3651
- unique_together = ("artifact", "schema")
3908
+ unique_together = (("artifact", "schema"), ("artifact", "slot"))
3909
+
3910
+
3911
+ class SchemaComponent(BasicRecord, LinkORM, TracksRun):
3912
+ id: int = models.BigAutoField(primary_key=True)
3913
+ composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_composite")
3914
+ component: Schema = ForeignKey(Schema, PROTECT, related_name="links_component")
3915
+ slot: str | None = CharField(null=True)
3916
+
3917
+ class Meta:
3918
+ unique_together = (("composite", "component"), ("composite", "slot"))
3652
3919
 
3653
3920
 
3654
3921
  class CollectionArtifact(BasicRecord, LinkORM, TracksRun):
@@ -3883,14 +4150,14 @@ class CollectionReference(BasicRecord, LinkORM, TracksRun):
3883
4150
  unique_together = ("collection", "reference")
3884
4151
 
3885
4152
 
3886
- # class Migration(Record):
3887
- # app = CharField(max_length=255)
3888
- # name = CharField(max_length=255)
3889
- # applied: datetime = DateTimeField()
4153
+ class Migration(BasicRecord):
4154
+ app = CharField(max_length=255)
4155
+ name = CharField(max_length=255)
4156
+ applied: datetime = DateTimeField()
3890
4157
 
3891
- # class Meta:
3892
- # db_table = "django_migrations"
3893
- # managed = False
4158
+ class Meta:
4159
+ db_table = "django_migrations"
4160
+ managed = False
3894
4161
 
3895
4162
 
3896
4163
  # -------------------------------------------------------------------------------------