lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- lamindb/__init__.py +14 -5
- lamindb/_artifact.py +174 -57
- lamindb/_can_curate.py +27 -8
- lamindb/_collection.py +85 -51
- lamindb/_feature.py +177 -41
- lamindb/_finish.py +222 -81
- lamindb/_from_values.py +83 -98
- lamindb/_parents.py +4 -4
- lamindb/_query_set.py +59 -17
- lamindb/_record.py +171 -53
- lamindb/_run.py +4 -4
- lamindb/_save.py +33 -10
- lamindb/_schema.py +135 -38
- lamindb/_storage.py +1 -1
- lamindb/_tracked.py +106 -0
- lamindb/_transform.py +21 -8
- lamindb/_ulabel.py +5 -14
- lamindb/base/validation.py +2 -6
- lamindb/core/__init__.py +13 -14
- lamindb/core/_context.py +39 -36
- lamindb/core/_data.py +29 -25
- lamindb/core/_describe.py +1 -1
- lamindb/core/_django.py +1 -1
- lamindb/core/_feature_manager.py +54 -44
- lamindb/core/_label_manager.py +4 -4
- lamindb/core/_mapped_collection.py +20 -7
- lamindb/core/datasets/__init__.py +6 -1
- lamindb/core/datasets/_core.py +12 -11
- lamindb/core/datasets/_small.py +66 -20
- lamindb/core/exceptions.py +1 -90
- lamindb/core/loaders.py +7 -13
- lamindb/core/relations.py +6 -4
- lamindb/core/storage/_anndata_accessor.py +41 -0
- lamindb/core/storage/_backed_access.py +2 -2
- lamindb/core/storage/_pyarrow_dataset.py +25 -15
- lamindb/core/storage/_tiledbsoma.py +56 -12
- lamindb/core/storage/paths.py +41 -22
- lamindb/core/subsettings/_creation_settings.py +4 -16
- lamindb/curators/__init__.py +2168 -833
- lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
- lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
- lamindb/errors.py +96 -0
- lamindb/integrations/_vitessce.py +3 -3
- lamindb/migrations/0069_squashed.py +76 -75
- lamindb/migrations/0075_lamindbv1_part5.py +4 -5
- lamindb/migrations/0082_alter_feature_dtype.py +21 -0
- lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
- lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
- lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
- lamindb/migrations/0086_various.py +95 -0
- lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
- lamindb/migrations/0088_schema_components.py +273 -0
- lamindb/migrations/0088_squashed.py +4372 -0
- lamindb/models.py +423 -156
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
- lamindb-1.1.0.dist-info/RECORD +95 -0
- lamindb/curators/_spatial.py +0 -528
- lamindb/migrations/0052_squashed.py +0 -1261
- lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
- lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
- lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
- lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
- lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
- lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
- lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
- lamindb/migrations/0060_alter_artifact__actions.py +0 -22
- lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
- lamindb/migrations/0062_add_is_latest_field.py +0 -32
- lamindb/migrations/0063_populate_latest_field.py +0 -45
- lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
- lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
- lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
- lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
- lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
- lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
- lamindb-1.0.4.dist-info/RECORD +0 -102
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
- {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/models.py
CHANGED
@@ -65,6 +65,7 @@ if TYPE_CHECKING:
     from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement
     from upath import UPath

     from lamindb.core import LabelManager, MappedCollection, QuerySet, RecordList
@@ -152,9 +153,13 @@ def current_run() -> Run | None:
     if not _TRACKING_READY:
         _TRACKING_READY = _check_instance_setup()
     if _TRACKING_READY:
-        import lamindb
+        import lamindb

-
+        # also see get_run() in core._data
+        run = lamindb._tracked.get_current_tracked_run()
+        if run is None:
+            run = lamindb.context.run
+        return run
     else:
         return None

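The hunk above makes `current_run()` look for a function-level tracked run (via the new `lamindb/_tracked.py` module, +106 lines in this release) before falling back to the global `ln.context.run`. A minimal sketch of the resulting behavior; the `ln.tracked` decorator name is an assumption based on the new module and may differ from the actual public API:

```python
import lamindb as ln

ln.track()  # sets the global run context, ln.context.run


@ln.tracked()  # assumed public wrapper around lamindb._tracked
def subset_dataframe(input_key: str, output_key: str) -> None:
    # inside this call, current_run() resolves to the function-level run
    artifact = ln.Artifact.get(key=input_key)
    df = artifact.load().iloc[:10]
    ln.Artifact.from_df(df, key=output_key).save()


subset_dataframe("datasets/full.parquet", "datasets/subset.parquet")
# outside the tracked function, current_run() falls back to ln.context.run
```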
@@ -239,6 +244,7 @@ class CanCurate:
         mute: bool = False,
         organism: str | Record | None = None,
         source: Record | None = None,
+        strict_source: bool = False,
     ) -> InspectResult:
         """Inspect if values are mappable to a field.

@@ -252,6 +258,10 @@ class CanCurate:
             mute: Whether to mute logging.
             organism: An Organism name or record.
             source: A `bionty.Source` record that specifies the version to inspect against.
+            strict_source: Determines the validation behavior against records in the registry.
+                - If `False`, validation will include all records in the registry, ignoring the specified source.
+                - If `True`, validation will only include records in the registry that are linked to the specified source.
+                Note: this parameter won't affect validation against bionty/public sources.

         See Also:
             :meth:`~lamindb.core.CanCurate.validate`
@@ -278,10 +288,11 @@ class CanCurate:
         mute: bool = False,
         organism: str | Record | None = None,
         source: Record | None = None,
+        strict_source: bool = False,
     ) -> np.ndarray:
         """Validate values against existing values of a string field.

-        Note this is
+        Note this is strict_source validation, only asserts exact matches.

         Args:
             values: Values that will be validated against the field.
@@ -291,6 +302,10 @@ class CanCurate:
             mute: Whether to mute logging.
             organism: An Organism name or record.
             source: A `bionty.Source` record that specifies the version to validate against.
+            strict_source: Determines the validation behavior against records in the registry.
+                - If `False`, validation will include all records in the registry, ignoring the specified source.
+                - If `True`, validation will only include records in the registry that are linked to the specified source.
+                Note: this parameter won't affect validation against bionty/public sources.

         Returns:
             A vector of booleans indicating if an element is validated.
@@ -370,6 +385,7 @@ class CanCurate:
         synonyms_field: str = "synonyms",
         organism: str | Record | None = None,
         source: Record | None = None,
+        strict_source: bool = False,
     ) -> list[str] | dict[str, str]:
         """Maps input synonyms to standardized names.

@@ -392,6 +408,10 @@ class CanCurate:
             synonyms_field: A field containing the concatenated synonyms.
             organism: An Organism name or record.
             source: A `bionty.Source` record that specifies the version to validate against.
+            strict_source: Determines the validation behavior against records in the registry.
+                - If `False`, validation will include all records in the registry, ignoring the specified source.
+                - If `True`, validation will only include records in the registry that are linked to the specified source.
+                Note: this parameter won't affect validation against bionty/public sources.

         Returns:
             If `return_mapper` is `False`: a list of standardized names. Otherwise,
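All three `CanCurate` methods gain the same `strict_source` flag. A sketch of the difference, assuming `bionty` is installed and a `bt.Source` record has been configured for `CellType`:

```python
import bionty as bt

source = bt.Source.filter(entity="bionty.CellType").first()  # assumed to exist

# default behavior: any matching record in the registry validates,
# regardless of which source it was imported from
bt.CellType.validate(["T cell", "B cell"], source=source)

# strict_source=True: only registry records linked to `source` validate;
# validation against the public bionty source itself is unaffected
bt.CellType.validate(["T cell", "B cell"], source=source, strict_source=True)
```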
@@ -1187,7 +1207,7 @@ class Transform(Record, IsVersioned):

     Create a transform for a pipeline:

-    >>> transform = ln.Transform(
+    >>> transform = ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()

     Create a transform from a notebook:

@@ -1230,7 +1250,11 @@ class Transform(Record, IsVersioned):
     .. versionchanged:: 0.75
        The `source_code` field is no longer an artifact, but a text field.
     """
-
+    # we have a unique constraint here but not on artifact because on artifact, we haven't yet
+    # settled how we model the same artifact in different storage locations
+    hash: str | None = CharField(
+        max_length=HASH_LENGTH, db_index=True, null=True, unique=True
+    )
     """Hash of the source code."""
     reference: str | None = CharField(max_length=255, db_index=True, null=True)
     """Reference for the transform, e.g., a URL."""
@@ -1340,7 +1364,7 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
     _name_field: str = "name"

     name: str = CharField(max_length=100, db_index=True)
-    dtype: str = CharField(
+    dtype: str | None = CharField(db_index=True, null=True)
     """Data type ("num", "cat", "int", "float", "bool", "datetime").

     For categorical types, can define from which registry values are
@@ -1353,7 +1377,7 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
     """
     records: Param
     """Records of this type."""
-    is_type: bool = BooleanField(default=
+    is_type: bool = BooleanField(default=False, db_index=True, null=True)
     """Distinguish types from instances of the type."""
     _expect_many: bool = models.BooleanField(default=False, db_default=False)
     """Indicates whether values for this param are expected to occur a single or multiple times for an artifact/run (default `False`).
@@ -1369,6 +1393,28 @@ class Param(Record, CanCurate, TracksRun, TracksUpdates):
     values: ParamValue
     """Values for this parameter."""

+    def __init__(self, *args, **kwargs):
+        from ._feature import process_init_feature_param
+        from .errors import ValidationError
+
+        if len(args) == len(self._meta.concrete_fields):
+            super().__init__(*args, **kwargs)
+            return None
+
+        dtype = kwargs.get("dtype", None)
+        kwargs = process_init_feature_param(args, kwargs, is_param=True)
+        super().__init__(*args, **kwargs)
+        dtype_str = kwargs.pop("dtype", None)
+        if not self._state.adding:
+            if not (
+                self.dtype.startswith("cat")
+                if dtype == "cat"
+                else self.dtype == dtype_str
+            ):
+                raise ValidationError(
+                    f"Feature {self.name} already exists with dtype {self.dtype}, you passed {dtype_str}"
+                )
+

     # FeatureValue behaves in many ways like a link in a LinkORM
     # in particular, we don't want a _public field on it
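The new `Param.__init__` mirrors `Feature`: it routes keyword arguments through `process_init_feature_param()` and rejects a dtype that conflicts with an already-registered param of the same name. A sketch:

```python
import lamindb as ln

ln.Param(name="learning_rate", dtype="float").save()

try:
    # returns the existing record and detects the conflicting dtype
    ln.Param(name="learning_rate", dtype="int")
except ln.errors.ValidationError as e:
    print(e)  # "... already exists with dtype float, you passed int"
```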
@@ -1460,8 +1506,8 @@ class Run(Record):

     Create a run record:

-    >>> ln.Transform(
-    >>> transform = ln.Transform.get(
+    >>> ln.Transform(key="Cell Ranger", version="7.2.0", type="pipeline").save()
+    >>> transform = ln.Transform.get(key="Cell Ranger", version="7.2.0")
     >>> run = ln.Run(transform)

     Create a global run context for a custom transform:
@@ -1679,7 +1725,7 @@ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
     )
     """A universal random id, valid across DB instances."""
     name: str = CharField(max_length=150, db_index=True)
-    """Name or title of ulabel
+    """Name or title of ulabel."""
     type: ULabel | None = ForeignKey("self", PROTECT, null=True, related_name="records")
     """Type of ulabel, e.g., `"donor"`, `"split"`, etc.

@@ -1687,7 +1733,7 @@ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
     """
     records: ULabel
     """Records of this type."""
-    is_type: bool = BooleanField(default=
+    is_type: bool = BooleanField(default=False, db_index=True, null=True)
     """Distinguish types from instances of the type.

     For example, a ulabel "Project" would be a type, and the actual projects "Project 1", "Project 2", would be records of that `type`.
@@ -1727,6 +1773,8 @@ class ULabel(Record, HasParents, CanCurate, TracksRun, TracksUpdates):
     def __init__(
         self,
         name: str,
+        type: ULabel | None = None,
+        is_type: bool = False,
         description: str | None = None,
         reference: str | None = None,
         reference_type: str | None = None,
@@ -1765,12 +1813,15 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):

     Args:
         name: `str` Name of the feature, typically. column name.
-        dtype: `FeatureDtype | Registry | list[Registry]` See :class:`~lamindb.base.types.FeatureDtype`.
+        dtype: `FeatureDtype | Registry | list[Registry] | FieldAttr` See :class:`~lamindb.base.types.FeatureDtype`.
            For categorical types, can define from which registry values are
            sampled, e.g., `ULabel` or `[ULabel, bionty.CellType]`.
        unit: `str | None = None` Unit of measure, ideally SI (`"m"`, `"s"`, `"kg"`, etc.) or `"normalized"` etc.
        description: `str | None = None` A description.
        synonyms: `str | None = None` Bar-separated synonyms.
+        nullable: `bool = True` Whether the feature can have null-like values (`None`, `pd.NA`, `NaN`, etc.), see :attr:`~lamindb.Feature.nullable`.
+        default_value: `Any | None = None` Default value for the feature.
+        cat_filters: `dict[str, str] | None = None` Subset a registry by additional filters to define valid categories.

     Note:

@@ -1835,6 +1886,10 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
         abstract = False

     _name_field: str = "name"
+    _aux_fields: dict[str, tuple[str, type]] = {
+        "0": ("default_value", bool),
+        "1": ("nullable", bool),
+    }

     id: int = models.AutoField(primary_key=True)
     """Internal id, valid only in one DB instance."""
@@ -1843,8 +1898,8 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
     )
     """Universal id, valid across DB instances."""
     name: str = CharField(max_length=150, db_index=True, unique=True)
-    """Name of feature (`unique=True`)."""
-    dtype: FeatureDtype = CharField(db_index=True)
+    """Name of feature (hard unique constraint `unique=True`)."""
+    dtype: FeatureDtype | None = CharField(db_index=True, null=True)
     """Data type (:class:`~lamindb.base.types.FeatureDtype`).

     For categorical types, can define from which registry values are
@@ -1860,7 +1915,7 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
     """
     records: Feature
     """Records of this type."""
-    is_type: bool = BooleanField(default=
+    is_type: bool = BooleanField(default=False, db_index=True, null=True)
     """Distinguish types from instances of the type."""
     unit: str | None = CharField(max_length=30, db_index=True, null=True)
     """Unit of measure, ideally SI (`m`, `s`, `kg`, etc.) or 'normalized' etc. (optional)."""
@@ -1922,10 +1977,15 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
     def __init__(
         self,
         name: str,
-        dtype: FeatureDtype | Registry | list[Registry],
-
-
-
+        dtype: FeatureDtype | Registry | list[Registry] | FieldAttr,
+        type: Feature | None = None,
+        is_type: bool = False,
+        unit: str | None = None,
+        description: str | None = None,
+        synonyms: str | None = None,
+        nullable: bool = True,
+        default_value: str | None = None,
+        cat_filters: dict[str, str] | None = None,
     ): ...

     @overload
@@ -1950,6 +2010,58 @@ class Feature(Record, CanCurate, TracksRun, TracksUpdates):
         """Save."""
         pass

+    @property
+    def default_value(self) -> Any:
+        """A default value that overwrites missing values (default `None`).
+
+        This takes effect when you call `Curator.standardize()`.
+        """
+        if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
+            return self._aux["af"]["0"]
+        else:
+            return None
+
+    @default_value.setter
+    def default_value(self, value: bool) -> None:
+        if self._aux is None:
+            self._aux = {}
+        if "af" not in self._aux:
+            self._aux["af"] = {}
+        self._aux["af"]["0"] = value
+
+    @property
+    def nullable(self) -> bool:
+        """Indicates whether the feature can have nullable values (default `True`).
+
+        Example::
+
+            import lamindb as ln
+            import pandas as pd
+
+            disease = ln.Feature(name="disease", dtype=ln.ULabel, nullable=False).save()
+            schema = ln.Schema(features=[disease]).save()
+            dataset = {"disease": pd.Categorical([pd.NA, "asthma"])}
+            df = pd.DataFrame(dataset)
+            curator = ln.curators.DataFrameCurator(df, schema)
+            try:
+                curator.validate()
+            except ln.errors.ValidationError as e:
+                assert str(e).startswith("non-nullable series 'disease' contains null values")
+
+        """
+        if self._aux is not None and "af" in self._aux and "1" in self._aux["af"]:
+            return self._aux["af"]["1"]
+        else:
+            return True
+
+    @nullable.setter
+    def nullable(self, value: bool) -> None:
+        if self._aux is None:
+            self._aux = {}
+        if "af" not in self._aux:
+            self._aux["af"] = {}
+        self._aux["af"]["1"] = value
+

 class FeatureValue(Record, TracksRun):
     """Non-categorical features values.
@@ -2000,9 +2112,10 @@ class FeatureValue(Record, TracksRun):
         # Simple types: int, float, str, bool
         if isinstance(value, (int, float, str, bool)):
             try:
-                return
-                feature=feature, value=value, hash=None
-
+                return (
+                    cls.objects.create(feature=feature, value=value, hash=None),
+                    False,
+                )
             except IntegrityError:
                 return cls.objects.get(feature=feature, value=value), True

@@ -2010,15 +2123,16 @@ class FeatureValue(Record, TracksRun):
         else:
             hash = hash_dict(value)
             try:
-                return
-                feature=feature, value=value, hash=hash
-
+                return (
+                    cls.objects.create(feature=feature, value=value, hash=hash),
+                    False,
+                )
             except IntegrityError:
                 return cls.objects.get(feature=feature, hash=hash), True


 class Schema(Record, CanCurate, TracksRun):
-    """
+    """Schemas / feature sets.

     Stores references to dataset schemas: these are the sets of columns in a dataset
     that correspond to :class:`~lamindb.Feature`, :class:`~bionty.Gene`, :class:`~bionty.Protein` or other
@@ -2036,23 +2150,37 @@ class Schema(Record, CanCurate, TracksRun):
     These reasons do not hold for label sets. Hence, LaminDB does not model label sets.

     Args:
-        features: `Iterable[Record]` An iterable of :class:`~lamindb.Feature`
+        features: `Iterable[Record] | None = None` An iterable of :class:`~lamindb.Feature`
            records to hash, e.g., `[Feature(...), Feature(...)]`. Is turned into
            a set upon instantiation. If you'd like to pass values, use
            :meth:`~lamindb.Schema.from_values` or
            :meth:`~lamindb.Schema.from_df`.
+        components: `dict[str, Schema] | None = None` A dictionary mapping component names to
+            their corresponding :class:`~lamindb.Schema` objects for composite schemas.
+        name: `str | None = None` A name.
+        description: `str | None = None` A description.
        dtype: `str | None = None` The simple type. Defaults to
            `None` for sets of :class:`~lamindb.Feature` records.
            Otherwise defaults to `"num"` (e.g., for sets of :class:`~bionty.Gene`).
-
+        itype: `str | None = None` The schema identifier type (e.g. :class:`~lamindb.Feature`, :class:`~bionty.Gene`, ...).
+        type: `Schema | None = None` A type.
+        is_type: `bool = False` Distinguish types from instances of the type.
+        otype: `str | None = None` An object type to define the structure of a composite schema.
+        minimal_set: `bool = True` Whether the schema contains a minimal set of linked features.
+        ordered_set: `bool = False` Whether features are required to be ordered.
+        maximal_set: `bool = False` If `True`, no additional features are allowed.
+        slot: `str | None = None` The slot name when this schema is used as a component in a
+            composite schema.
+        coerce_dtype: `bool = False` When True, attempts to coerce values to the specified dtype
+            during validation, see :attr:`~lamindb.Schema.coerce_dtype`.

     Note:

-        A feature set can be identified by the `hash` its feature uids.
+        A feature set can be identified by the `hash` of its feature uids.
        It's stored in the `.hash` field.

-        A `slot` provides a string key to access feature sets.
-
+        A `slot` provides a string key to access feature sets. For instance, for the schema of an
+        `AnnData` object, it would be `'obs'` for `adata.obs`.

     See Also:
        :meth:`~lamindb.Schema.from_values`
@@ -2062,24 +2190,20 @@ class Schema(Record, CanCurate, TracksRun):

     Examples:

-        Create a feature set
+        Create a schema (feature set) from df with types:

        >>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
-        >>>
+        >>> schema = ln.Schema.from_df(df)

-        Create a feature set
+        Create a schema (feature set) from features:

        >>> features = [ln.Feature(name=feat, dtype="float").save() for feat in ["feat1", "feat2"]]
-        >>>
+        >>> schema = ln.Schema(features)

-        Create a feature set
+        Create a schema (feature set) from identifier values:

        >>> import bionty as bt
-        >>>
-
-        Link a feature set to an artifact:
-
-        >>> artifact.features.add_feature_set(feature_set, slot="var")
+        >>> schema = ln.Schema.from_values(adata.var["ensemble_id"], Gene.ensembl_gene_id, organism="mouse").save()

     """

@@ -2087,6 +2211,7 @@ class Schema(Record, CanCurate, TracksRun):
         abstract = False

     _name_field: str = "name"
+    _aux_fields: dict[str, tuple[str, type]] = {"0": ("coerce_dtype", bool)}

     id: int = models.AutoField(primary_key=True)
     """Internal id, valid only in one DB instance."""
@@ -2098,89 +2223,116 @@ class Schema(Record, CanCurate, TracksRun):
     """A description."""
     n = IntegerField()
     """Number of features in the set."""
-    dtype: str | None = CharField(max_length=64, null=True)
+    dtype: str | None = CharField(max_length=64, null=True, editable=False)
     """Data type, e.g., "num", "float", "int". Is `None` for :class:`~lamindb.Feature`.

     For :class:`~lamindb.Feature`, types are expected to be heterogeneous and defined on a per-feature level.
     """
-
-
-
+    itype: str | None = CharField(
+        max_length=120, db_index=True, null=True, editable=False
+    )
     """A registry that stores feature identifiers used in this schema, e.g., `'Feature'` or `'bionty.Gene'`.

     Depending on the registry, `.members` stores, e.g., `Feature` or `bionty.Gene` records.

     .. versionchanged:: 1.0.0
-        Was called `
+        Was called `registry` before.
     """
-    type:
-
-
-
+    type: Schema | None = ForeignKey("self", PROTECT, null=True, related_name="records")
+    """Type of schema.
+
+    Allows to group schemas by type, e.g., all meassurements evaluating gene expression vs. protein expression vs. multi modal.

-
+    You can define types via `ln.Schema(name="ProteinPanel", is_type=True)`.
+
+    Here are a few more examples for type names: `'ExpressionPanel'`, `'ProteinPanel'`, `'Multimodal'`, `'Metadata'`, `'Embedding'`.
     """
-    records:
+    records: Schema
     """Records of this type."""
-    is_type: bool = BooleanField(default=
+    is_type: bool = BooleanField(default=False, db_index=True, null=True)
     """Distinguish types from instances of the type."""
     otype: str | None = CharField(max_length=64, db_index=True, null=True)
     """Default Python object type, e.g., DataFrame, AnnData."""
-    hash: str | None = CharField(
+    hash: str | None = CharField(
+        max_length=HASH_LENGTH, db_index=True, null=True, editable=False
+    )
     """A hash of the set of feature identifiers.

     For a composite schema, the hash of hashes.
     """
-    minimal_set: bool = BooleanField(default=True, db_index=True)
+    minimal_set: bool = BooleanField(default=True, db_index=True, editable=False)
     """Whether the schema contains a minimal set of linked features (default `True`).

     If `False`, no features are linked to this schema.

     If `True`, features are linked and considered as a minimally required set in validation.
     """
-    ordered_set: bool = BooleanField(default=False, db_index=True)
-    """Whether
-    maximal_set: bool = BooleanField(default=False, db_index=True)
+    ordered_set: bool = BooleanField(default=False, db_index=True, editable=False)
+    """Whether features are required to be ordered (default `False`)."""
+    maximal_set: bool = BooleanField(default=False, db_index=True, editable=False)
     """If `False`, additional features are allowed (default `False`).

     If `True`, the the minimal set is a maximal set and no additional features are allowed.
     """
-
-        "self",
-    )
-    """The composite schema that contains this schema as a component.
-
-    The composite schema composes multiple simpler schemas into one object.
-
-    For example, an AnnData composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
-    """
-    slot: str | None = CharField(max_length=100, db_index=True, null=True)
-    """The slot in which the schema is stored in the composite schema."""
-    validated_by: Schema | None = ForeignKey(
-        "self", PROTECT, related_name="validated_schemas", default=None, null=True
+    components: Schema = ManyToManyField(
+        "self", through="SchemaComponent", symmetrical=False, related_name="composites"
     )
-    """
+    """Components of this schema."""
+    composites: Schema
+    """The composite schemas that contains this schema as a component.

-
-
-    For instance, the set of measured features might be a superset of the minimally required set of features.
-
-    Often, the curating schema does not specficy any concrete features at all
+    For example, an `AnnData` composes multiple schemas: `var[DataFrameT]`, `obs[DataFrame]`, `obsm[Array]`, `uns[dict]`, etc.
     """
     features: Feature
     """The features contained in the schema."""
     params: Param
     """The params contained in the schema."""
     artifacts: Artifact
-    """The artifacts that
+    """The artifacts that measure a feature set that matches this schema."""
+    validated_artifacts: Artifact
+    """The artifacts that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
+    projects: Project
+    """Associated projects."""
     _curation: dict[str, Any] = JSONField(default=None, db_default=None, null=True)
+    # lamindb v2
+    # _itype: ContentType = models.ForeignKey(ContentType, on_delete=models.CASCADE)
+    # ""Index of the registry that stores the feature identifiers, e.g., `Feature` or `Gene`."""
+    # -- the following two fields are dynamically removed from the API for now
+    validated_by: Schema | None = ForeignKey(
+        "self", PROTECT, related_name="validated_schemas", default=None, null=True
+    )
+    # """The schema that validated this schema during curation.
+
+    # When performing validation, the schema that enforced validation is often less concrete than what is validated.
+
+    # For instance, the set of measured features might be a superset of the minimally required set of features.
+    # """
+    # validated_schemas: Schema
+    # """The schemas that were validated against this schema with a :class:`~lamindb.curators.Curator`."""
+    composite: Schema | None = ForeignKey(
+        "self", PROTECT, related_name="+", default=None, null=True
+    )
+    # The legacy foreign key
+    slot: str | None = CharField(max_length=100, db_index=True, null=True)
+    # The legacy slot

     @overload
     def __init__(
         self,
-        features: Iterable[Record],
-
+        features: Iterable[Record] | None = None,
+        components: dict[str, Schema] | None = None,
         name: str | None = None,
+        description: str | None = None,
+        dtype: str | None = None,
+        itype: str | Registry | FieldAttr | None = None,
+        type: Schema | None = None,
+        is_type: bool = False,
+        otype: str | None = None,
+        minimal_set: bool = True,
+        ordered_set: bool = False,
+        maximal_set: bool = False,
+        slot: str | None = None,
+        coerce_dtype: bool = False,
     ): ...

     @overload
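The new `components` many-to-many (through `SchemaComponent`, see further below) replaces the legacy `composite`/`slot` foreign key and enables composite schemas, e.g., for `AnnData`. A sketch assuming `bionty` is installed:

```python
import bionty as bt
import lamindb as ln

obs_schema = ln.Schema(
    features=[ln.Feature(name="cell_type", dtype=bt.CellType).save()],
).save()
var_schema = ln.Schema(itype=bt.Gene.ensembl_gene_id, dtype="num").save()

anndata_schema = ln.Schema(
    otype="AnnData",
    components={"obs": obs_schema, "var": var_schema},
).save()

anndata_schema.describe()  # lists the component schemas (method added below)
```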
@@ -2256,6 +2408,25 @@ class Schema(Record, CanCurate, TracksRun):
         """A queryset for the individual records of the set."""
         pass

+    @property
+    def coerce_dtype(self) -> bool:
+        """Whether dtypes should be coerced during validation.
+
+        For example, a `objects`-dtyped pandas column can be coerced to `categorical` and would pass validation if this is true.
+        """
+        if self._aux is not None and "af" in self._aux and "0" in self._aux["af"]:
+            return self._aux["af"]["0"]
+        else:
+            return False
+
+    @coerce_dtype.setter
+    def coerce_dtype(self, value: bool) -> None:
+        if self._aux is None:
+            self._aux = {}
+        if "af" not in self._aux:
+            self._aux["af"] = {}
+        self._aux["af"]["0"] = value
+
     @property
     @deprecated("itype")
     def registry(self) -> str:
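Like `Feature.nullable` and `Feature.default_value`, `coerce_dtype` is stored under slot `"0"` of the `_aux["af"]` dictionary rather than in a database column. A sketch of toggling it:

```python
import lamindb as ln

schema = ln.Schema(
    features=[ln.Feature(name="score", dtype="float").save()],
).save()

schema.coerce_dtype = True  # persisted as schema._aux["af"]["0"]
schema.save()

# a curator validating against this schema may now coerce, e.g.,
# an object-dtyped pandas column into the expected dtype
assert schema.coerce_dtype is True
```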
@@ -2265,8 +2436,23 @@ class Schema(Record, CanCurate, TracksRun):
     def registry(self, value) -> None:
         self.itype = value

+    def describe(self, return_str=False) -> None | str:
+        """Describe schema."""
+        message = str(self) + "\ncomponents:"
+        for component in self.components.all():
+            message += "\n " + str(component)
+        if return_str:
+            return message
+        else:
+            print(message)
+            return None
+
+    def _get_component(self, slot: str) -> Schema:
+        return self.components.get(links_component__slot=slot)
+

 class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
+    # Note that this docstring has to be consistent with Curator.save_artifact()
     """Datasets & models stored as files, folders, or arrays.

     Artifacts manage data in local or remote storage.
@@ -2276,10 +2462,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):

     Args:
         data: `UPathStr` A path to a local or remote folder or file.
-
-        key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a
+        kind: `Literal["dataset", "model"] | None = None` Distinguish models from datasets from other files & folders.
+        key: `str | None = None` A path-like key to reference artifact in default storage, e.g., `"myfolder/myfile.fcs"`. Artifacts with the same key form a version family.
         description: `str | None = None` A description.
-        revises: `Artifact | None = None` Previous version of the artifact.
+        revises: `Artifact | None = None` Previous version of the artifact. Is an alternative way to passing `key` to trigger a new version.
         run: `Run | None = None` The run that creates the artifact.

     .. dropdown:: Typical storage formats & their API accessors
@@ -2313,26 +2499,28 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):

     Examples:

-        Create an artifact
+        Create an artifact by passing `key`:
+
+        >>> artifact = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
+        >>> artifact = ln.Artifact("./my_folder", key="project1/my_folder").save()

-
-
+        Calling `.save()` uploads the file to the default storage location of your lamindb instance.
+        (If it's a local instance, the "upload" is a mere copy operation.)

-
+        If your artifact is already in the cloud, lamindb auto-populates the `key` field based on the S3 key and there is no upload:

-        >>> artifact = ln.Artifact("
+        >>> artifact = ln.Artifact("s3://my_bucket/my_folder/my_file.csv").save()

-
+        You can make a new version of the artifact with `key = "example_datasets/my_file.parquet"`

-        >>>
-        >>>
-        >>> artifact = ln.Artifact("./my_local_folder", key="project1/my_target_folder")
+        >>> artifact_v2 = ln.Artifact("./my_file.parquet", key="example_datasets/my_file.parquet").save()
+        >>> artifact_v2.versions.df() # see all versions

     .. dropdown:: Why does the API look this way?

        It's inspired by APIs building on AWS S3.

-        Both boto3 and quilt select a bucket (
+        Both boto3 and quilt select a bucket (a storage location in LaminDB) and define a target path through a `key` argument.

        In `boto3 <https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/bucket/upload_file.html>`__::

@@ -2349,16 +2537,18 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
            bucket = quilt3.Bucket('mybucket')
            bucket.put_file('hello.txt', '/tmp/hello.txt')

+        Sometimes you want to avoid mapping the artifact into a file hierarchy, and you can then _just_ populate `description` instead:

-
+        >>> artifact = ln.Artifact("s3://my_bucket/my_folder", description="My folder").save()
+        >>> artifact = ln.Artifact("./my_local_folder", description="My local folder").save()

-
-        >>> artifact_v2 = ln.Artifact(df_updated, key="example_datasets/dataset1.parquet").save()
+        Because you can then not use `key`-based versioning you have to pass `revises` to make a new artifact version:

-
+        >>> artifact_v2 = ln.Artifact("./my_file.parquet", revises=old_artifact).save()

-
-
+        If an artifact with the exact same hash already exists, `Artifact()` returns the existing artifact. In concurrent workloads where
+        the same artifact is created multiple times, `Artifact()` doesn't yet return the existing artifact but creates a new one; `.save()` however
+        detects the duplication and will return the existing artifact.

     """

@@ -2455,9 +2645,11 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     """
     description: str | None = CharField(db_index=True, null=True)
     """A description."""
-    storage: Storage = ForeignKey(
+    storage: Storage = ForeignKey(
+        Storage, PROTECT, related_name="artifacts", editable=False
+    )
     """Storage location, e.g. an S3 or GCP bucket or a local directory."""
-    suffix: str = CharField(max_length=30, db_index=True)
+    suffix: str = CharField(max_length=30, db_index=True, editable=False)
     # Initially, we thought about having this be nullable to indicate folders
     # But, for instance, .zarr is stored in a folder that ends with a .zarr suffix
     """Path suffix or empty string if no canonical suffix exists.
@@ -2470,19 +2662,27 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         null=True,
     )
     """:class:`~lamindb.base.types.ArtifactKind` (default `None`)."""
-    otype: str | None = CharField(
+    otype: str | None = CharField(
+        max_length=64, db_index=True, null=True, editable=False
+    )
     """Default Python object type, e.g., DataFrame, AnnData."""
-    size: int | None = BigIntegerField(
+    size: int | None = BigIntegerField(
+        null=True, db_index=True, default=None, editable=False
+    )
     """Size in bytes.

     Examples: 1KB is 1e3 bytes, 1MB is 1e6, 1GB is 1e9, 1TB is 1e12 etc.
     """
-    hash: str | None = CharField(
+    hash: str | None = CharField(
+        max_length=HASH_LENGTH, db_index=True, null=True, unique=True, editable=False
+    )
     """Hash or pseudo-hash of artifact content.

     Useful to ascertain integrity and avoid duplication.
     """
-    n_files: int | None = BigIntegerField(
+    n_files: int | None = BigIntegerField(
+        null=True, db_index=True, default=None, editable=False
+    )
     """Number of files for folder-like artifacts, `None` for file-like artifacts.

     Note that some arrays are also stored as folders, e.g., `.zarr` or `.tiledbsoma`.
@@ -2490,19 +2690,28 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     .. versionchanged:: 1.0
        Renamed from `n_objects` to `n_files`.
     """
-    n_observations: int | None = BigIntegerField(
+    n_observations: int | None = BigIntegerField(
+        null=True, db_index=True, default=None, editable=False
+    )
     """Number of observations.

     Typically, this denotes the first array dimension.
     """
-    _hash_type: str | None = CharField(
+    _hash_type: str | None = CharField(
+        max_length=30, db_index=True, null=True, editable=False
+    )
     """Type of hash."""
     ulabels: ULabel = models.ManyToManyField(
         ULabel, through="ArtifactULabel", related_name="artifacts"
     )
     """The ulabels measured in the artifact (:class:`~lamindb.ULabel`)."""
     run: Run | None = ForeignKey(
-        Run,
+        Run,
+        PROTECT,
+        related_name="output_artifacts",
+        null=True,
+        default=None,
+        editable=False,
     )
     """Run that created the artifact."""
     input_of_runs: Run = models.ManyToManyField(Run, related_name="input_artifacts")
@@ -2516,13 +2725,17 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     collections: Collection
     """The collections that this artifact is part of."""
     schema: Schema | None = ForeignKey(
-        Schema,
+        Schema,
+        PROTECT,
+        null=True,
+        default=None,
+        related_name="validated_artifacts",
     )
-    """The schema
-
-        Schema, related_name="
+    """The schema that validated this artifact in a :class:`~lamindb.curators.Curator`."""
+    feature_sets: Schema = models.ManyToManyField(
+        Schema, related_name="artifacts", through="ArtifactSchema"
     )
-    """
+    """The feature sets measured by the artifact."""
     _feature_values: FeatureValue = models.ManyToManyField(
         FeatureValue, through="ArtifactFeatureValue", related_name="artifacts"
     )
@@ -2543,6 +2756,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         PROTECT,
         default=current_user_id,
         related_name="created_artifacts",
+        editable=False,
     )
     """Creator of record."""
     _overwrite_versions: bool = BooleanField(default=None)
@@ -2566,7 +2780,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         # here; and we might refactor this but we might also keep that internal
         # usage
         data: UPathStr,
-
+        kind: ArtifactKind | None = None,
         key: str | None = None,
         description: str | None = None,
         revises: Artifact | None = None,
@@ -2606,11 +2820,6 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     def n_objects(self) -> int:
         return self.n_files

-    @property
-    def feature_sets(self) -> QuerySet[Schema]:
-        """Feature sets linked to this artifact."""
-        return self._schemas_m2m
-
     # add the below because this is what people will have in their code
     # if they implement the recommended migration strategy
     # - FeatureSet -> Schema
@@ -2620,14 +2829,14 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     # def schemas(self) -> QuerySet[Schema]:
     # """Schemas linked to artifact via many-to-many relationship.

-    # Is now mediating the private `.
+    # Is now mediating the private `.feature_sets` relationship during
     # a transition period to better schema management.

     # .. versionchanged: 1.0
     # Was previously called `.feature_sets`.

     # """
-    # return self.
+    # return self.feature_sets

     @property
     def path(self) -> Path:
@@ -2637,7 +2846,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):

        >>> artifact = ln.Artifact("s3://my-bucket/my-file.csv").save()
        >>> artifact.path
-
+        S3QueryPath('s3://my-bucket/my-file.csv')

        File in local storage:

@@ -2652,6 +2861,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     def from_df(
         cls,
         df: pd.DataFrame,
+        *,
         key: str | None = None,
         description: str | None = None,
         run: Run | None = None,
@@ -2692,6 +2902,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     def from_anndata(
         cls,
         adata: AnnData | UPathStr,
+        *,
         key: str | None = None,
         description: str | None = None,
         run: Run | None = None,
@@ -2728,6 +2939,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
     def from_mudata(
         cls,
         mdata: MuData,
+        *,
         key: str | None = None,
         description: str | None = None,
         run: Run | None = None,
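The bare `*` added to `from_df()`, `from_anndata()`, and `from_mudata()` makes `key`, `description`, `run`, and `revises` keyword-only, so positional calls break. A sketch of the 1.1.0-compatible call:

```python
import lamindb as ln
import pandas as pd

df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2]})

# 1.1.0: arguments after the data object must be passed by keyword
artifact = ln.Artifact.from_df(df, key="example_datasets/df.parquet").save()

# ln.Artifact.from_df(df, "example_datasets/df.parquet")  # now raises TypeError
```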
@@ -2760,11 +2972,38 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         pass

     @classmethod
-    def
+    def from_tiledbsoma(
         cls,
         path: UPathStr,
+        *,
         key: str | None = None,
+        description: str | None = None,
+        run: Run | None = None,
+        revises: Artifact | None = None,
+        **kwargs,
+    ) -> Artifact:
+        """Create from a tiledbsoma store.
+
+        Args:
+            path: A tiledbsoma store with .tiledbsoma suffix.
+            key: A relative path within default storage,
+                e.g., `"myfolder/mystore.tiledbsoma"`.
+            description: A description.
+            revises: An old version of the artifact.
+            run: The run that creates the artifact.
+
+        Examples:
+            >>> artifact = ln.Artifact.from_tiledbsoma("s3://mybucket/store.tiledbsoma", description="a tiledbsoma store")
+            >>> artifact.save()
+        """
+        pass
+
+    @classmethod
+    def from_dir(
+        cls,
+        path: UPathStr,
         *,
+        key: str | None = None,
         run: Run | None = None,
     ) -> list[Artifact]:
         """Create a list of artifact objects from a directory.
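Together with the `SOMAMeasurement` return type added to `Artifact.open()` further below, the new `from_tiledbsoma()` constructor registers tiledbsoma stores directly; a sketch with a hypothetical bucket path:

```python
import lamindb as ln

artifact = ln.Artifact.from_tiledbsoma(
    "s3://mybucket/store.tiledbsoma",  # hypothetical store location
    key="scrna/store.tiledbsoma",
).save()

# open() returns a cloud-backed SOMA object (Collection, Experiment, or Measurement)
with artifact.open() as store:
    print(type(store))
```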
@@ -2791,7 +3030,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):

     def replace(
         self,
-        data: UPathStr,
+        data: UPathStr | pd.DataFrame | AnnData | MuData,
         run: Run | None = None,
         format: str | None = None,
     ) -> None:
@@ -2824,6 +3063,7 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         | BackedAccessor
         | SOMACollection
         | SOMAExperiment
+        | SOMAMeasurement
         | PyArrowDataset
     ):
         """Return a cloud-backed data object.
@@ -2966,13 +3206,13 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):

     Args:
         artifacts: `list[Artifact]` A list of artifacts.
-
+        key: `str` A file-path like key, analogous to the `key` parameter of `Artifact` and `Transform`.
         description: `str | None = None` A description.
         revises: `Collection | None = None` An old version of the collection.
         run: `Run | None = None` The run that creates the collection.
         meta: `Artifact | None = None` An artifact that defines metadata for the collection.
-        reference: `str | None = None`
-        reference_type: `str | None = None`
+        reference: `str | None = None` A simple reference, e.g. an external ID or a URL.
+        reference_type: `str | None = None` A way to indicate to indicate the type of the simple reference `"url"`.

     See Also:
         :class:`~lamindb.Artifact`
@@ -2981,11 +3221,11 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):

     Create a collection from a list of :class:`~lamindb.Artifact` objects:

-    >>> collection = ln.Collection([artifact1, artifact2],
+    >>> collection = ln.Collection([artifact1, artifact2], key="my_project/my_collection")

     Create a collection that groups a data & a metadata artifact (e.g., here :doc:`docs:rxrx`):

-    >>> collection = ln.Collection(data_artifact,
+    >>> collection = ln.Collection(data_artifact, key="my_project/my_collection", meta=metadata_artifact)

     """

@@ -3008,13 +3248,15 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
     """Universal id, valid across DB instances."""
     key: str = CharField(db_index=True)
     """Name or path-like key."""
-    #
+    # below is the only case in which we use a TextField
     # for description; we do so because users had descriptions exceeding 255 chars
     # in their instances
     description: str | None = TextField(null=True, db_index=True)
     """A description or title."""
-    hash: str | None = CharField(
-
+    hash: str | None = CharField(
+        max_length=HASH_LENGTH, db_index=True, null=True, unique=True
+    )
+    """Hash of collection content."""
     reference: str | None = CharField(max_length=255, db_index=True, null=True)
     """A reference like URL or external ID."""
     # also for reference_type here, we allow an extra long max_length
@@ -3058,7 +3300,7 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
     def __init__(
         self,
         artifacts: list[Artifact],
-
+        key: str,
         description: str | None = None,
         meta: Any | None = None,
         reference: str | None = None,
@@ -3084,21 +3326,39 @@ class Collection(Record, IsVersioned, TracksRun, TracksUpdates):
         """Add an artifact to the collection.

         Creates a new version of the collection.
+        This does not modify the original collection in-place, but returns a new version
+        of the original collection with the added artifact.

         Args:
             artifact: An artifact to add to the collection.
             run: The run that creates the new version of the collection.

+        Examples:
+            >>> collection = ln.Collection(artifact, key="new collection")
+            >>> collecton.save()
+            >>> collection = collection.append(another_artifact) # returns a new version
+            >>> collection.save() # save the new version
+
         .. versionadded:: 0.76.14
         """
         pass

+    def open(self, is_run_input: bool | None = None) -> PyArrowDataset:
+        """Return a cloud-backed pyarrow Dataset.
+
+        Works for `pyarrow` compatible formats.
+
+        Notes:
+            For more info, see tutorial: :doc:`/arrays`.
+        """
+        pass
+
     def mapped(
         self,
         layers_keys: str | list[str] | None = None,
         obs_keys: str | list[str] | None = None,
         obsm_keys: str | list[str] | None = None,
-        obs_filter: dict[str, str |
+        obs_filter: dict[str, str | list[str]] | None = None,
         join: Literal["inner", "outer"] | None = "inner",
         encode_labels: bool | list[str] = True,
         unknown_label: str | dict[str, str] | None = None,
|
|
3136
3396
|
obsm_keys: Keys from the ``.obsm`` slots.
|
3137
3397
|
obs_filter: Select only observations with these values for the given obs columns.
|
3138
3398
|
Should be a dictionary with obs column names as keys
|
3139
|
-
and filtering values (a string or a
|
3399
|
+
and filtering values (a string or a list of strings) as values.
|
3140
3400
|
join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
|
3141
3401
|
does not join.
|
3142
3402
|
encode_labels: Encode labels into integers.
|
@@ -3330,7 +3590,7 @@ class Project(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
|
|
3330
3590
|
"""Type of project (e.g., 'Program', 'Project', 'GithubIssue', 'Task')."""
|
3331
3591
|
records: Project
|
3332
3592
|
"""Records of this type."""
|
3333
|
-
is_type: bool = BooleanField(default=
|
3593
|
+
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
3334
3594
|
"""Distinguish types from instances of the type."""
|
3335
3595
|
abbr: str | None = CharField(max_length=32, db_index=True, null=True)
|
3336
3596
|
"""An abbreviation."""
|
@@ -3434,7 +3694,7 @@ class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
|
|
3434
3694
|
"""
|
3435
3695
|
records: Reference
|
3436
3696
|
"""Records of this type."""
|
3437
|
-
is_type: bool = BooleanField(default=
|
3697
|
+
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
3438
3698
|
"""Distinguish types from instances of the type."""
|
3439
3699
|
url: str | None = URLField(null=True)
|
3440
3700
|
"""URL linking to the reference."""
|
@@ -3476,7 +3736,7 @@ class Reference(Record, CanCurate, TracksRun, TracksUpdates, ValidateFields):
|
|
3476
3736
|
# -------------------------------------------------------------------------------------
|
3477
3737
|
# Data models
|
3478
3738
|
|
3479
|
-
from django.contrib.postgres.fields import JSONField
|
3739
|
+
from django.contrib.postgres.fields import JSONField # type: ignore
|
3480
3740
|
from django.core.exceptions import ValidationError
|
3481
3741
|
from django.db import models
|
3482
3742
|
|
@@ -3543,7 +3803,7 @@ class RunData(BasicRecord, DataMixin):
|
|
3543
3803
|
class Meta:
|
3544
3804
|
constraints = [
|
3545
3805
|
models.CheckConstraint(
|
3546
|
-
|
3806
|
+
condition=(
|
3547
3807
|
models.Q(feature__isnull=False, param__isnull=True)
|
3548
3808
|
| models.Q(feature__isnull=True, param__isnull=False)
|
3549
3809
|
),
|
@@ -3574,7 +3834,7 @@ class FlexTable(Record, TracksRun, TracksUpdates):
|
|
3574
3834
|
"""Type of tidy table, e.g., `Cell`, `SampleSheet`, etc."""
|
3575
3835
|
records: ULabel
|
3576
3836
|
"""Records of this type."""
|
3577
|
-
is_type: bool = BooleanField(default=
|
3837
|
+
is_type: bool = BooleanField(default=False, db_index=True, null=True)
|
3578
3838
|
"""Distinguish types from instances of the type."""
|
3579
3839
|
description: str = CharField(null=True, db_index=True)
|
3580
3840
|
"""A description."""
|
@@ -3593,7 +3853,7 @@ class FlexTableData(BasicRecord, DataMixin):
|
|
3593
3853
|
class Meta:
|
3594
3854
|
constraints = [
|
3595
3855
|
models.CheckConstraint(
|
3596
|
-
|
3856
|
+
condition=(
|
3597
3857
|
models.Q(feature__isnull=False, param__isnull=True)
|
3598
3858
|
| models.Q(feature__isnull=True, param__isnull=False)
|
3599
3859
|
),
|
@@ -3621,8 +3881,8 @@ class LinkORM:
|
|
3621
3881
|
|
3622
3882
|
class SchemaFeature(BasicRecord, LinkORM):
|
3623
3883
|
id: int = models.BigAutoField(primary_key=True)
|
3624
|
-
schema: Schema = ForeignKey(Schema, CASCADE, related_name="
|
3625
|
-
feature: Feature = ForeignKey(Feature, PROTECT, related_name="
|
3884
|
+
schema: Schema = ForeignKey(Schema, CASCADE, related_name="links_feature")
|
3885
|
+
feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema")
|
3626
3886
|
|
3627
3887
|
class Meta:
|
3628
3888
|
unique_together = ("schema", "feature")
|
@@ -3640,15 +3900,22 @@ class SchemaParam(BasicRecord, LinkORM):
|
|
3640
3900
|
class ArtifactSchema(BasicRecord, LinkORM, TracksRun):
|
3641
3901
|
id: int = models.BigAutoField(primary_key=True)
|
3642
3902
|
artifact: Artifact = ForeignKey(Artifact, CASCADE, related_name="_links_schema")
|
3643
|
-
# we follow the lower() case convention rather than snake case for link models
|
3644
3903
|
schema: Schema = ForeignKey(Schema, PROTECT, related_name="_links_artifact")
|
3645
|
-
slot: str | None = CharField(
|
3646
|
-
feature_ref_is_semantic: bool | None = BooleanField(
|
3647
|
-
null=True
|
3648
|
-
) # like Feature name or Gene symbol or CellMarker name
|
3904
|
+
slot: str | None = CharField(null=True)
|
3905
|
+
feature_ref_is_semantic: bool | None = BooleanField(null=True)
|
3649
3906
|
|
3650
3907
|
class Meta:
|
3651
|
-
unique_together = ("artifact", "schema")
|
3908
|
+
unique_together = (("artifact", "schema"), ("artifact", "slot"))
|
3909
|
+
|
3910
|
+
|
3911
|
+
class SchemaComponent(BasicRecord, LinkORM, TracksRun):
|
3912
|
+
id: int = models.BigAutoField(primary_key=True)
|
3913
|
+
composite: Schema = ForeignKey(Schema, CASCADE, related_name="links_composite")
|
3914
|
+
component: Schema = ForeignKey(Schema, PROTECT, related_name="links_component")
|
3915
|
+
slot: str | None = CharField(null=True)
|
3916
|
+
|
3917
|
+
class Meta:
|
3918
|
+
unique_together = (("composite", "component"), ("composite", "slot"))
|
3652
3919
|
|
3653
3920
|
|
3654
3921
|
class CollectionArtifact(BasicRecord, LinkORM, TracksRun):
|
@@ -3883,14 +4150,14 @@ class CollectionReference(BasicRecord, LinkORM, TracksRun):
|
|
3883
4150
|
unique_together = ("collection", "reference")
|
3884
4151
|
|
3885
4152
|
|
3886
|
-
|
3887
|
-
|
3888
|
-
|
3889
|
-
|
4153
|
+
class Migration(BasicRecord):
|
4154
|
+
app = CharField(max_length=255)
|
4155
|
+
name = CharField(max_length=255)
|
4156
|
+
applied: datetime = DateTimeField()
|
3890
4157
|
|
3891
|
-
|
3892
|
-
|
3893
|
-
|
4158
|
+
class Meta:
|
4159
|
+
db_table = "django_migrations"
|
4160
|
+
managed = False
|
3894
4161
|
|
3895
4162
|
|
3896
4163
|
# -------------------------------------------------------------------------------------
|