PyPI - lamindb - Versions diffs - 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl - Mend

lamindb 1.10.2py3-none-any.whl → 1.11.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

lamindb/__init__.py +89 -49
lamindb/_finish.py +17 -15
lamindb/_tracked.py +2 -4
lamindb/_view.py +1 -1
lamindb/base/__init__.py +2 -1
lamindb/base/dtypes.py +76 -0
lamindb/core/_settings.py +2 -2
lamindb/core/storage/_anndata_accessor.py +29 -9
lamindb/curators/_legacy.py +16 -3
lamindb/curators/core.py +442 -188
lamindb/errors.py +6 -0
lamindb/examples/cellxgene/__init__.py +8 -3
lamindb/examples/cellxgene/_cellxgene.py +127 -13
lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
lamindb/examples/croissant/__init__.py +32 -6
lamindb/examples/datasets/__init__.py +2 -2
lamindb/examples/datasets/_core.py +9 -2
lamindb/examples/datasets/_small.py +66 -22
lamindb/examples/fixtures/sheets.py +8 -2
lamindb/integrations/_croissant.py +34 -11
lamindb/migrations/0119_squashed.py +5 -2
lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
lamindb/migrations/0121_recorduser.py +60 -0
lamindb/models/__init__.py +4 -1
lamindb/models/_describe.py +2 -2
lamindb/models/_feature_manager.py +131 -71
lamindb/models/_from_values.py +2 -2
lamindb/models/_is_versioned.py +4 -4
lamindb/models/_label_manager.py +4 -4
lamindb/models/artifact.py +326 -172
lamindb/models/artifact_set.py +45 -1
lamindb/models/can_curate.py +1 -2
lamindb/models/collection.py +3 -34
lamindb/models/feature.py +111 -7
lamindb/models/has_parents.py +11 -11
lamindb/models/project.py +18 -0
lamindb/models/query_manager.py +16 -7
lamindb/models/query_set.py +191 -78
lamindb/models/record.py +30 -5
lamindb/models/run.py +10 -33
lamindb/models/save.py +6 -8
lamindb/models/schema.py +54 -26
lamindb/models/sqlrecord.py +152 -40
lamindb/models/storage.py +59 -14
lamindb/models/transform.py +17 -17
lamindb/models/ulabel.py +6 -1
{lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/METADATA +12 -18
{lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/RECORD +50 -47
{lamindb-1.10.2.dist-info → lamindb-1.11.0.dist-info}/WHEEL +1 -1
{lamindb-1.10.2.dist-info/licenses → lamindb-1.11.0.dist-info}/LICENSE +0 -0

lamindb/models/save.py CHANGED Viewed

@@ -47,11 +47,10 @@ def save(
     Args:
         records: Multiple :class:`~lamindb.models.SQLRecord` objects.
-        ignore_conflicts: If ``True``, do not error if some records violate a
-            unique or another constraint. However, it won't inplace update the id
-            fields of records. If you need records with ids, you need to query
-            them from the database.
-        batch_size: Number of records to process in each batch. Defaults to 10000.
+        ignore_conflicts: If `True`, do not error if some records violate a unique or another constraint.
+            However, it won't update the id fields of records in place.
+            If you need records with ids, you need to query them from the database.
+        batch_size: Number of records to process in each batch.
             Large batch sizes can improve performance but may lead to memory issues.
     Examples:
@@ -130,7 +129,7 @@ def bulk_create(
     Args:
         records: Iterable of SQLRecord objects to create
         ignore_conflicts: Whether to ignore conflicts during creation
-        batch_size: Number of records to process in each batch. Defaults to 10000.
+        batch_size: Number of records to process in each batch.
     """
     records_by_orm = defaultdict(list)
     for record in records:
@@ -332,8 +331,7 @@ def store_artifacts(
     from .artifact import Artifact
     exception: Exception | None = None
-    # because uploads might fail, we need to maintain a new list
-    # of the succeeded uploads
+    # because uploads might fail, we need to maintain a new list of the succeeded uploads
     stored_artifacts = []
     # upload new local artifacts

lamindb/models/schema.py CHANGED Viewed

@@ -6,6 +6,7 @@ import numpy as np
 from django.db import models
 from django.db.models import CASCADE, PROTECT, ManyToManyField
 from lamin_utils import logger
+from lamindb_setup.core import deprecated
 from lamindb_setup.core.hashing import HASH_LENGTH, hash_string
 from rich.table import Table
 from rich.text import Text
@@ -348,11 +349,12 @@ class Schema(SQLRecord, CanCurate, TracksRun):
             # from a dataframe
             df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
-            schema = ln.Schema.from_df(df)
+            schema = ln.Schema.from_dataframe(df)
     """
     class Meta(SQLRecord.Meta, TracksRun.Meta, TracksUpdates.Meta):
         abstract = False
+        app_label = "lamindb"
     _name_field: str = "name"
     _aux_fields: dict[str, tuple[str, type]] = {
@@ -576,19 +578,22 @@ class Schema(SQLRecord, CanCurate, TracksRun):
                 self.optionals.set(optional_features)
                 return None
         self._slots: dict[str, Schema] = {}
         if features:
             self._features = (get_related_name(features_registry), features)  # type: ignore
-        elif slots:
+        if slots:
             for slot_key, component in slots.items():
                 if component._state.adding:
                     raise InvalidArgument(
                         f"schema for {slot_key} {component} must be saved before use"
                     )
             self._slots = slots
         if validated_kwargs["hash"] in KNOWN_SCHEMAS:
             validated_kwargs["uid"] = KNOWN_SCHEMAS[validated_kwargs["hash"]]
         else:
             validated_kwargs["uid"] = ids.base62_16()
         super().__init__(**validated_kwargs)
     def _validate_kwargs_calculate_hash(
@@ -623,14 +628,20 @@ class Schema(SQLRecord, CanCurate, TracksRun):
                 raise TypeError("index must be a Feature")
             features.insert(0, index)
+        if slots:
+            itype = "Composite"
+            if otype is None:
+                raise InvalidArgument("Please pass otype != None for composite schemas")
         if features:
             features, configs = get_features_config(features)
             features_registry = validate_features(features)
-            itype_compare = features_registry.__get_name_with_module__()
-            if itype is not None:
-                assert itype.startswith(itype_compare), str(itype_compare)  # noqa: S101
-            else:
-                itype = itype_compare
+            if itype != "Composite":
+                itype_compare = features_registry.__get_name_with_module__()
+                if itype is not None:
+                    assert itype.startswith(itype_compare), str(itype_compare)  # noqa: S101
+                else:
+                    itype = itype_compare
             if n_features is not None:
                 if n_features != len(features):
                     logger.important(f"updating to n {len(features)} features")
@@ -654,11 +665,6 @@ class Schema(SQLRecord, CanCurate, TracksRun):
         if flexible is None:
             flexible = flexible_default
-        if slots:
-            itype = "Composite"
-            if otype is None:
-                raise InvalidArgument("Please pass otype != None for composite schemas")
         if itype is not None and not isinstance(itype, str):
             itype_str = serialize_dtype(itype, is_itype=True)
         else:
@@ -771,7 +777,7 @@ class Schema(SQLRecord, CanCurate, TracksRun):
         cls,
         values: ListLike,
         field: FieldAttr = Feature.name,
-        type: str | None = None,
+        dtype: str | None = None,
         name: str | None = None,
         mute: bool = False,
         organism: SQLRecord | str | None = None,
@@ -783,7 +789,7 @@ class Schema(SQLRecord, CanCurate, TracksRun):
         Args:
             values: A list of values, like feature names or ids.
             field: The field of a reference registry to map values.
-            type: The simple type.
+            dtype: The simple dtype.
                 Defaults to `None` if reference registry is :class:`~lamindb.Feature`,
                 defaults to `"float"` otherwise.
             name: A name.
@@ -816,8 +822,8 @@ class Schema(SQLRecord, CanCurate, TracksRun):
         if isinstance(values, DICT_KEYS_TYPE):
             values = list(values)
         registry = field.field.model
-        if registry != Feature and type is None:
-            type = NUMBER_TYPE
+        if registry != Feature and dtype is None:
+            dtype = NUMBER_TYPE
             logger.debug("setting feature set to 'number'")
         validated = registry.validate(values, field=field, mute=mute, organism=organism)
         values_array = np.array(values)
@@ -841,12 +847,12 @@ class Schema(SQLRecord, CanCurate, TracksRun):
         schema = Schema(
             features=validated_features,
             name=name,
-            dtype=get_type_str(type),
+            dtype=get_type_str(dtype),
         )
         return schema
     @classmethod
-    def from_df(
+    def from_dataframe(
         cls,
         df: pd.DataFrame,
         field: FieldAttr = Feature.name,
@@ -889,15 +895,28 @@ class Schema(SQLRecord, CanCurate, TracksRun):
             )
         return schema
+    @classmethod
+    @deprecated("from_dataframe")
+    def from_df(
+        cls,
+        df: pd.DataFrame,
+        field: FieldAttr = Feature.name,
+        name: str | None = None,
+        mute: bool = False,
+        organism: SQLRecord | str | None = None,
+        source: SQLRecord | None = None,
+    ) -> Schema | None:
+        return cls.from_dataframe(df, field, name, mute, organism, source)
     def save(self, *args, **kwargs) -> Schema:
-        """Save."""
+        """Save schema."""
         from .save import bulk_create
         if self.pk is not None:
             features = (
                 self._features[1]
                 if hasattr(self, "_features")
-                else (self.members.list() if self.members.exists() else [])
+                else (self.members.to_list() if self.members.exists() else [])
             )
             index_feature = self.index
             _, validated_kwargs, _, _, _ = self._validate_kwargs_calculate_hash(
@@ -925,7 +944,7 @@ class Schema(SQLRecord, CanCurate, TracksRun):
                 datasets = Artifact.filter(schema=self).all()
                 if datasets.exists():
                     logger.warning(
-                        f"you updated the schema hash and might invalidate datasets that were previously validated with this schema: {datasets.list('uid')}"
+                        f"you updated the schema hash and might invalidate datasets that were previously validated with this schema: {datasets.to_list('uid')}"
                     )
                 self.hash = validated_kwargs["hash"]
                 self.n = validated_kwargs["n"]
@@ -947,13 +966,16 @@ class Schema(SQLRecord, CanCurate, TracksRun):
             assert self.n > 0  # noqa: S101
             using: bool | None = kwargs.pop("using", None)
             related_name, records = self._features
+            # .set() does not preserve the order but orders by the feature primary key
             # only the following method preserves the order
-            # .set() does not preserve the order but orders by
-            # the feature primary key
             through_model = getattr(self, related_name).through
-            related_model_split = parse_cat_dtype(self.itype, is_itype=True)[
-                "registry_str"
-            ].split(".")
+            if self.itype == "Composite":
+                related_model_split = ["Feature"]
+            else:
+                related_model_split = parse_cat_dtype(self.itype, is_itype=True)[
+                    "registry_str"
+                ].split(".")
             if len(related_model_split) == 1:
                 related_field = related_model_split[0].lower()
             else:
@@ -965,6 +987,7 @@ class Schema(SQLRecord, CanCurate, TracksRun):
             ]
             through_model.objects.using(using).bulk_create(links, ignore_conflicts=True)
             delattr(self, "_features")
         return self
     @property
@@ -978,6 +1001,8 @@ class Schema(SQLRecord, CanCurate, TracksRun):
             # this should return a queryset and not a list...
             # need to fix this
             return self._features[1]
+        if len(self.features.all()) > 0:
+            return self.features.order_by("links_schema__id")
         if self.itype == "Composite" or self.is_type:
             return Feature.objects.none()
         related_name = self._get_related_name()
@@ -1200,6 +1225,7 @@ class SchemaFeature(BaseSQLRecord, IsLink):
     feature: Feature = ForeignKey(Feature, PROTECT, related_name="links_schema")
     class Meta:
+        app_label = "lamindb"
         unique_together = ("schema", "feature")
@@ -1211,6 +1237,7 @@ class ArtifactSchema(BaseSQLRecord, IsLink, TracksRun):
     feature_ref_is_semantic: bool | None = BooleanField(null=True)
     class Meta:
+        app_label = "lamindb"
         unique_together = (("artifact", "schema"), ("artifact", "slot"))
@@ -1221,6 +1248,7 @@ class SchemaComponent(BaseSQLRecord, IsLink, TracksRun):
     slot: str | None = CharField(null=True)
     class Meta:
+        app_label = "lamindb"
         unique_together = (("composite", "slot", "component"), ("composite", "slot"))

lamindb/models/sqlrecord.py CHANGED Viewed

@@ -319,6 +319,43 @@ def suggest_records_with_similar_names(
     return None
+def delete_record(record: BaseSQLRecord, is_soft: bool = True):
+    def delete():
+        if is_soft:
+            record.branch_id = -1
+            record.save()
+        else:
+            super(BaseSQLRecord, record).delete()
+    # deal with versioned records
+    # if _overwrite_versions = True, there is only a single version and
+    # no need to set the new latest version because all versions are deleted
+    # when deleting the latest version
+    if (
+        isinstance(record, IsVersioned)
+        and record.is_latest
+        and not getattr(record, "_overwrite_versions", False)
+    ):
+        new_latest = (
+            record.__class__.objects.using(record._state.db)
+            .filter(is_latest=False, uid__startswith=record.stem_uid)
+            .exclude(branch_id=-1)  # exclude candidates in the trash
+            .order_by("-created_at")
+            .first()
+        )
+        if new_latest is not None:
+            new_latest.is_latest = True
+            if is_soft:
+                record.is_latest = False
+            with transaction.atomic():
+                new_latest.save()
+                delete()
+            logger.warning(f"new latest version is: {new_latest}")
+            return None
+    # deal with all other cases of the nested if condition now
+    delete()
 RECORD_REGISTRY_EXAMPLE = """Example::
         from lamindb import SQLRecord, fields
@@ -334,7 +371,7 @@ RECORD_REGISTRY_EXAMPLE = """Example::
         experiment.save()
         # `Experiment` refers to the registry, which you can query
-        df = Experiment.filter(name__startswith="my ").df()
+        df = Experiment.filter(name__startswith="my ").to_dataframe()
 """
@@ -425,7 +462,7 @@ class Registry(ModelBase):
         Examples:
             >>> ln.ULabel(name="my label").save()
-            >>> ln.ULabel.filter(name__startswith="my").df()
+            >>> ln.ULabel.filter(name__startswith="my").to_dataframe()
         """
         from .query_set import QuerySet
@@ -464,7 +501,7 @@ class Registry(ModelBase):
         return QuerySet(model=cls).get(idlike, **expressions)
-    def df(
+    def to_dataframe(
         cls,
         include: str | list[str] | None = None,
         features: bool | list[str] | str = False,
@@ -492,21 +529,30 @@ class Registry(ModelBase):
             Include the name of the creator in the `DataFrame`:
-            >>> ln.ULabel.df(include="created_by__name"])
+            >>> ln.ULabel.to_dataframe(include="created_by__name")
             Include display of features for `Artifact`:
-            >>> df = ln.Artifact.df(features=True)
+            >>> df = ln.Artifact.to_dataframe(features=True)
             >>> ln.view(df)  # visualize with type annotations
             Only include select features:
-            >>> df = ln.Artifact.df(features=["cell_type_by_expert", "cell_type_by_model"])
+            >>> df = ln.Artifact.to_dataframe(features=["cell_type_by_expert", "cell_type_by_model"])
         """
         query_set = cls.filter()
         if hasattr(cls, "updated_at"):
             query_set = query_set.order_by("-updated_at")
-        return query_set[:limit].df(include=include, features=features)
+        return query_set[:limit].to_dataframe(include=include, features=features)
+    @deprecated(new_name="to_dataframe")
+    def df(
+        cls,
+        include: str | list[str] | None = None,
+        features: bool | list[str] | str = False,
+        limit: int = 100,
+    ) -> pd.DataFrame:
+        return cls.to_dataframe(include, features, limit)
     @doc_args(_search.__doc__)
     def search(
@@ -580,7 +626,7 @@ class Registry(ModelBase):
             # this just retrieves the full connection string from iresult
             db = update_db_using_local(iresult, settings_file)
             cache_using_filepath.write_text(
-                f"{iresult['lnid']}\n{iresult['schema_str']}"
+                f"{iresult['lnid']}\n{iresult['schema_str']}", encoding="utf-8"
             )
             # need to set the token if it is a fine_grained_access and the user is jwt (not public)
             is_fine_grained_access = (
@@ -593,7 +639,7 @@ class Registry(ModelBase):
             source_modules = isettings.modules
             db = isettings.db
             cache_using_filepath.write_text(
-                f"{isettings.uid}\n{','.join(source_modules)}"
+                f"{isettings.uid}\n{','.join(source_modules)}", encoding="utf-8"
             )
             # need to set the token if it is a fine_grained_access and the user is jwt (not public)
             is_fine_grained_access = (
@@ -795,7 +841,7 @@ class BaseSQLRecord(models.Model, metaclass=Registry):
         artifacts: list = []
         if self.__class__.__name__ == "Collection" and self.id is not None:
             # when creating a new collection without being able to access artifacts
-            artifacts = self.ordered_artifacts.list()
+            artifacts = self.ordered_artifacts.to_list()
         pre_existing_record = None
         # consider records that are being transferred from other databases
         transfer_logs: dict[str, list[str]] = {
@@ -920,27 +966,7 @@ class BaseSQLRecord(models.Model, metaclass=Registry):
     def delete(self) -> None:
         """Delete."""
-        # note that the logic below does not fire if a record is moved to the trash
-        # the idea is that moving a record to the trash should move its entire version family
-        # to the trash, whereas permanently deleting should default to only deleting a single record
-        # of a version family
-        # we can consider making it easy to permanently delete entire version families as well,
-        # but that's for another time
-        if isinstance(self, IsVersioned) and self.is_latest:
-            new_latest = (
-                self.__class__.objects.using(self._state.db)
-                .filter(is_latest=False, uid__startswith=self.stem_uid)
-                .order_by("-created_at")
-                .first()
-            )
-            if new_latest is not None:
-                new_latest.is_latest = True
-                with transaction.atomic():
-                    new_latest.save()
-                    super().delete()  # type: ignore
-                logger.warning(f"new latest version is {new_latest}")
-                return None
-        super().delete()
+        delete_record(self, is_soft=False)
 class Space(BaseSQLRecord):
@@ -952,6 +978,7 @@ class Space(BaseSQLRecord):
     """
     class Meta:
+        app_label = "lamindb"
         constraints = [
             models.UniqueConstraint(Lower("name"), name="unique_space_name_lower")
         ]
@@ -964,8 +991,7 @@ class Space(BaseSQLRecord):
         editable=False,
         unique=True,
         max_length=12,
-        default="aaaaaaaaaaaaa",
-        db_default="aaaaaaaaaaaa",
+        default=base62_12,
         db_index=True,
     )
     """Universal id."""
@@ -998,6 +1024,21 @@ class Space(BaseSQLRecord):
         *args,
         **kwargs,
     ):
+        if not args and "uid" not in kwargs:
+            warn = False
+            msg = ""
+            isettings = setup_settings.instance
+            if (dialect := isettings.dialect) != "postgresql":
+                warn = True
+                msg = f"on {dialect} databases"
+            elif not isettings.is_on_hub:
+                warn = True
+                msg = "on local instances"
+            if warn:
+                logger.warning(
+                    f"creating spaces manually {msg} is possible for demo purposes, "
+                    "but does *not* affect access permissions"
+                )
         super().__init__(*args, **kwargs)
@@ -1007,6 +1048,12 @@ class Branch(BaseSQLRecord):
     Every `SQLRecord` has a `branch` field, which dictates where a record appears in queries & searches.
     """
+    class Meta:
+        app_label = "lamindb"
+        constraints = [
+            models.UniqueConstraint(Lower("name"), name="unique_branch_name_lower")
+        ]
     # below isn't fully implemented but a roadmap
     # - 3: template (hidden in queries & searches)
     # - 2: locked (same as default, but locked for edits except for space admins)
@@ -1018,11 +1065,6 @@ class Branch(BaseSQLRecord):
     # that can be merged onto the main branch in an experience akin to a Pull Request. The mapping
     # onto a semantic branch name is handled through LaminHub.
-    class Meta:
-        constraints = [
-            models.UniqueConstraint(Lower("name"), name="unique_branch_name_lower")
-        ]
     id: int = models.AutoField(primary_key=True)
     """An integer id that's synchronized for a family of coupled database instances.
@@ -1119,6 +1161,75 @@ class SQLRecord(BaseSQLRecord, metaclass=Registry):
     def _branch_code(self, value: int):
         self.branch_id = value
+    def delete(self, permanent: bool | None = None, **kwargs) -> None:
+        """Delete record.
+        Args:
+            permanent: Whether to permanently delete the record (skips trash).
+        Examples:
+            For any `SQLRecord` object `record`, call:
+            >>> record.delete()
+        """
+        if self._state.adding:
+            logger.warning("record is not yet saved, delete has no effect")
+            return
+        name_with_module = self.__class__.__get_name_with_module__()
+        if name_with_module == "Artifact":
+            # this first check means an invalid delete fails fast rather than cascading through
+            # database and storage permission errors
+            isettings = setup_settings.instance
+            if self.storage.instance_uid != isettings.uid and (
+                kwargs["storage"] or kwargs["storage"] is None
+            ):
+                from ..errors import IntegrityError
+                from .storage import Storage
+                raise IntegrityError(
+                    "Cannot simply delete artifacts outside of this instance's managed storage locations."
+                    "\n(1) If you only want to delete the metadata record in this instance, pass `storage=False`"
+                    f"\n(2) If you want to delete the artifact in storage, please load the managing lamindb instance (uid={self.storage.instance_uid})."
+                    f"\nThese are all managed storage locations of this instance:\n{Storage.filter(instance_uid=isettings.uid).to_dataframe()}"
+                )
+        # change branch_id to trash
+        trash_branch_id = -1
+        if self.branch_id > trash_branch_id and permanent is not True:
+            delete_record(self, is_soft=True)
+            logger.warning(f"moved record to trash (branch_id = -1): {self}")
+            return
+        # permanent delete
+        if permanent is None:
+            response = input(
+                f"Record {self.uid} is already in trash! Are you sure you want to delete it from your"
+                " database? You can't undo this action. (y/n) "
+            )
+            confirm_delete = response == "y"
+        else:
+            confirm_delete = permanent
+        if confirm_delete:
+            if name_with_module == "Run":
+                from .run import delete_run_artifacts
+                delete_run_artifacts(self)
+            elif name_with_module == "Transform":
+                from .transform import delete_transform_relations
+                delete_transform_relations(self)
+            elif name_with_module == "Artifact":
+                from .artifact import delete_permanently
+                delete_permanently(
+                    self, storage=kwargs["storage"], using_key=kwargs["using_key"]
+                )
+            if name_with_module != "Artifact":
+                super().delete()
 def _format_django_validation_error(record: SQLRecord, e: DjangoValidationError):
     """Pretty print Django validation errors."""
@@ -1464,7 +1575,7 @@ def check_name_change(record: SQLRecord):
                 .exclude(feature_id=None)  # must have a feature
                 .distinct()
             )
-            artifact_ids = linked_records.list("artifact__uid")
+            artifact_ids = linked_records.to_list("artifact__uid")
             n = len(artifact_ids)
             if n > 0:
                 s = "s" if n > 1 else ""
@@ -1482,7 +1593,7 @@ def check_name_change(record: SQLRecord):
         # when a feature is renamed
         elif isinstance(record, Feature):
             # only internal features are associated with schemas
-            linked_artifacts = Artifact.filter(feature_sets__features=record).list(
+            linked_artifacts = Artifact.filter(feature_sets__features=record).to_list(
                 "uid"
             )
             n = len(linked_artifacts)
@@ -1806,6 +1917,7 @@ class Migration(BaseSQLRecord):
     class Meta:
         db_table = "django_migrations"
+        app_label = "lamindb"
         managed = False

lamindb 1.10.2__py3-none-any.whl → 1.11.0__py3-none-any.whl

lamindb 1.10.2py3-none-any.whl → 1.11.0py3-none-any.whl