PyPI - lamindb - Versions diffs - 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl - Mend

lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

lamindb/__init__.py +1 -1
lamindb/_artifact.py +6 -3
lamindb/_can_curate.py +3 -1
lamindb/_collection.py +1 -1
lamindb/_curate.py +387 -318
lamindb/_feature.py +84 -58
lamindb/_feature_set.py +6 -4
lamindb/_finish.py +68 -13
lamindb/_from_values.py +10 -6
lamindb/_query_set.py +321 -102
lamindb/_record.py +5 -3
lamindb/_save.py +1 -0
lamindb/_view.py +105 -9
lamindb/core/__init__.py +2 -2
lamindb/core/_context.py +9 -13
lamindb/core/_data.py +58 -88
lamindb/core/_describe.py +139 -0
lamindb/core/_django.py +5 -6
lamindb/core/_feature_manager.py +408 -198
lamindb/core/_label_manager.py +147 -109
lamindb/core/datasets/__init__.py +31 -2
lamindb/core/datasets/_core.py +0 -27
lamindb/core/datasets/_small.py +100 -0
lamindb/core/exceptions.py +1 -1
lamindb/core/storage/paths.py +9 -4
lamindb/core/types.py +12 -2
{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/METADATA +7 -8
{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/RECORD +30 -28
{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/LICENSE +0 -0
{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/WHEEL +0 -0

lamindb/core/_label_manager.py CHANGED Viewed

@@ -1,12 +1,14 @@
 from __future__ import annotations
+import warnings
 from collections import defaultdict
 from typing import TYPE_CHECKING
-import numpy as np
 from django.db import connections
 from lamin_utils import colors, logger
 from lnschema_core.models import CanCurate, Feature
+from rich.table import Column, Table
+from rich.text import Text
 from lamindb._from_values import _print_values
 from lamindb._record import (
@@ -17,123 +19,164 @@ from lamindb._record import (
 )
 from lamindb._save import save
+from ._describe import (
+    NAME_WIDTH,
+    TYPE_WIDTH,
+    VALUES_WIDTH,
+    describe_header,
+    print_rich_tree,
+)
 from ._django import get_artifact_with_related, get_related_model
 from ._settings import settings
 from .schema import dict_related_model_to_related_name
 if TYPE_CHECKING:
     from lnschema_core.models import Artifact, Collection, Record
+    from rich.tree import Tree
     from lamindb._query_set import QuerySet
-LABELS_EXCLUDE_SET = {"feature_sets"}
-def get_labels_as_dict(
-    self: Artifact | Collection, links: bool = False, instance: str | None = None
-) -> dict:
-    labels = {}  # type: ignore
-    if self.id is None:
-        return labels
-    for related_model_name, related_name in dict_related_model_to_related_name(
-        self.__class__, links=links, instance=instance
-    ).items():
-        if related_name not in LABELS_EXCLUDE_SET and not related_name.startswith("_"):
-            labels[related_name] = (
-                related_model_name,
-                getattr(self, related_name).all(),
-            )
+EXCLUDE_LABELS = {"feature_sets"}
+def _get_labels(
+    obj, links: bool = False, instance: str | None = None
+) -> dict[str, QuerySet]:
+    """Get all labels associated with an object as a dictionary.
+    This is a generic approach that uses django orm.
+    """
+    if obj.id is None:
+        return {}
+    labels = {}
+    related_models = dict_related_model_to_related_name(
+        obj.__class__, links=links, instance=instance
+    )
+    for _, related_name in related_models.items():
+        if related_name not in EXCLUDE_LABELS and not related_name.startswith("_"):
+            labels[related_name] = getattr(obj, related_name).all()
     return labels
-def _print_labels_postgres(
-    self: Artifact | Collection, m2m_data: dict | None = None, print_types: bool = False
-) -> str:
-    labels_msg = ""
-    if not m2m_data:
+def _get_labels_postgres(
+    self: Artifact | Collection, m2m_data: dict | None = None
+) -> dict[str, dict[int, str]]:
+    """Get all labels associated with an artifact or collection as a dictionary.
+    This is a postgres-specific approach that uses django Subquery.
+    """
+    if m2m_data is None:
         artifact_meta = get_artifact_with_related(self, include_m2m=True)
         m2m_data = artifact_meta.get("related_data", {}).get("m2m", {})
-    if m2m_data:
-        for related_name, labels in m2m_data.items():
-            if not labels or related_name == "feature_sets":
-                continue
-            related_model = get_related_model(self, related_name)
-            print_values = _print_values(labels.values(), n=10)
-            type_str = f": {related_model}" if print_types else ""
-            labels_msg += f"    .{related_name}{type_str} = {print_values}\n"
-    return labels_msg
+    return m2m_data
-def print_labels(
+def describe_labels(
     self: Artifact | Collection,
-    m2m_data: dict | None = None,
-    print_types: bool = False,
+    labels_data: dict | None = None,
+    print_types: bool = False,  # deprecated
+    tree: Tree | None = None,
+    as_subtree: bool = False,
 ):
+    """Describe labels associated with an artifact or collection."""
+    if print_types:
+        warnings.warn(
+            "`print_types` parameter is deprecated and will be removed in a future version. Types are now always printed.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
     if not self._state.adding and connections[self._state.db].vendor == "postgresql":
-        labels_msg = _print_labels_postgres(self, m2m_data, print_types)
-    else:
-        labels_msg = ""
-        for related_name, (related_model, labels) in get_labels_as_dict(
-            self, instance=self._state.db
-        ).items():
+        labels_data = _get_labels_postgres(self, labels_data)
+    if not labels_data:
+        labels_data = _get_labels(self, instance=self._state.db)
+    # initialize tree
+    if tree is None:
+        tree = describe_header(self)
+    if not labels_data:
+        return tree
+    labels_table = Table(
+        Column(
+            Text.assemble(("Labels", "green_yellow")),
+            style="",
+            no_wrap=True,
+            width=NAME_WIDTH,
+        ),
+        Column("", style="dim", no_wrap=True, width=TYPE_WIDTH),
+        Column("", width=VALUES_WIDTH, no_wrap=True),
+        # show_header=True,
+        box=None,
+        pad_edge=False,
+    )
+    for related_name, labels in labels_data.items():
+        if not labels or related_name == "feature_sets":
+            continue
+        if isinstance(labels, dict):  # postgres, labels are a dict[id, name]
+            print_values = _print_values(labels.values(), n=10)
+        else:  # labels are a QuerySet
             field = get_name_field(labels)
-            labels_list = list(labels.values_list(field, flat=True))
-            if len(labels_list) > 0:
-                print_values = _print_values(labels_list, n=10)
-                type_str = f": {related_model}" if print_types else ""
-                labels_msg += f"    .{related_name}{type_str} = {print_values}\n"
-    msg = ""
-    if labels_msg:
-        msg += f"  {colors.italic('Labels')}\n"
-        msg += labels_msg
-    return msg
-# Alex: is this a label transfer function?
-def validate_labels(labels: QuerySet | list | dict):
-    def validate_labels_registry(
-        labels: QuerySet | list | dict,
-    ) -> tuple[list[str], list[str]]:
-        if len(labels) == 0:
-            return [], []
-        registry = labels[0].__class__
-        field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
-        if hasattr(registry, "_ontology_id_field"):
-            field = registry._ontology_id_field
-        # if the field value is None, use uid field
-        label_uids = np.array(
-            [getattr(label, field) for label in labels if label is not None]
-        )
-        # save labels from ontology_ids
-        if hasattr(registry, "_ontology_id_field") and len(label_uids) > 0:
-            try:
-                labels_records = registry.from_values(label_uids, field=field)
-                save([r for r in labels_records if r._state.adding])
-            except Exception:  # noqa S110
-                pass
-            field = "uid"
-            label_uids = np.array(
-                [getattr(label, field) for label in labels if label is not None]
+            print_values = _print_values(labels.values_list(field, flat=True), n=10)
+        if print_values:
+            related_model = get_related_model(self, related_name)
+            type_str = related_model.__get_name_with_schema__()
+            labels_table.add_row(
+                f".{related_name}", Text(type_str, style="dim"), print_values
             )
-        if issubclass(registry, CanCurate):
-            validated = registry.validate(label_uids, field=field, mute=True)
-            validated_uids = label_uids[validated]
-            validated_labels = registry.filter(
-                **{f"{field}__in": validated_uids}
-            ).list()
-            new_labels = [labels[int(i)] for i in np.argwhere(~validated).flatten()]
-        else:
-            validated_labels = []
-            new_labels = list(labels)
-        return validated_labels, new_labels
-    if isinstance(labels, dict):
-        result = {}
-        for registry, labels_registry in labels.items():
-            result[registry] = validate_labels_registry(labels_registry)
+    if as_subtree:
+        if labels_table.rows:
+            return labels_table
     else:
-        return validate_labels_registry(labels)
+        if labels_table.rows:
+            tree.add(labels_table)
+        return tree
+def _save_validated_records(
+    labels: QuerySet | list | dict,
+) -> list[str]:
+    """Save records created from ontology ids and return the labels that do not validate.
+    All labels are assumed to belong to the registry of the first non-`None` label.
+    Args:
+        labels: Label records of a single registry; `None` entries are ignored.
+    Returns:
+        The labels for which `registry.validate` reports `False`. If the registry
+        is not a `CanCurate` subclass, all non-`None` labels are returned.
+        NOTE(review): the annotation says `list[str]`, but the elements are the
+        label records that were passed in, not strings.
+    """
+    # drop `None` entries once, so that the records and the field values derived
+    # from them stay aligned when zipping with the validation result below
+    records = [label for label in labels if label is not None]
+    if not records:
+        return []
+    registry = records[0].__class__
+    has_ontology_id_field = hasattr(registry, "_ontology_id_field")
+    if has_ontology_id_field:
+        field = registry._ontology_id_field
+    else:
+        field = REGISTRY_UNIQUE_FIELD.get(registry.__name__.lower(), "uid")
+    label_uids = [getattr(record, field) for record in records]
+    # save labels from ontology_ids
+    if has_ontology_id_field:
+        try:
+            from_values_records = registry.from_values(label_uids, field=field)
+            save([r for r in from_values_records if r._state.adding])
+        except Exception as error:  # noqa: BLE001
+            # deliberately best-effort: a failure here must not abort the caller,
+            # but it should at least be traceable
+            logger.debug(
+                f"could not save {registry.__name__} records from {field}: {error}"
+            )
+        # validation below is always done against uid for these registries
+        field = "uid"
+        label_uids = [record.uid for record in records]
+    if not issubclass(registry, CanCurate):
+        return records
+    validated = registry.validate(label_uids, field=field, mute=True)
+    return [record for record, is_valid in zip(records, validated) if not is_valid]
+def save_validated_records(
+    labels: QuerySet | list | dict,
+) -> list[str] | dict[str, list[str]]:
+    """Run `_save_validated_records` on labels, per registry if a dict is passed.
+    Args:
+        labels: Either the labels of one registry, or a dict mapping a registry
+            key to the labels of that registry.
+    Returns:
+        The result of `_save_validated_records` for the labels, or a dict with
+        the same keys holding one such result per entry.
+    """
+    if not isinstance(labels, dict):
+        return _save_validated_records(labels)
+    results_by_registry = {}
+    for registry, registry_labels in labels.items():
+        results_by_registry[registry] = _save_validated_records(registry_labels)
+    return results_by_registry
 class LabelManager:
@@ -144,15 +187,12 @@ class LabelManager:
     with features.
     """
-    def __init__(self, host: Artifact | Collection):
+    def __init__(self, host: Artifact | Collection) -> None:
         self._host = host
     def __repr__(self) -> str:
-        msg = print_labels(self._host)
-        if len(msg) > 0:
-            return msg
-        else:
-            return "no linked labels"
+        tree = describe_labels(self._host)
+        return print_rich_tree(tree, fallback="no linked labels")
     def add(
         self,
@@ -201,9 +241,7 @@ class LabelManager:
         if transfer_logs is None:
             transfer_logs = {"mapped": [], "transferred": [], "run": None}
         using_key = settings._using_key
-        for related_name, (_, labels) in get_labels_as_dict(
-            data, instance=data._state.db
-        ).items():
+        for related_name, labels in _get_labels(data, instance=data._state.db).items():
             labels = labels.all()
             if not labels.exists():
                 continue
@@ -211,7 +249,7 @@ class LabelManager:
             data_name_lower = data.__class__.__name__.lower()
             labels_by_features = defaultdict(list)
             features = set()
-            _, new_labels = validate_labels(labels)
+            new_labels = save_validated_records(labels)
             if len(new_labels) > 0:
                 transfer_fk_to_default_db_bulk(
                     new_labels, using_key, transfer_logs=transfer_logs
@@ -241,7 +279,7 @@ class LabelManager:
                     label = label_returned
                 labels_by_features[key].append(label)
             # treat features
-            _, new_features = validate_labels(list(features))
+            new_features = save_validated_records(list(features))
             if len(new_features) > 0:
                 transfer_fk_to_default_db_bulk(
                     new_features, using_key, transfer_logs=transfer_logs
@@ -255,16 +293,16 @@ class LabelManager:
                     )
                 save(new_features)
             if hasattr(self._host, related_name):
-                for feature_name, labels in labels_by_features.items():
+                for feature_name, feature_labels in labels_by_features.items():
                     if feature_name is not None:
                         feature_id = Feature.get(name=feature_name).id
                     else:
                         feature_id = None
                     getattr(self._host, related_name).add(
-                        *labels, through_defaults={"feature_id": feature_id}
+                        *feature_labels, through_defaults={"feature_id": feature_id}
                     )
-    def make_external(self, label: Record):
+    def make_external(self, label: Record) -> None:
         """Make a label external, aka dissociate label from internal features.
         Args:

lamindb/core/datasets/__init__.py CHANGED Viewed

@@ -1,4 +1,15 @@
-"""Test collections.
+"""Test datasets.
+Small in-memory datasets.
+.. autosummary::
+   :toctree: .
+   small_dataset1
+   small_dataset2
+   anndata_with_obs
+Files.
 .. autosummary::
    :toctree: .
@@ -11,8 +22,20 @@
    file_fastq
    file_bam
    file_mini_csv
+Directories.
+.. autosummary::
+   :toctree: .
    dir_scrnaseq_cellranger
    dir_iris_images
+Dataframe, AnnData, MuData.
+.. autosummary::
+   :toctree: .
    df_iris
    df_iris_in_meter
    df_iris_in_meter_study1
@@ -27,6 +50,12 @@
    mudata_papalexi21_subset
    schmidt22_crispra_gws_IFNG
    schmidt22_perturbseq
+Other.
+.. autosummary::
+   :toctree: .
    fake_bio_notebook_titles
 """
@@ -37,7 +66,6 @@ from ._core import (
     anndata_pbmc3k_processed,
     anndata_pbmc68k_reduced,
     anndata_suo22_Visium10X,
-    anndata_with_obs,
     df_iris,
     df_iris_in_meter,
     df_iris_in_meter_study1,
@@ -57,3 +85,4 @@ from ._core import (
     schmidt22_perturbseq,
 )
 from ._fake import fake_bio_notebook_titles
+from ._small import anndata_with_obs, small_dataset1, small_dataset2

lamindb/core/datasets/_core.py CHANGED Viewed

@@ -342,33 +342,6 @@ def anndata_human_immune_cells(
     return adata
-def anndata_with_obs() -> ad.AnnData:
-    """Create a mini anndata with cell_type, disease and tissue."""
-    import anndata as ad
-    import bionty.base as bionty_base
-    celltypes = ["T cell", "hematopoietic stem cell", "hepatocyte", "my new cell type"]
-    celltype_ids = ["CL:0000084", "CL:0000037", "CL:0000182", ""]
-    diseases = [
-        "chronic kidney disease",
-        "liver lymphoma",
-        "cardiac ventricle disorder",
-        "Alzheimer disease",
-    ]
-    tissues = ["kidney", "liver", "heart", "brain"]
-    df = pd.DataFrame()
-    df["cell_type"] = celltypes * 10
-    df["cell_type_id"] = celltype_ids * 10
-    df["tissue"] = tissues * 10
-    df["disease"] = diseases * 10
-    df.index = "obs" + df.index.astype(str)
-    adata = ad.AnnData(X=np.zeros(shape=(40, 100), dtype=np.float32), obs=df)
-    adata.var.index = bionty_base.Gene().df().head(100)["ensembl_gene_id"].values
-    return adata
 def anndata_suo22_Visium10X():  # pragma: no cover
     """AnnData from Suo22 generated by 10x Visium."""
     import anndata as ad

lamindb/core/datasets/_small.py ADDED Viewed

@@ -0,0 +1,100 @@
+from __future__ import annotations
+from typing import Any, Literal
+import anndata as ad
+import numpy as np
+import pandas as pd
+def small_dataset1(
+    format: Literal["df", "anndata"],
+    with_typo: bool = False,
+) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    """Create a small dataset with three samples.
+    The dataset mixes three numerical columns (`CD8A`, `CD4`, `CD14`) with
+    four observation-level metadata columns and comes with dataset-level metadata.
+    Args:
+        format: `"df"` returns a `(DataFrame, metadata)` tuple; any other value
+            returns an `AnnData` with the metadata stored in `.uns`.
+        with_typo: If `True`, the second `cell_medium` value is `"IFNJ"`
+            instead of `"IFNG"`.
+    """
+    sample_ids = ["sample1", "sample2", "sample3"]
+    # dataset-level metadata
+    metadata = {
+        "temperature": 21.6,
+        "study": "Candidate marker study 1",
+        "date_of_study": "2024-12-01",
+        "study_note": "We had a great time performing this study and the results look compelling.",
+    }
+    treatment = "IFNG"
+    if with_typo:
+        treatment = "IFNJ"
+    # column order matters: the first three columns are the measurements
+    dataset_df = pd.DataFrame(
+        {
+            "CD8A": [1, 2, 3],
+            "CD4": [3, 4, 5],
+            "CD14": [5, 6, 7],
+            "cell_medium": ["DMSO", treatment, "DMSO"],
+            "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
+            "cell_type_by_expert": ["B cell", "T cell", "T cell"],
+            "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        },
+        index=sample_ids,
+    )
+    if format == "df":
+        return dataset_df, metadata
+    measurements = dataset_df.iloc[:, :3]
+    observation_metadata = dataset_df.iloc[:, 3:]
+    return ad.AnnData(measurements, obs=observation_metadata, uns=metadata)
+def small_dataset2(
+    format: Literal["df", "anndata"],
+) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    """Create a second small dataset with three samples.
+    The dataset has three numerical columns (`CD8A`, `CD4`, `CD38`), two
+    observation-level metadata columns and dataset-level metadata.
+    Args:
+        format: `"df"` returns a `(DataFrame, metadata)` tuple; any other value
+            returns an `AnnData` with the metadata stored in `.uns`.
+    """
+    dataset_dict = {
+        "CD8A": [2, 3, 3],
+        "CD4": [3, 4, 5],
+        "CD38": [4, 2, 3],
+        "cell_medium": ["DMSO", "IFNG", "IFNG"],
+        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+    }
+    # dataset-level metadata
+    metadata = {
+        "temperature": 22.6,
+        "study": "Candidate marker study 2",
+        "date_of_study": "2025-02-13",
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample4", "sample5", "sample6"],
+    )
+    if format == "df":
+        return dataset_df, metadata
+    # the first three columns are the measurements, the remaining ones go to `.obs`
+    return ad.AnnData(
+        dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+    )
+def anndata_with_obs() -> ad.AnnData:
+    """Create a mini AnnData whose `.obs` has cell_type, cell_type_id, tissue and disease.
+    `X` is a 40 x 100 float32 matrix of zeros; the four-element value lists are
+    repeated ten times to fill the 40 observations. The var index is set to the
+    `ensembl_gene_id` values of the first 100 rows of `bionty_base.Gene().df()`.
+    """
+    import anndata as ad
+    import bionty.base as bionty_base
+    n_repeats = 10
+    obs_values = {
+        "cell_type": ["T cell", "hematopoietic stem cell", "hepatocyte", "my new cell type"],
+        "cell_type_id": ["CL:0000084", "CL:0000037", "CL:0000182", ""],
+        "tissue": ["kidney", "liver", "heart", "brain"],
+        "disease": [
+            "chronic kidney disease",
+            "liver lymphoma",
+            "cardiac ventricle disorder",
+            "Alzheimer disease",
+        ],
+    }
+    obs = pd.DataFrame()
+    for column_name, values in obs_values.items():
+        obs[column_name] = values * n_repeats
+    obs.index = "obs" + obs.index.astype(str)
+    zeros = np.zeros(shape=(40, 100), dtype=np.float32)
+    adata = ad.AnnData(X=zeros, obs=obs)
+    gene_ids = bionty_base.Gene().df().head(100)["ensembl_gene_id"].values
+    adata.var.index = gene_ids
+    return adata

lamindb/core/exceptions.py CHANGED Viewed

@@ -79,7 +79,7 @@ class IntegrityError(Exception):
     pass
-class NoTitleError(Exception):
+class NoTitleError(SystemExit):
     """Notebook has no title."""
     pass

lamindb/core/storage/paths.py CHANGED Viewed

@@ -4,6 +4,7 @@ import shutil
 from typing import TYPE_CHECKING
 import anndata as ad
+import fsspec
 import pandas as pd
 from lamin_utils import logger
 from lamindb_setup.core import StorageSettings
@@ -45,12 +46,16 @@ def auto_storage_key_from_artifact_uid(uid: str, suffix: str, is_dir: bool) -> s
     return storage_key
-def check_path_is_child_of_root(path: Path | UPath, root: Path | UPath | None) -> bool:
+def check_path_is_child_of_root(path: UPathStr, root: UPathStr) -> bool:
     # str is needed to eliminate UPath storage_options
     # from the equality checks below
-    path = UPath(str(path))
-    root = UPath(str(root))
-    return root.resolve() in path.resolve().parents
+    # and for fsspec.utils.get_protocol
+    path_str = str(path)
+    root_str = str(root)
+    # check that the protocols are the same first
+    if fsspec.utils.get_protocol(path_str) != fsspec.utils.get_protocol(root_str):
+        return False
+    return UPath(root_str).resolve() in UPath(path_str).resolve().parents
 # returns filepath and root of the storage

lamindb/core/types.py CHANGED Viewed

@@ -1,18 +1,28 @@
 """Types.
+Central object types.
+.. autosummary::
+   :toctree: .
+   ArtifactType
+   TransformType
+   FeatureDtype
+Basic types.
 .. autosummary::
    :toctree: .
    UPathStr
    StrField
    ListLike
-   TransformType
-   ArtifactType
 """
 from lamindb_setup.core.types import UPathStr
 from lnschema_core.types import (
     ArtifactType,
+    FeatureDtype,
     FieldAttr,
     ListLike,
     StrField,

{lamindb-0.77.0.dist-info → lamindb-0.77.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lamindb
-Version: 0.77.0
+Version: 0.77.2
 Summary: A data framework for biology.
 Author-email: Lamin Labs <open-source@lamin.ai>
 Requires-Python: >=3.9,<3.13
@@ -9,11 +9,10 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: lnschema_core==0.77.0
-Requires-Dist: lamin_utils==0.13.8
-Requires-Dist: lamin_cli==0.21.3
-Requires-Dist: lamindb_setup
-Requires-Dist: rapidfuzz
+Requires-Dist: lnschema_core==0.77.1
+Requires-Dist: lamin_utils==0.13.9
+Requires-Dist: lamin_cli==0.22.0
+Requires-Dist: lamindb_setup==0.81.2
 Requires-Dist: pyarrow
 Requires-Dist: typing_extensions!=4.6.0
 Requires-Dist: python-dateutil
@@ -22,7 +21,7 @@ Requires-Dist: fsspec
 Requires-Dist: graphviz
 Requires-Dist: psycopg2-binary
 Requires-Dist: lamindb_setup[aws] ; extra == "aws"
-Requires-Dist: bionty==0.53.1 ; extra == "bionty"
+Requires-Dist: bionty==0.53.2 ; extra == "bionty"
 Requires-Dist: cellregistry ; extra == "cellregistry"
 Requires-Dist: clinicore ; extra == "clinicore"
 Requires-Dist: line_profiler ; extra == "dev"
@@ -41,7 +40,7 @@ Requires-Dist: findrefs ; extra == "findrefs"
 Requires-Dist: lamindb_setup[gcp] ; extra == "gcp"
 Requires-Dist: nbproject==0.10.5 ; extra == "jupyter"
 Requires-Dist: jupytext ; extra == "jupyter"
-Requires-Dist: nbconvert ; extra == "jupyter"
+Requires-Dist: nbconvert>=7.2.1 ; extra == "jupyter"
 Requires-Dist: omop ; extra == "omop"
 Requires-Dist: ourprojects ; extra == "ourprojects"
 Requires-Dist: wetlab ; extra == "wetlab"

lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl

lamindb 0.77.0__py3-none-any.whl → 0.77.2__py3-none-any.whl