PyPI - lamindb - Versions diffs - 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

lamindb 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

lamindb/__init__.py +1 -1
lamindb/core/_context.py +6 -0
lamindb/core/datasets/__init__.py +1 -0
lamindb/core/datasets/_core.py +23 -0
lamindb/core/datasets/_small.py +16 -2
lamindb/core/storage/objects.py +1 -2
lamindb/curators/__init__.py +1269 -1513
lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
lamindb/models/_feature_manager.py +65 -14
lamindb/models/_from_values.py +113 -78
lamindb/models/artifact.py +138 -95
lamindb/models/can_curate.py +185 -216
lamindb/models/feature.py +32 -2
lamindb/models/project.py +69 -7
lamindb/models/record.py +43 -25
lamindb/models/run.py +18 -1
lamindb/models/schema.py +0 -8
{lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/METADATA +6 -5
{lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/RECORD +22 -22
lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
{lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/LICENSE +0 -0
{lamindb-1.2.0.dist-info → lamindb-1.3.0.dist-info}/WHEEL +0 -0

lamindb/curators/_cellxgene_schemas/__init__.py CHANGED Viewed

@@ -1,26 +1,198 @@
-from pathlib import Path
 import pandas as pd
-import yaml  # type: ignore
+from lamin_utils import logger
+from lamindb_setup.core.upath import UPath
+from lamindb.base.types import FieldAttr
+from lamindb.models import Record, ULabel
+from lamindb.models._from_values import _format_values
+# Set of reserved names.
+# NOTE(review): presumably these are the column/key names the CellxGene schema
+# reserves and therefore disallows in user-provided metadata; confirm against
+# the CellxGene schema documentation and the curator code that reads this set.
+RESERVED_NAMES = {
+    "ethnicity",
+    "ethnicity_ontology_term_id",
+    "X_normalization",
+    "default_field",
+    "layer_descriptions",
+    "tags",
+    "versions",
+    "contributors",
+    "preprint_doi",
+    "project_description",
+    "project_links",
+    "project_name",
+    "publication_doi",
+}
+def _get_cxg_categoricals() -> dict[str, FieldAttr]:
+    """Return the mapping from CellxGene obs column names to registry fields.
+    Ontology-backed columns come in pairs: `<key>` maps to the registry's `name`
+    field and `<key>_ontology_term_id` maps to its `ontology_id` field.
+    `suspension_type` and `tissue_type` map to `ULabel.name`.
+    """
+    import bionty as bt
+    def _name_and_id(key: str, registry) -> dict[str, FieldAttr]:
+        # one entry for the label column, one for its ontology id companion
+        return {
+            key: registry.name,
+            f"{key}_ontology_term_id": registry.ontology_id,
+        }
+    return {
+        **_name_and_id("assay", bt.ExperimentalFactor),
+        **_name_and_id("cell_type", bt.CellType),
+        **_name_and_id("development_stage", bt.DevelopmentalStage),
+        **_name_and_id("disease", bt.Disease),
+        # "donor_id": "str",  via pandera
+        **_name_and_id("self_reported_ethnicity", bt.Ethnicity),
+        **_name_and_id("sex", bt.Phenotype),
+        "suspension_type": ULabel.name,
+        **_name_and_id("tissue", bt.Tissue),
+        "tissue_type": ULabel.name,
+        **_name_and_id("organism", bt.Organism),
+    }
+def _restrict_obs_fields(
+    obs: pd.DataFrame, obs_fields: dict[str, FieldAttr]
+) -> dict[str, FieldAttr]:
+    """Restrict the obs fields to those whose column is present in `obs`.
+    To simplify the curation, each entity is validated against either its name
+    or its ontology_id column, never both:
+    - If both columns are present, only the `<name>_ontology_term_id` entry is kept.
+    - If only one of them is present, that entry is kept.
+    - If neither is present, no entry is returned for that entity.
+    Args:
+        obs: The obs DataFrame; only its column labels are inspected.
+        obs_fields: Mapping of candidate column names to registry fields.
+    Returns:
+        A new dict, in the order of `obs_fields`, holding the retained entries.
+    """
+    columns = obs.columns
+    def _is_retained(name: str) -> bool:
+        if name not in columns:
+            return False
+        # ontology id columns always win; a name column is only kept when its
+        # ontology id companion is absent from obs
+        return (
+            name.endswith("_ontology_term_id")
+            or f"{name}_ontology_term_id" not in columns
+        )
+    return {name: field for name, field in obs_fields.items() if _is_retained(name)}
-def _read_schema_versions(ontology_versions: Path) -> dict[str, pd.DataFrame]:
-    data = yaml.safe_load(open(ontology_versions))
-    schema_versions = data["schema-version"]
-    def _schema_to_df(schema_data):
-        return pd.DataFrame(
+def _add_defaults_to_obs(obs: pd.DataFrame, defaults: dict[str, str]) -> None:
+    """Add default columns and values to the obs DataFrame in place.
+    A default is only applied when neither the `<name>` column nor the
+    `<name>_ontology_term_id` column is present in `obs`.
+    Args:
+        obs: The DataFrame to modify; new columns are assigned on it directly.
+        defaults: Mapping of column name to the value to fill the new column with.
+    """
+    for name, default in defaults.items():
+        if name in obs.columns or f"{name}_ontology_term_id" in obs.columns:
+            continue
+        obs[name] = default
+        logger.important(
+            f"added default value '{default}' to the adata.obs['{name}']"
+        )
+def _create_sources(
+    categoricals: dict[str, FieldAttr], schema_version: str, organism: str
+) -> dict[str, Record]:
+    """Creates a sources dictionary that can be passed to AnnDataCatManager.
+    Args:
+        categoricals: Mapping of obs column names to registry fields; only fields
+            whose model belongs to the `bionty` module get a source.
+        schema_version: A version listed in the `schema_version` column of
+            `schema_versions.csv`.
+        organism: Organism used to pick a row when an entity has several pinned
+            sources for the schema version.
+    Returns:
+        Mapping of each bionty-backed key, plus `"var_index"` for `Gene`, to the
+        pinned `bionty.Source` record, or `None` when no source could be resolved.
+    Raises:
+        ValueError: If `schema_version` is not listed in `schema_versions.csv`.
+    """
+    import bionty as bt
+    all_sources_df = pd.read_csv(UPath(__file__).parent / "schema_versions.csv")
+    sources_df = all_sources_df[all_sources_df.schema_version == schema_version]
+    if sources_df.empty:
+        # list the versions from the unfiltered table: the filtered one is empty
+        # here and would always yield an empty list of valid versions
+        raise ValueError(
+            f"Invalid schema_version: {schema_version}\n"
+            f"Valid versions are: {_format_values(all_sources_df.schema_version.unique())}"
+        )
+    def _fetch_bionty_source(entity: str, organism: str) -> Record | None:  # type: ignore
+        """Fetch the Bionty source of the pinned ontology."""
+        entity_sources = sources_df.loc[(sources_df.entity == entity)].copy()
+        if entity_sources.empty:
+            # no source is pinned for this entity in the selected schema version
+            return None
+        if len(entity_sources) == 1:
+            row = entity_sources.iloc[0]  # for sources with organism "all"
+        else:
+            # NOTE(review): raises IndexError when `organism` matches none of the
+            # pinned rows; confirm whether a clearer error is wanted here
+            row = entity_sources[entity_sources.organism == organism].iloc[0]
+        source = bt.Source.filter(
+            organism=row.organism,
+            entity=f"bionty.{entity}",
+            name=row.source,
+            version=row.version,
+        ).one_or_none()
+        if source is None:
+            logger.error(
+                f"Could not find source: {entity}\n"
+                "    → consider running `bionty.core.sync_all_sources_to_latest()` and re-connect to your instance"
+            )
+        return source
+    key_to_source: dict[str, bt.Source] = {}
+    for key, field in categoricals.items():
+        if field.field.model.__get_module_name__() == "bionty":
+            entity = field.field.model.__name__
+            key_to_source[key] = _fetch_bionty_source(entity, organism)
+    key_to_source["var_index"] = _fetch_bionty_source("Gene", organism)
+    return key_to_source
+def _init_categoricals_additional_values() -> None:
+    """Add additional values from CellxGene schema to the DB.
+    Runs only when no `ULabel` type named "SuspensionType" exists yet. In that
+    case it saves a "normal" `Disease` record copied from the PATO phenotype,
+    "na"/"unknown" placeholder records, and the "TissueType" and "SuspensionType"
+    `ULabel` types together with their child labels.
+    """
+    import bionty as bt
+    # Note: if you add another control below, be mindful to change the if condition that
+    # triggers whether creating these records is re-considered
+    controls_were_created = (
+        ULabel.filter(name="SuspensionType", is_type=True).one_or_none() is not None
+    )
+    if not controls_were_created:
+        logger.important("Creating control labels in the CellxGene schema.")
+        # "normal" in Disease
+        normal = bt.Phenotype.from_source(
+            ontology_id="PATO:0000461",
+            source=bt.Source.get(name="pato", version="2024-03-28"),
+        )
+        # the Disease record re-uses the uid, name, ontology_id, description and
+        # source of the PATO phenotype record fetched above
+        bt.Disease(
+            uid=normal.uid,
+            name=normal.name,
+            ontology_id=normal.ontology_id,
+            description=normal.description,
+            source=normal.source,  # not sure
+        ).save()
+        # na, unknown
+        for model, name in zip(
             [
-                (entity, organism, ontology, version)
-                for entity, details in schema_data.items()
-                for ontology, values in details.items()
-                for organism, version in values.items()
+                bt.Ethnicity,
+                bt.Ethnicity,
+                bt.DevelopmentalStage,
+                bt.Phenotype,
+                bt.CellType,
             ],
-            columns=["entity", "organism", "source", "version"],
-        ).set_index("entity")
+            ["na", "unknown", "unknown", "unknown", "unknown"],
+        ):
+            model(
+                ontology_id=name, name=name, description="From CellxGene schema."
+            ).save()
-    schema_versions_df = {
-        version: _schema_to_df(details) for version, details in schema_versions.items()
-    }
+        # tissue_type
+        tissue_type = ULabel(
+            name="TissueType",
+            is_type=True,
+            description='From CellxGene schema. Is "tissue", "organoid", or "cell culture".',
+        ).save()
+        for name in ["tissue", "organoid", "cell culture"]:
+            ULabel(
+                name=name, type=tissue_type, description="From CellxGene schema."
+            ).save()
-    return schema_versions_df
+        # suspension_type
+        suspension_type = ULabel(
+            name="SuspensionType",
+            is_type=True,
+            description='From CellxGene schema. This MUST be "cell", "nucleus", or "na".',
+        ).save()
+        for name in ["cell", "nucleus", "na"]:
+            ULabel(
+                name=name, type=suspension_type, description="From CellxGene schema."
+            ).save()

lamindb/curators/_cellxgene_schemas/schema_versions.csv ADDED Viewed

@@ -0,0 +1,43 @@
+schema_version,entity,organism,source,version
+4.0.0,CellType,all,cl,2023-08-24
+4.0.0,ExperimentalFactor,all,efo,3.57.0
+4.0.0,Ethnicity,human,hancestro,3.0
+4.0.0,DevelopmentalStage,human,hsapdv,2020-03-10
+4.0.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
+4.0.0,Disease,all,mondo,2023-08-02
+4.0.0,Organism,all,ncbitaxon,2023-06-20
+4.0.0,Phenotype,all,pato,2023-05-18
+4.0.0,Tissue,all,uberon,2023-09-05
+5.0.0,CellType,all,cl,2024-01-04
+5.0.0,ExperimentalFactor,all,efo,3.62.0
+5.0.0,Ethnicity,human,hancestro,3.0
+5.0.0,DevelopmentalStage,human,hsapdv,2020-03-10
+5.0.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
+5.0.0,Disease,all,mondo,2024-01-03
+5.0.0,Organism,all,ncbitaxon,2023-06-20
+5.0.0,Phenotype,all,pato,2023-05-18
+5.0.0,Tissue,all,uberon,2024-01-18
+5.0.0,Gene,human,ensembl,release-110
+5.0.0,Gene,mouse,ensembl,release-110
+5.1.0,CellType,all,cl,2024-04-05
+5.1.0,ExperimentalFactor,all,efo,3.65.0
+5.1.0,Ethnicity,human,hancestro,3.0
+5.1.0,DevelopmentalStage,human,hsapdv,2020-03-10
+5.1.0,DevelopmentalStage,mouse,mmusdv,2020-03-10
+5.1.0,Disease,all,mondo,2024-05-08
+5.1.0,Organism,all,ncbitaxon,2023-06-20
+5.1.0,Phenotype,all,pato,2023-05-18
+5.1.0,Tissue,all,uberon,2024-03-22
+5.1.0,Gene,human,ensembl,release-110
+5.1.0,Gene,mouse,ensembl,release-110
+5.2.0,CellType,all,cl,2024-08-16
+5.2.0,ExperimentalFactor,all,efo,3.69.0
+5.2.0,Ethnicity,human,hancestro,3.0
+5.2.0,DevelopmentalStage,human,hsapdv,2024-05-28
+5.2.0,DevelopmentalStage,mouse,mmusdv,2024-05-28
+5.2.0,Disease,all,mondo,2024-08-06
+5.2.0,Organism,all,ncbitaxon,2023-06-20
+5.2.0,Phenotype,all,pato,2023-05-18
+5.2.0,Tissue,all,uberon,2024-08-07
+5.2.0,Gene,human,ensembl,release-110
+5.2.0,Gene,mouse,ensembl,release-110

lamindb/models/_feature_manager.py CHANGED Viewed

@@ -5,7 +5,7 @@ from collections import defaultdict
 from collections.abc import Iterable
 from datetime import date, datetime
 from itertools import compress
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, MutableMapping
 import anndata as ad
 import numpy as np
@@ -201,7 +201,11 @@ def _get_categoricals(
             if hasattr(link, "feature_id") and link.feature_id is not None:
                 feature = Feature.objects.using(self._state.db).get(id=link.feature_id)
                 link_attr = get_link_attr(link, self)
-                label_name = getattr(link, link_attr).name
+                label = getattr(link, link_attr)
+                name_attr = (
+                    "name" if hasattr(label, "name") else label.__class__._name_field
+                )
+                label_name = getattr(label, name_attr)
                 result[(feature.name, feature.dtype)].add(label_name)
     return dict(result)
@@ -1137,10 +1141,25 @@ def _add_set_from_anndata(
     self._host.save()
+def _unify_staged_feature_sets_by_hash(
+    feature_sets: MutableMapping[str, Schema],
+):
+    """Make keys whose schemas share a hash point to one schema object.
+    The mapping is updated in place and also returned; for every hash, the
+    first schema encountered in iteration order is the one that is kept.
+    """
+    canonical_by_hash: dict[str, Any] = {}
+    for key, schema in feature_sets.items():
+        # Assuming each schema has a .hash attribute
+        canonical = canonical_by_hash.setdefault(schema.hash, schema)
+        if canonical is not schema:
+            feature_sets[key] = canonical
+    return feature_sets
 def _add_set_from_mudata(
     self,
-    var_fields: dict[str, FieldAttr],
-    obs_fields: dict[str, FieldAttr] = None,
+    var_fields: dict[str, FieldAttr] | None = None,
+    obs_fields: dict[str, FieldAttr] | None = None,
     mute: bool = False,
     organism: str | Record | None = None,
 ):
@@ -1152,6 +1171,7 @@ def _add_set_from_mudata(
     # parse and register features
     mdata = self._host.load()
     feature_sets = {}
     obs_features = Feature.from_values(mdata.obs.columns)  # type: ignore
     if len(obs_features) > 0:
         feature_sets["obs"] = Schema(features=obs_features)
@@ -1166,20 +1186,50 @@ def _add_set_from_mudata(
         for k, v in modality_fs.items():
             feature_sets[f"['{modality}'].{k}"] = v
-    def unify_staged_feature_sets_by_hash(feature_sets):
-        unique_values = {}
+    # link feature sets
+    self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
+    self._host.save()
-        for key, value in feature_sets.items():
-            value_hash = value.hash  # Assuming each value has a .hash attribute
-            if value_hash in unique_values:
-                feature_sets[key] = unique_values[value_hash]
-            else:
-                unique_values[value_hash] = value
+def _add_set_from_spatialdata(
+    self,
+    sample_metadata_key: str,
+    sample_metadata_field: FieldAttr = Feature.name,
+    var_fields: dict[str, FieldAttr] | None = None,
+    obs_fields: dict[str, FieldAttr] | None = None,
+    mute: bool = False,
+    organism: str | Record | None = None,
+):
+    """Add features from SpatialData.
+    Loads the host artifact, builds one schema from the columns of the sample
+    metadata stored under `sample_metadata_key`, parses the schemas of every
+    table listed in `var_fields`, and saves the host with the staged schemas.
+    Args:
+        sample_metadata_key: Key passed to `sdata.get_attrs` to fetch the sample
+            metadata; also used as the key of the resulting schema.
+        sample_metadata_field: Field passed to `Feature.from_values` for the
+            sample metadata columns.
+        var_fields: Mapping of table name to the field used for that table's var.
+            Only tables listed here are parsed.
+        obs_fields: Mapping of table name to the field used for that table's obs;
+            tables without an entry use `Feature.name`.
+        mute: Forwarded to `parse_staged_feature_sets_from_anndata`.
+        organism: Forwarded to `parse_staged_feature_sets_from_anndata`.
+    """
+    obs_fields, var_fields = obs_fields or {}, var_fields or {}
+    assert self._host.otype == "SpatialData"  # noqa: S101
+    # parse and register features
+    sdata = self._host.load()
+    feature_sets = {}
-        return feature_sets
+    # sample features
+    sample_features = Feature.from_values(
+        sdata.get_attrs(key=sample_metadata_key, return_as="df", flatten=True).columns,
+        field=sample_metadata_field,
+    )  # type: ignore
+    if len(sample_features) > 0:
+        feature_sets[sample_metadata_key] = Schema(features=sample_features)
+    # table features
+    for table, field in var_fields.items():
+        table_fs = parse_staged_feature_sets_from_anndata(
+            sdata[table],
+            var_field=field,
+            obs_field=obs_fields.get(table, Feature.name),
+            mute=mute,
+            organism=organism,
+        )
+        for k, v in table_fs.items():
+            feature_sets[f"['{table}'].{k}"] = v
     # link feature sets
-    self._host._staged_feature_sets = unify_staged_feature_sets_by_hash(feature_sets)
+    self._host._staged_feature_sets = _unify_staged_feature_sets_by_hash(feature_sets)
     self._host.save()
@@ -1311,6 +1361,7 @@ FeatureManager._accessor_by_registry = _accessor_by_registry
 FeatureManager._add_set_from_df = _add_set_from_df
 FeatureManager._add_set_from_anndata = _add_set_from_anndata
 FeatureManager._add_set_from_mudata = _add_set_from_mudata
+FeatureManager._add_set_from_spatialdata = _add_set_from_spatialdata
 FeatureManager._add_from = _add_from
 FeatureManager.filter = filter
 FeatureManager.get = get

lamindb 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

lamindb 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl