PyPI - lamindb - Versions diffs - 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl - Mend

lamindb 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

lamindb/__init__.py +1 -1
lamindb/_view.py +2 -2
lamindb/base/types.py +50 -11
lamindb/core/types.py +1 -1
lamindb/curators/__init__.py +232 -222
lamindb/curators/_cellxgene_schemas/__init__.py +1 -1
lamindb/models/_feature_manager.py +21 -28
lamindb/models/_from_values.py +53 -97
lamindb/models/_label_manager.py +17 -10
lamindb/models/artifact.py +30 -6
lamindb/models/can_curate.py +20 -20
lamindb/models/feature.py +47 -48
lamindb/models/record.py +29 -25
lamindb/models/run.py +4 -8
lamindb/models/schema.py +7 -7
{lamindb-1.3.0.dist-info → lamindb-1.3.1.dist-info}/METADATA +3 -3
{lamindb-1.3.0.dist-info → lamindb-1.3.1.dist-info}/RECORD +19 -19
{lamindb-1.3.0.dist-info → lamindb-1.3.1.dist-info}/LICENSE +0 -0
{lamindb-1.3.0.dist-info → lamindb-1.3.1.dist-info}/WHEEL +0 -0

lamindb/curators/_cellxgene_schemas/__init__.py CHANGED Viewed

@@ -113,7 +113,7 @@ def _create_sources(
             if source is None:
                 logger.error(
                     f"Could not find source: {entity}\n"
-                    "    → consider running `bionty.core.sync_all_sources_to_latest()` and re-connect to your instance"
+                    "    → consider running `bionty.core.sync_public_sources()`"
                 )
             return source

lamindb/models/_feature_manager.py CHANGED Viewed

@@ -24,7 +24,7 @@ from lamindb.core.storage import LocalPathClasses
 from lamindb.errors import DoesNotExist, ValidationError
 from lamindb.models._from_values import _format_values
 from lamindb.models.feature import (
-    convert_pandas_dtype_to_lamin_dtype,
+    serialize_pandas_dtype,
     suggest_categorical_for_str_iterable,
 )
 from lamindb.models.record import (
@@ -485,6 +485,7 @@ def parse_staged_feature_sets_from_anndata(
     adata: AnnData,
     var_field: FieldAttr | None = None,
     obs_field: FieldAttr = Feature.name,
+    uns_field: FieldAttr | None = None,
     mute: bool = False,
     organism: str | Record | None = None,
 ) -> dict:
@@ -501,15 +502,9 @@ def parse_staged_feature_sets_from_anndata(
             data_parse = ad.read_h5ad(filepath, backed="r")
         type = "float"
     else:
-        type = (
-            "float"
-            if adata.X is None
-            else convert_pandas_dtype_to_lamin_dtype(adata.X.dtype)
-        )
+        type = "float" if adata.X is None else serialize_pandas_dtype(adata.X.dtype)
     feature_sets = {}
     if var_field is not None:
-        logger.info("parsing feature names of X stored in slot 'var'")
-        logger.indent = "   "
         schema_var = Schema.from_values(
             data_parse.var.index,
             var_field,
@@ -520,13 +515,7 @@ def parse_staged_feature_sets_from_anndata(
         )
         if schema_var is not None:
             feature_sets["var"] = schema_var
-            logger.save(f"linked: {schema_var}")
-        logger.indent = ""
-        if schema_var is None:
-            logger.warning("skip linking features to artifact in slot 'var'")
-    if len(data_parse.obs.columns) > 0:
-        logger.info("parsing feature names of slot 'obs'")
-        logger.indent = "   "
+    if obs_field is not None and len(data_parse.obs.columns) > 0:
         schema_obs = Schema.from_df(
             df=data_parse.obs,
             field=obs_field,
@@ -535,10 +524,13 @@ def parse_staged_feature_sets_from_anndata(
         )
         if schema_obs is not None:
             feature_sets["obs"] = schema_obs
-            logger.save(f"linked: {schema_obs}")
-        logger.indent = ""
-        if schema_obs is None:
-            logger.warning("skip linking features to artifact in slot 'obs'")
+    if uns_field is not None and len(data_parse.uns) > 0:
+        validated_features = Feature.from_values(  # type: ignore
+            data_parse.uns.keys(), field=uns_field, organism=organism
+        )
+        if len(validated_features) > 0:
+            schema_uns = Schema(validated_features, dtype=None, otype="dict")
+            feature_sets["uns"] = schema_uns
     return feature_sets
@@ -575,7 +567,7 @@ def infer_feature_type_convert_json(
             return "cat ? str", value, message
     elif isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
         if isinstance(value, (pd.Series, np.ndarray, pd.Categorical)):
-            dtype = convert_pandas_dtype_to_lamin_dtype(value.dtype)
+            dtype = serialize_pandas_dtype(value.dtype)
             if dtype == "str":
                 # ndarray doesn't know categorical, so there was no conscious choice
                 # offer both options
@@ -848,7 +840,7 @@ def _add_values(
                 )
     validated = registry.validate(keys, field=feature_param_field, mute=True)
     keys_array = np.array(keys)
-    validated_keys = keys_array[validated]
+    keys_array[validated]
     if validated.sum() != len(keys):
         not_validated_keys = keys_array[~validated]
         not_validated_keys_dtype_message = [
@@ -874,10 +866,7 @@ def _add_values(
             f"Here is how to create a {model_name.lower()}:\n\n{hint}"
         )
         raise ValidationError(msg)
-    registry.from_values(
-        validated_keys,
-        field=feature_param_field,
-    )
     # figure out which of the values go where
     features_labels = defaultdict(list)
     _feature_values = []
@@ -937,12 +926,14 @@ def _add_values(
                 if "ULabel" not in feature.dtype:
                     feature.dtype += "[ULabel]"
                     feature.save()
-                validated = ULabel.validate(values, field="name", mute=True)
+                validated = ULabel.validate(values, field=ULabel.name, mute=True)
                 values_array = np.array(values)
                 validated_values = values_array[validated]
                 if validated.sum() != len(values):
                     not_validated_values += values_array[~validated].tolist()
-                label_records = ULabel.from_values(validated_values, field="name")  # type: ignore
+                label_records = ULabel.from_values(
+                    validated_values, field=ULabel.name, mute=True
+                )  # type: ignore
                 features_labels["ULabel"] += [
                     (feature, label_record) for label_record in label_records
                 ]
@@ -1120,6 +1111,7 @@ def _add_set_from_anndata(
     self,
     var_field: FieldAttr | None = None,
     obs_field: FieldAttr | None = Feature.name,
+    uns_field: FieldAttr | None = None,
     mute: bool = False,
     organism: str | Record | None = None,
 ):
@@ -1132,6 +1124,7 @@ def _add_set_from_anndata(
         adata,
         var_field=var_field,
         obs_field=obs_field,
+        uns_field=uns_field,
         mute=mute,
         organism=organism,
     )
@@ -1255,7 +1248,7 @@ def _add_from(self, data: Artifact | Collection, transfer_logs: dict = None):
         # create records from ontology_id
         if hasattr(registry, "_ontology_id_field") and len(member_uids) > 0:
             # create from bionty
-            members_records = registry.from_values(member_uids, field=field)
+            members_records = registry.from_values(member_uids, field=field, mute=True)
             save([r for r in members_records if r._state.adding])
         validated = registry.validate(member_uids, field=field, mute=True)
         new_members_uids = list(compress(member_uids, ~validated))

lamindb/models/_from_values.py CHANGED Viewed

@@ -1,15 +1,11 @@
 from __future__ import annotations
-import re
 from typing import TYPE_CHECKING
 import pandas as pd
-from django.core.exceptions import FieldDoesNotExist
 from lamin_utils import colors, logger
 if TYPE_CHECKING:
-    from collections.abc import Iterable
     from lamindb.base.types import FieldAttr, ListLike
     from .query_set import RecordList
@@ -30,7 +26,7 @@ def _from_values(
     from .query_set import RecordList
     registry = field.field.model  # type: ignore
-    organism_record = _get_organism_record(field, organism, values=iterable)
+    organism_record = get_organism_record_from_field(field, organism, values=iterable)
     # TODO: the create is problematic if field is not a name field
     if create:
         create_kwargs = {}
@@ -55,15 +51,17 @@ def _from_values(
     # new records to be created based on new values
     if len(nonexist_values) > 0:
-        if hasattr(registry, "source_id"):
+        if registry.__base__.__name__ == "BioRecord":
+            from bionty._organism import is_organism_required
             # if can and needed, get organism record from the existing records
             if (
                 organism_record is None
                 and len(records) > 0
-                and _is_organism_required(registry)
+                and is_organism_required(registry)
             ):
                 organism_record = records[0].organism
-            records_bionty, unmapped_values = create_records_from_source(
+            records_public, unmapped_values = create_records_from_source(
                 iterable_idx=nonexist_values,
                 field=field,
                 organism=organism_record,
@@ -71,11 +69,11 @@ def _from_values(
                 msg=msg,
                 mute=mute,
             )
-            if len(records_bionty) > 0:
+            if len(records_public) > 0:
                 msg = ""
-            for record in records_bionty:
+            for record in records_public:
                 record._from_source = True
-            records += records_bionty
+            records += records_public
         else:
             unmapped_values = nonexist_values
         # unmapped new_ids will NOT create records
@@ -187,25 +185,26 @@ def create_records_from_source(
     """Create records from source."""
     model = field.field.model  # type: ignore
     records: list = []
-    # populate additional fields from bionty
-    from bionty._bionty import get_source_record
-    from bionty.core._bionty import filter_bionty_df_columns
+    # populate additional fields from public_df
+    from bionty._source import filter_public_df_columns, get_source_record
+    # get the default source
+    source_record = get_source_record(model, organism, source)
-    # create the corresponding bionty object from model
+    # create the corresponding PublicOntology object from model
     try:
-        # TODO: more generic
-        public_ontology = model.public(organism=organism, source=source)
+        public_ontology = model.public(source=source_record)
     except Exception:
-        # for custom records that are not created from public sources
+        # no public source
         return records, iterable_idx
-    # get the default source
-    if source is None:
-        source = get_source_record(public_ontology, model)
-    # filter the columns in bionty df based on fields
-    bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)
+    # filter the columns in public df based on fields
+    public_df = filter_public_df_columns(model=model, public_ontology=public_ontology)
+    if public_df.empty:
+        return records, iterable_idx
-    # standardize in the bionty reference
+    # standardize in the public reference
     # do not inspect synonyms if the field is not name field
     inspect_synonyms = True
     if hasattr(model, "_name_field") and field.field.name != model._name_field:  # type: ignore
@@ -231,30 +230,30 @@ def create_records_from_source(
         iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
-    # create records for values that are found in the bionty reference
+    # create records for values that are found in the public reference
     # matching either field or synonyms
-    mapped_values = iterable_idx.intersection(bionty_df[field.field.name])  # type: ignore
+    mapped_values = iterable_idx.intersection(public_df[field.field.name])  # type: ignore
     multi_msg = ""
     if len(mapped_values) > 0:
-        bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
+        public_kwargs, multi_msg = _bulk_create_dicts_from_df(
             keys=mapped_values,
             column_name=field.field.name,  # type: ignore
-            df=bionty_df,
+            df=public_df,
         )
         # this here is needed when the organism is required to create new records
         if organism is None:
-            organism = _get_organism_record(
-                field, source.organism, values=mapped_values
+            organism = get_organism_record_from_field(
+                field, source_record.organism, values=mapped_values
             )
         create_kwargs = (
-            {"organism": organism, "source": source}
+            {"organism": organism, "source": source_record}
             if organism is not None
-            else {"source": source}
+            else {"source": source_record}
         )
-        for bk in bionty_kwargs:
+        for bk in public_kwargs:
             records.append(model(**bk, **create_kwargs, _skip_validation=True))
         # number of records that matches field (not synonyms)
@@ -279,12 +278,12 @@ def create_records_from_source(
     if len(multi_msg) > 0 and not mute:
         logger.warning(multi_msg)
-    # return the values that are not found in the bionty reference
+    # return the values that are not found in the public reference
     unmapped_values = iterable_idx.difference(mapped_values)
     return records, unmapped_values
-def index_iterable(iterable: Iterable) -> pd.Index:
+def index_iterable(iterable: ListLike) -> pd.Index:
     """Get unique values from an iterable."""
     idx = pd.Index(iterable).unique()
     # No entries are made for NAs, '', None
@@ -293,7 +292,7 @@ def index_iterable(iterable: Iterable) -> pd.Index:
 def _format_values(
-    names: Iterable, n: int = 20, quotes: bool = True, sep: str = "'"
+    names: ListLike, n: int = 20, quotes: bool = True, sep: str = "'"
 ) -> str:
     """Format values for printing."""
     if isinstance(names, dict):
@@ -340,36 +339,10 @@ def _bulk_create_dicts_from_df(
     return df.reset_index().to_dict(orient="records"), multi_msg
-def _is_organism_required(registry: type[Record]) -> bool:
-    """Check if the registry has an organism field and is required.
-    Returns:
-        True if the registry has an organism field and is required, False otherwise.
-    """
-    try:
-        organism_field = registry._meta.get_field("organism")
-        # organism is not required or not a relation
-        if organism_field.null or not organism_field.is_relation:
-            return False
-        else:
-            return True
-    except FieldDoesNotExist:
-        return False
-def _is_simple_field_unique(field: FieldAttr) -> bool:
-    """Check if the field is an id field."""
-    # id field is a unique field that's not a relation
-    field = field.field
-    if field.unique and not field.is_relation:
-        return True
-    return False
-def _get_organism_record(  # type: ignore
+def get_organism_record_from_field(  # type: ignore
     field: FieldAttr,
     organism: str | Record | None = None,
-    values: Iterable = [],
+    values: ListLike = None,
     using_key: str | None = None,
 ) -> Record | None:
     """Get organism record.
@@ -385,45 +358,28 @@ def _get_organism_record(  # type: ignore
             The organism FK is required for the registry
             The field is not unique or the organism is not None
     """
+    if values is None:
+        values = []
     registry = field.field.model
     field_str = field.field.name
-    check = not _is_simple_field_unique(field=field) or organism is not None
+    # id field is a unique field that's not a relation
+    is_simple_field_unique = field.field.unique and not field.field.is_relation
+    check = not is_simple_field_unique or organism is not None
-    if field_str == "ensembl_gene_id" and len(values) > 0 and organism is None:  # type: ignore
-        return _organism_from_ensembl_id(values[0], using_key)  # type: ignore
+    if (
+        registry.__get_name_with_module__() == "bionty.Gene"
+        and field.field.name == "ensembl_gene_id"
+        and len(values) > 0
+        and organism is None
+    ):  # type: ignore
+        from bionty._organism import organism_from_ensembl_id
-    if _is_organism_required(registry) and check:
-        from bionty._bionty import create_or_get_organism_record
+        return organism_from_ensembl_id(values[0], using_key)  # type: ignore
+    if registry.__base__.__name__ == "BioRecord" and check:
+        from bionty._organism import create_or_get_organism_record
         organism_record = create_or_get_organism_record(
             organism=organism, registry=registry, field=field_str
         )
-        if organism_record is not None:
-            return organism_record.save()
-def _organism_from_ensembl_id(id: str, using_key: str | None) -> Record | None:  # type: ignore
-    """Get organism record from ensembl id."""
-    import bionty as bt
-    from bionty.base.dev._io import s3_bionty_assets
-    localpath = s3_bionty_assets(
-        ".lamindb/0QeqXlKq9aqW8aqe0000.parquet", bt.base.settings.versionsdir
-    )
-    ensembl_prefixes = pd.read_parquet(localpath).set_index("gene_prefix")
-    prefix = re.sub(r"\d+", "", id)
-    if prefix in ensembl_prefixes.index:
-        organism_name = ensembl_prefixes.loc[prefix, "name"].lower()
-        using_key = None if using_key == "default" else using_key
-        organism_record = (
-            bt.Organism.using(using_key).filter(name=organism_name).one_or_none()
-        )
-        if organism_record is None:
-            organism_record = bt.Organism.from_source(name=organism_name)
-            if organism_record is not None:
-                organism_record.save(using=using_key)
         return organism_record

lamindb/models/_label_manager.py CHANGED Viewed

@@ -142,7 +142,7 @@ def _save_validated_records(
     # save labels from ontology_ids
     if hasattr(registry, "_ontology_id_field") and label_uids:
         try:
-            records = registry.from_values(label_uids, field=field)
+            records = registry.from_values(label_uids, field=field, mute=True)
             save([r for r in records if r._state.adding])
         except Exception:  # noqa: S110
             pass
@@ -240,7 +240,7 @@ class LabelManager:
                 continue
             # look for features
             data_name_lower = data.__class__.__name__.lower()
-            labels_by_features = defaultdict(list)
+            labels_by_features: dict = defaultdict(list)
             features = set()
             new_labels = save_validated_records(labels)
             if len(new_labels) > 0:
@@ -248,18 +248,24 @@ class LabelManager:
                     new_labels, using_key, transfer_logs=transfer_logs
                 )
             for label in labels:
+                keys: list = []
                 # if the link table doesn't follow this convention, we'll ignore it
                 if not hasattr(label, f"links_{data_name_lower}"):
                     key = None
+                    keys.append(key)
                 else:
-                    link = getattr(label, f"links_{data_name_lower}").get(
-                        **{f"{data_name_lower}_id": data.id}
+                    links = (
+                        getattr(label, f"links_{data_name_lower}")
+                        .filter(**{f"{data_name_lower}_id": data.id})
+                        .all()
                     )
-                    if link.feature is not None:
-                        features.add(link.feature)
-                        key = link.feature.name
-                    else:
-                        key = None
+                    for link in links:
+                        if link.feature is not None:
+                            features.add(link.feature)
+                            key = link.feature.name
+                        else:
+                            key = None
+                        keys.append(key)
                 label_returned = transfer_to_default_db(
                     label,
                     using_key,
@@ -270,7 +276,8 @@ class LabelManager:
                 # TODO: refactor return value of transfer to default db
                 if label_returned is not None:
                     label = label_returned
-                labels_by_features[key].append(label)
+                for key in keys:
+                    labels_by_features[key].append(label)
             # treat features
             new_features = save_validated_records(list(features))
             if len(new_features) > 0:

lamindb/models/artifact.py CHANGED Viewed

@@ -16,6 +16,7 @@ from django.db.models import CASCADE, PROTECT, Q
 from lamin_utils import colors, logger
 from lamindb_setup import settings as setup_settings
 from lamindb_setup._init_instance import register_storage_in_instance
+from lamindb_setup.core import doc_args
 from lamindb_setup.core._settings_storage import init_storage
 from lamindb_setup.core.hashing import HASH_LENGTH, hash_dir, hash_file
 from lamindb_setup.core.types import UPathStr
@@ -93,6 +94,8 @@ WARNING_RUN_TRANSFORM = "no run & transform got linked, call `ln.track()` & re-r
 WARNING_NO_INPUT = "run input wasn't tracked, call `ln.track()` and re-run"
+DEBUG_KWARGS_DOC = "**kwargs: Internal arguments for debugging."
 try:
     from ..core.storage._zarr import identify_zarr_type
 except ImportError:
@@ -1428,7 +1431,11 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             kwargs["uid"] = uid
         # only set key now so that we don't do a look-up on it in case revises is passed
-        if revises is not None:
+        if revises is not None and revises.key is not None:
+            assert revises.key.endswith(kwargs["suffix"]), (  # noqa: S101
+                revises.key,
+                kwargs["suffix"],
+            )
             kwargs["key"] = revises.key
         kwargs["kind"] = kind
@@ -2010,6 +2017,10 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         # no need to upload if new file is already in storage
         self._to_store = not check_path_in_storage
+        # update old suffix with the new one so that checks in record pass
+        # replace() supports changing the suffix
+        self._old_suffix = self.suffix
     def open(
         self, mode: str = "r", is_run_input: bool | None = None, **kwargs
     ) -> Union[
@@ -2146,13 +2157,16 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         _track_run_input(self, is_run_input)
         return access
-    def load(self, is_run_input: bool | None = None, **kwargs) -> Any:
+    def load(
+        self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs
+    ) -> Any:
         """Cache and load into memory.
         See all :mod:`~lamindb.core.loaders`.
         Args:
             is_run_input: Whether to track this artifact as run input.
+            mute: Silence logging of caching progress.
             **kwargs: Keyword arguments for the loader.
         Examples:
@@ -2188,7 +2202,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
             filepath, cache_key = filepath_cache_key_from_artifact(
                 self, using_key=settings._using_key
             )
-            cache_path = _synchronize_cleanup_on_error(filepath, cache_key=cache_key)
+            cache_path = _synchronize_cleanup_on_error(
+                filepath, cache_key=cache_key, print_progress=not mute
+            )
             try:
                 # cache_path is local so doesn't trigger any sync in load_to_memory
                 access_memory = load_to_memory(cache_path, **kwargs)
@@ -2209,14 +2225,17 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                     cache_path.unlink(missing_ok=True)
                 # download again and try to load into memory
                 cache_path = _synchronize_cleanup_on_error(
-                    filepath, cache_key=cache_key
+                    filepath, cache_key=cache_key, print_progress=not mute
                 )
                 access_memory = load_to_memory(cache_path, **kwargs)
         # only call if load is successfull
         _track_run_input(self, is_run_input)
         return access_memory
-    def cache(self, is_run_input: bool | None = None, **kwargs) -> Path:
+    @doc_args(DEBUG_KWARGS_DOC)
+    def cache(
+        self, *, is_run_input: bool | None = None, mute: bool = False, **kwargs
+    ) -> Path:
         """Download cloud artifact to local cache.
         Follows synching logic: only caches an artifact if it's outdated in the local cache.
@@ -2224,8 +2243,9 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         Returns a path to a locally cached on-disk object (say a `.jpg` file).
         Args:
+            mute: Silence logging of caching progress.
             is_run_input: Whether to track this artifact as run input.
-            **kwargs: Keyword arguments for synchronization.
+            {}
         Example::
@@ -2241,6 +2261,8 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
         filepath, cache_key = filepath_cache_key_from_artifact(
             self, using_key=settings._using_key
         )
+        if mute:
+            kwargs["print_progress"] = False
         cache_path = _synchronize_cleanup_on_error(
             filepath, cache_key=cache_key, **kwargs
         )
@@ -2368,11 +2390,13 @@ class Artifact(Record, IsVersioned, TracksRun, TracksUpdates):
                 if delete_msg != "did-not-delete":
                     logger.success(f"deleted {colors.yellow(f'{path}')}")
+    @doc_args(DEBUG_KWARGS_DOC)
     def save(self, upload: bool | None = None, **kwargs) -> Artifact:
         """Save to database & storage.
         Args:
             upload: Trigger upload to cloud storage in instances with hybrid storage mode.
+            {}
         Example::

lamindb 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

lamindb 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl