PyPI - lamindb - Versions diffs - 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl - Mend

lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

lamindb/__init__.py +3 -1
lamindb/_view.py +2 -2
lamindb/base/types.py +50 -11
lamindb/core/_compat.py +60 -0
lamindb/core/_context.py +15 -12
lamindb/core/datasets/__init__.py +1 -0
lamindb/core/datasets/_core.py +23 -0
lamindb/core/datasets/_small.py +16 -2
lamindb/core/loaders.py +22 -12
lamindb/core/storage/_tiledbsoma.py +2 -2
lamindb/core/storage/_zarr.py +84 -26
lamindb/core/storage/objects.py +45 -44
lamindb/core/types.py +11 -1
lamindb/curators/__init__.py +1430 -1665
lamindb/curators/_cellxgene_schemas/__init__.py +190 -18
lamindb/curators/_cellxgene_schemas/schema_versions.csv +43 -0
lamindb/models/_feature_manager.py +86 -42
lamindb/models/_from_values.py +110 -119
lamindb/models/_label_manager.py +17 -10
lamindb/models/artifact.py +170 -102
lamindb/models/can_curate.py +200 -231
lamindb/models/feature.py +76 -47
lamindb/models/project.py +69 -7
lamindb/models/query_set.py +12 -2
lamindb/models/record.py +77 -50
lamindb/models/run.py +20 -7
lamindb/models/schema.py +7 -15
{lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/METADATA +8 -7
{lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/RECORD +31 -30
lamindb/curators/_cellxgene_schemas/schema_versions.yml +0 -104
{lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/LICENSE +0 -0
{lamindb-1.2a2.dist-info → lamindb-1.3.1.dist-info}/WHEEL +0 -0

lamindb/models/_from_values.py CHANGED Viewed

@@ -3,26 +3,21 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 import pandas as pd
-from django.core.exceptions import FieldDoesNotExist
 from lamin_utils import colors, logger
-from .record import Record
 if TYPE_CHECKING:
-    from collections.abc import Iterable
-    from lamindb.base.types import ListLike, StrField
+    from lamindb.base.types import FieldAttr, ListLike
     from .query_set import RecordList
+    from .record import Record
 # The base function for `from_values`
-def get_or_create_records(
+def _from_values(
     iterable: ListLike,
-    field: StrField,
+    field: FieldAttr,
     *,
     create: bool = False,
-    from_source: bool = False,
     organism: Record | str | None = None,
     source: Record | None = None,
     mute: bool = False,
@@ -31,68 +26,67 @@ def get_or_create_records(
     from .query_set import RecordList
     registry = field.field.model  # type: ignore
+    organism_record = get_organism_record_from_field(field, organism, values=iterable)
+    # TODO: the create is problematic if field is not a name field
     if create:
-        return RecordList([registry(**{field.field.name: value}) for value in iterable])  # type: ignore
-    organism = _get_organism_record(field, organism)
+        create_kwargs = {}
+        if organism_record:
+            create_kwargs["organism"] = organism_record
+        return RecordList(
+            [
+                registry(**{field.field.name: value}, **create_kwargs)
+                for value in iterable
+            ]
+        )  # type: ignore
     iterable_idx = index_iterable(iterable)
     # returns existing records & non-existing values
     records, nonexist_values, msg = get_existing_records(
         iterable_idx=iterable_idx,
         field=field,
-        organism=organism,
+        organism=organism_record,
         mute=mute,
     )
     # new records to be created based on new values
     if len(nonexist_values) > 0:
-        source_record = None
-        if from_source:
-            if isinstance(source, Record):
-                source_record = source
-        if not source_record and hasattr(registry, "public"):
-            if organism is None:
-                organism = _ensembl_prefix(nonexist_values[0], field, organism)
-                organism = _get_organism_record(field, organism, force=True)
-        if source_record:
-            from bionty.core._add_ontology import check_source_in_db
-            check_source_in_db(registry=registry, source=source_record)
-            from_source = not source_record.in_db
-        elif hasattr(registry, "source_id"):
-            from_source = True
-        else:
-            from_source = False
-        if from_source:
-            records_bionty, unmapped_values = create_records_from_source(
+        if registry.__base__.__name__ == "BioRecord":
+            from bionty._organism import is_organism_required
+            # if can and needed, get organism record from the existing records
+            if (
+                organism_record is None
+                and len(records) > 0
+                and is_organism_required(registry)
+            ):
+                organism_record = records[0].organism
+            records_public, unmapped_values = create_records_from_source(
                 iterable_idx=nonexist_values,
                 field=field,
-                organism=organism,
-                source=source_record,
+                organism=organism_record,
+                source=source,
                 msg=msg,
                 mute=mute,
             )
-            if len(records_bionty) > 0:
+            if len(records_public) > 0:
                 msg = ""
-            for record in records_bionty:
+            for record in records_public:
                 record._from_source = True
-            records += records_bionty
+            records += records_public
         else:
             unmapped_values = nonexist_values
         # unmapped new_ids will NOT create records
         if len(unmapped_values) > 0:
+            # first log the success message
             if len(msg) > 0 and not mute:
                 logger.success(msg)
             s = "" if len(unmapped_values) == 1 else "s"
             print_values = colors.yellow(_format_values(unmapped_values))
-            name = registry.__name__
             n_nonval = colors.yellow(f"{len(unmapped_values)} non-validated")
             if not mute:
                 logger.warning(
-                    f"{colors.red('did not create')} {name} record{s} for "
+                    f"{colors.red('did not create')} {registry.__name__} record{s} for "
                     f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"  # type: ignore
                 )
     return RecordList(records)
@@ -100,25 +94,21 @@ def get_or_create_records(
 def get_existing_records(
     iterable_idx: pd.Index,
-    field: StrField,
+    field: FieldAttr,
     organism: Record | None = None,
     mute: bool = False,
-):
+) -> tuple[list, pd.Index, str]:
+    """Get existing records from the database."""
     # NOTE: existing records matching is agnostic to the source
     model = field.field.model  # type: ignore
-    if organism is None and field.field.name == "ensembl_gene_id":  # type: ignore
-        if len(iterable_idx) > 0:
-            organism = _ensembl_prefix(iterable_idx[0], field, organism)  # type: ignore
-            organism = _get_organism_record(field, organism, force=True)
-    # standardize based on the DB reference
     # log synonyms mapped terms
     syn_mapper = model.standardize(
         iterable_idx,
         field=field,
         organism=organism,
         mute=True,
-        public_aware=False,
+        source_aware=False,  # standardize only based on the DB reference
         return_mapper=True,
     )
     iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
@@ -137,7 +127,6 @@ def get_existing_records(
     is_validated = model.validate(
         iterable_idx, field=field, organism=organism, mute=True
     )
     if len(is_validated) > 0:
         validated = iterable_idx[is_validated]
     else:
@@ -151,7 +140,7 @@ def get_existing_records(
             msg = (
                 "loaded"
                 f" {colors.green(f'{len(validated)} {model.__name__} record{s}')}"
-                f" matching {colors.italic(f'{field.field.name}')}: {print_values}"  # type: ignore
+                f" matching {colors.italic(f'{field.field.name}')}: {print_values}"
             )
         if len(syn_mapper) > 0:
             s = "" if len(syn_mapper) == 1 else "s"
@@ -173,15 +162,13 @@ def get_existing_records(
         msg = ""
     # get all existing records in the db
-    # if necessary, create records for the values in kwargs
-    # k:v -> k:v_record
     query = {f"{field.field.name}__in": iterable_idx.values}  # type: ignore
     if organism is not None:
         query["organism"] = organism
     records = model.filter(**query).list()
     if len(validated) == len(iterable_idx):
-        return records, [], msg
+        return records, pd.Index([]), msg
     else:
         nonval_values = iterable_idx.difference(validated)
         return records, nonval_values, msg
@@ -189,33 +176,35 @@ def get_existing_records(
 def create_records_from_source(
     iterable_idx: pd.Index,
-    field: StrField,
+    field: FieldAttr,
     organism: Record | None = None,
     source: Record | None = None,
     msg: str = "",
     mute: bool = False,
-):
+) -> tuple[list, pd.Index]:
+    """Create records from source."""
     model = field.field.model  # type: ignore
     records: list = []
-    # populate additional fields from bionty
-    from bionty._bionty import get_source_record
-    from bionty.core._bionty import filter_bionty_df_columns
+    # populate additional fields from public_df
+    from bionty._source import filter_public_df_columns, get_source_record
+    # get the default source
+    source_record = get_source_record(model, organism, source)
-    # create the corresponding bionty object from model
+    # create the corresponding PublicOntology object from model
     try:
-        # TODO: more generic
-        public_ontology = model.public(organism=organism, source=source)
+        public_ontology = model.public(source=source_record)
     except Exception:
-        # for custom records that are not created from public sources
+        # no public source
         return records, iterable_idx
-    # get the default source
-    if source is None:
-        source = get_source_record(public_ontology, model)
-    # filter the columns in bionty df based on fields
-    bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)
+    # filter the columns in public df based on fields
+    public_df = filter_public_df_columns(model=model, public_ontology=public_ontology)
+    if public_df.empty:
+        return records, iterable_idx
-    # standardize in the bionty reference
+    # standardize in the public reference
     # do not inspect synonyms if the field is not name field
     inspect_synonyms = True
     if hasattr(model, "_name_field") and field.field.name != model._name_field:  # type: ignore
@@ -241,27 +230,30 @@ def create_records_from_source(
         iterable_idx = iterable_idx.to_frame().rename(index=syn_mapper).index
-    # create records for values that are found in the bionty reference
+    # create records for values that are found in the public reference
     # matching either field or synonyms
-    mapped_values = iterable_idx.intersection(bionty_df[field.field.name])  # type: ignore
+    mapped_values = iterable_idx.intersection(public_df[field.field.name])  # type: ignore
     multi_msg = ""
     if len(mapped_values) > 0:
-        bionty_kwargs, multi_msg = _bulk_create_dicts_from_df(
+        public_kwargs, multi_msg = _bulk_create_dicts_from_df(
             keys=mapped_values,
             column_name=field.field.name,  # type: ignore
-            df=bionty_df,
+            df=public_df,
         )
-        if hasattr(model, "organism_id") and organism is None:
-            organism = _get_organism_record(field, source.organism, force=True)
+        # this here is needed when the organism is required to create new records
+        if organism is None:
+            organism = get_organism_record_from_field(
+                field, source_record.organism, values=mapped_values
+            )
         create_kwargs = (
-            {"organism": organism, "source": source}
+            {"organism": organism, "source": source_record}
             if organism is not None
-            else {"source": source}
+            else {"source": source_record}
         )
-        for bk in bionty_kwargs:
+        for bk in public_kwargs:
             records.append(model(**bk, **create_kwargs, _skip_validation=True))
         # number of records that matches field (not synonyms)
@@ -286,12 +278,13 @@ def create_records_from_source(
     if len(multi_msg) > 0 and not mute:
         logger.warning(multi_msg)
-    # return the values that are not found in the bionty reference
+    # return the values that are not found in the public reference
     unmapped_values = iterable_idx.difference(mapped_values)
     return records, unmapped_values
-def index_iterable(iterable: Iterable) -> pd.Index:
+def index_iterable(iterable: ListLike) -> pd.Index:
+    """Get unique values from an iterable."""
     idx = pd.Index(iterable).unique()
     # No entries are made for NAs, '', None
     # returns an ordered unique not null list
@@ -299,8 +292,9 @@ def index_iterable(iterable: Iterable) -> pd.Index:
 def _format_values(
-    names: Iterable, n: int = 20, quotes: bool = True, sep: str = "'"
+    names: ListLike, n: int = 20, quotes: bool = True, sep: str = "'"
 ) -> str:
+    """Format values for printing."""
     if isinstance(names, dict):
         items = {
             f"{key}: {value}": None
@@ -345,50 +339,47 @@ def _bulk_create_dicts_from_df(
     return df.reset_index().to_dict(orient="records"), multi_msg
-def _has_organism_field(registry: type[Record]) -> bool:
-    try:
-        registry._meta.get_field("organism")
-        return True
-    except FieldDoesNotExist:
-        return False
-def _get_organism_record(  # type: ignore
-    field: StrField, organism: str | Record, force: bool = False
-) -> Record:
+def get_organism_record_from_field(  # type: ignore
+    field: FieldAttr,
+    organism: str | Record | None = None,
+    values: ListLike = None,
+    using_key: str | None = None,
+) -> Record | None:
     """Get organism record.
     Args:
         field: the field to get the organism record for
         organism: the organism to get the record for
-        force: whether to force fetching the organism record
-    """
-    registry = field.field.model  # type: ignore
-    check = True
-    if not force and hasattr(registry, "_ontology_id_field"):
-        check = field.field.name != registry._ontology_id_field  # type: ignore
-        # e.g. bionty.CellMarker has "name" as _ontology_id_field
-        if not registry._ontology_id_field.endswith("id"):
-            check = True
+        values: the values to get the organism record for
+        using_key: the db to get the organism record for
-    if _has_organism_field(registry) and check:
-        from bionty._bionty import create_or_get_organism_record
-        if field and not isinstance(field, str):
-            field = field.field.name
+    Returns:
+        The organism record if:
+            The organism FK is required for the registry
+            The field is not unique or the organism is not None
+    """
+    if values is None:
+        values = []
+    registry = field.field.model
+    field_str = field.field.name
+    # id field is a unique field that's not a relation
+    is_simple_field_unique = field.field.unique and not field.field.is_relation
+    check = not is_simple_field_unique or organism is not None
+    if (
+        registry.__get_name_with_module__() == "bionty.Gene"
+        and field.field.name == "ensembl_gene_id"
+        and len(values) > 0
+        and organism is None
+    ):  # type: ignore
+        from bionty._organism import organism_from_ensembl_id
+        return organism_from_ensembl_id(values[0], using_key)  # type: ignore
+    if registry.__base__.__name__ == "BioRecord" and check:
+        from bionty._organism import create_or_get_organism_record
         organism_record = create_or_get_organism_record(
-            organism=organism, registry=registry, field=field
+            organism=organism, registry=registry, field=field_str
         )
-        if organism_record is not None:
-            return organism_record
-def _ensembl_prefix(id: str, field: StrField, organism: Record | None) -> str | None:
-    if field.field.name == "ensembl_gene_id" and organism is None:  # type: ignore
-        if id.startswith("ENSG"):
-            organism = "human"  # type: ignore
-        elif id.startswith("ENSMUSG"):
-            organism = "mouse"  # type: ignore
-    return organism
+        return organism_record

lamindb/models/_label_manager.py CHANGED Viewed

@@ -142,7 +142,7 @@ def _save_validated_records(
     # save labels from ontology_ids
     if hasattr(registry, "_ontology_id_field") and label_uids:
         try:
-            records = registry.from_values(label_uids, field=field)
+            records = registry.from_values(label_uids, field=field, mute=True)
             save([r for r in records if r._state.adding])
         except Exception:  # noqa: S110
             pass
@@ -240,7 +240,7 @@ class LabelManager:
                 continue
             # look for features
             data_name_lower = data.__class__.__name__.lower()
-            labels_by_features = defaultdict(list)
+            labels_by_features: dict = defaultdict(list)
             features = set()
             new_labels = save_validated_records(labels)
             if len(new_labels) > 0:
@@ -248,18 +248,24 @@ class LabelManager:
                     new_labels, using_key, transfer_logs=transfer_logs
                 )
             for label in labels:
+                keys: list = []
                 # if the link table doesn't follow this convention, we'll ignore it
                 if not hasattr(label, f"links_{data_name_lower}"):
                     key = None
+                    keys.append(key)
                 else:
-                    link = getattr(label, f"links_{data_name_lower}").get(
-                        **{f"{data_name_lower}_id": data.id}
+                    links = (
+                        getattr(label, f"links_{data_name_lower}")
+                        .filter(**{f"{data_name_lower}_id": data.id})
+                        .all()
                     )
-                    if link.feature is not None:
-                        features.add(link.feature)
-                        key = link.feature.name
-                    else:
-                        key = None
+                    for link in links:
+                        if link.feature is not None:
+                            features.add(link.feature)
+                            key = link.feature.name
+                        else:
+                            key = None
+                        keys.append(key)
                 label_returned = transfer_to_default_db(
                     label,
                     using_key,
@@ -270,7 +276,8 @@ class LabelManager:
                 # TODO: refactor return value of transfer to default db
                 if label_returned is not None:
                     label = label_returned
-                labels_by_features[key].append(label)
+                for key in keys:
+                    labels_by_features[key].append(label)
             # treat features
             new_features = save_validated_records(list(features))
             if len(new_features) > 0:

lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl

lamindb 1.2a2__py3-none-any.whl → 1.3.1__py3-none-any.whl