lamindb 0.74.3__py3-none-any.whl → 0.75.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
lamindb/_curate.py CHANGED
@@ -9,7 +9,6 @@ from lamin_utils import colors, logger
  from lamindb_setup.core._docs import doc_args
  from lnschema_core import (
  Artifact,
- Collection,
  Feature,
  Record,
  Run,
@@ -92,6 +91,7 @@ class DataFrameCurator:
  using: The reference instance containing registries to validate against.
  verbosity: The verbosity level.
  organism: The organism name.
+ sources: A dictionary mapping column names to Source records.

  Examples:
  >>> import bionty as bt
@@ -109,6 +109,7 @@ class DataFrameCurator:
  using: str | None = None,
  verbosity: str = "hint",
  organism: str | None = None,
+ sources: dict[str, Record] | None = None,
  ) -> None:
  from lamindb.core._settings import settings

@@ -121,6 +122,9 @@ class DataFrameCurator:
  self._collection = None
  self._validated = False
  self._kwargs = {"organism": organism} if organism else {}
+ if sources is None:
+ sources = {}
+ self._sources = sources
  self._save_columns()

  @property
@@ -158,6 +162,7 @@ class DataFrameCurator:
  save_function="add_new_from_columns",
  using=self._using,
  validated_only=False,
+ source=self._sources.get("columns"),
  **kwargs,
  )

@@ -172,6 +177,7 @@ class DataFrameCurator:
  using=self._using,
  validated_only=validated_only,
  df=self._df, # Get the Feature type from df
+ source=self._sources.get("columns"),
  **kwargs,
  )

@@ -222,6 +228,7 @@ class DataFrameCurator:
  key=categorical,
  using=self._using,
  validated_only=validated_only,
+ sources=self._sources.get(categorical),
  **kwargs,
  )

@@ -242,6 +249,7 @@ class DataFrameCurator:
  self._df,
  fields=self.fields,
  using=self._using,
+ sources=self._sources,
  **self._kwargs,
  )
  return self._validated
@@ -283,41 +291,6 @@ class DataFrameCurator:

  return self._artifact

- def save_collection(
- self,
- artifact: Artifact | Iterable[Artifact],
- name: str,
- description: str | None = None,
- reference: str | None = None,
- reference_type: str | None = None,
- ) -> Collection:
- """Save a collection from artifact/artifacts.
-
- Args:
- artifact: One or several saved Artifacts.
- name: Title of the publication.
- description: Description of the publication.
- reference: Accession number (e.g. GSE#, E-MTAB#, etc.).
- reference_type: Source type (e.g. GEO, ArrayExpress, SRA, etc.).
- """
- collection = Collection(
- artifact,
- name=name,
- description=description,
- reference=reference,
- reference_type=reference_type,
- )
- slug = ln_setup.settings.instance.slug
- if collection._state.adding:
- collection.save()
- else: # pragma: no cover
- collection.save()
- logger.warning(f"collection already exists in {colors.italic(slug)}!")
- if ln_setup.settings.instance.is_remote: # pragma: no cover
- logger.print(f"go to https://lamin.ai/{slug}/collection/{collection.uid}")
- self._collection = collection
- return collection
-
  def clean_up_failed_runs(self):
  """Clean up previous failed runs that don't save any outputs."""
  from lamindb.core._run_context import run_context
@@ -338,6 +311,7 @@ class AnnDataCurator(DataFrameCurator):
  using: A reference LaminDB instance.
  verbosity: The verbosity level.
  organism: The organism name.
+ sources: A dictionary mapping ``.obs.columns`` to Source records.

  Examples:
  >>> import bionty as bt
@@ -357,11 +331,14 @@ class AnnDataCurator(DataFrameCurator):
  using: str = "default",
  verbosity: str = "hint",
  organism: str | None = None,
+ sources: dict[str, Record] | None = None,
  ) -> None:
  from lamindb_setup.core import upath

  from ._artifact import data_is_anndata

+ if sources is None:
+ sources = {}
  if not data_is_anndata(data):
  raise ValueError(
  "data has to be an AnnData object or a path to AnnData-like"
@@ -381,6 +358,7 @@ class AnnDataCurator(DataFrameCurator):
  using=using,
  verbosity=verbosity,
  organism=organism,
+ sources=sources,
  )
  self._obs_fields = categoricals
  self._save_from_var_index(validated_only=True, **self._kwargs)
@@ -421,6 +399,7 @@ class AnnDataCurator(DataFrameCurator):
  using=self._using,
  validated_only=validated_only,
  organism=organism,
+ source=self._sources.get("var_index"),
  )

  def add_new_from_var_index(self, organism: str | None = None, **kwargs):
@@ -455,7 +434,11 @@ class AnnDataCurator(DataFrameCurator):
  **self._kwargs,
  )
  validated_obs = validate_categories_in_df(
- self._adata.obs, fields=self.categoricals, using=self._using, **self._kwargs
+ self._adata.obs,
+ fields=self.categoricals,
+ using=self._using,
+ sources=self._sources,
+ **self._kwargs,
  )
  self._validated = validated_var and validated_obs
  return self._validated
@@ -519,7 +502,11 @@ class MuDataCurator:
  using: str = "default",
  verbosity: str = "hint",
  organism: str | None = None,
+ sources: dict[str, Record] | None = None,
  ) -> None:
+ if sources is None:
+ sources = {}
+ self._sources = sources
  self._mdata = mdata
  self._kwargs = {"organism": organism} if organism else {}
  self._var_fields = var_index
@@ -534,6 +521,7 @@ class MuDataCurator:
  categoricals=self._obs_fields.get(modality, {}),
  using=using,
  verbosity=verbosity,
+ sources=self._sources.get(modality),
  **self._kwargs,
  )
  for modality in self._modalities
@@ -713,7 +701,11 @@ class MuDataCurator:
  else:
  obs = self._mdata[modality].obs
  validated_obs &= validate_categories_in_df(
- obs, fields=fields, using=self._using, **self._kwargs
+ obs,
+ fields=fields,
+ using=self._using,
+ sources=self._sources.get(modality),
+ **self._kwargs,
  )
  self._validated = validated_var and validated_obs
  return self._validated
@@ -776,6 +768,7 @@ class Curate:
  using: str = "default",
  verbosity: str = "hint",
  organism: str | None = None,
+ sources: dict[str, Record] | None = None,
  ) -> AnnDataCurator:
  """{}""" # noqa: D415
  return AnnDataCurator(
@@ -785,6 +778,7 @@ class Curate:
  using=using,
  verbosity=verbosity,
  organism=organism,
+ sources=sources,
  )

  @classmethod
@@ -848,6 +842,7 @@ def validate_categories(
  key: str,
  using: str | None = None,
  organism: str | None = None,
+ source: Record | None = None,
  ) -> bool:
  """Validate ontology terms in a pandas series using LaminDB registries."""
  from lamindb._from_values import _print_values
@@ -862,6 +857,7 @@ def validate_categories(

  registry = field.field.model
  filter_kwargs = check_registry_organism(registry, organism)
+ filter_kwargs.update({"source": source} if source else {})

  # Inspect the default instance
  inspect_result = standardize_and_inspect(
@@ -927,9 +923,12 @@ def validate_categories_in_df(
  df: pd.DataFrame,
  fields: dict[str, FieldAttr],
  using: str | None = None,
+ sources: dict[str, Record] = None,
  **kwargs,
  ) -> bool:
  """Validate categories in DataFrame columns using LaminDB registries."""
+ if sources is None:
+ sources = {}
  validated = True
  for key, field in fields.items():
  validated &= validate_categories(
@@ -937,6 +936,7 @@
  field=field,
  key=key,
  using=using,
+ source=sources.get(key),
  **kwargs,
  )
  return validated
@@ -998,13 +998,13 @@ def save_artifact(
  organism,
  )

- if artifact.accessor == "DataFrame":
+ if artifact._accessor == "DataFrame":
  artifact.features._add_set_from_df(field=columns_field, **feature_kwargs)
- elif artifact.accessor == "AnnData":
+ elif artifact._accessor == "AnnData":
  artifact.features._add_set_from_anndata(
  var_field=columns_field, **feature_kwargs
  )
- elif artifact.accessor == "MuData":
+ elif artifact._accessor == "MuData":
  artifact.features._add_set_from_mudata(
  var_fields=columns_field, **feature_kwargs
  )
@@ -1021,7 +1021,7 @@ def save_artifact(
  labels = registry.from_values(df[key], field=field, **filter_kwargs)
  artifact.labels.add(labels, feature)

- if artifact.accessor == "MuData":
+ if artifact._accessor == "MuData":
  for modality, modality_fields in fields.items():
  if modality == "obs":
  _add_labels(data, artifact, modality_fields)
@@ -1046,6 +1046,7 @@ def update_registry(
  df: pd.DataFrame | None = None,
  organism: str | None = None,
  dtype: str | None = None,
+ source: Record | None = None,
  **kwargs,
  ) -> list[Record]:
  """Save features or labels records in the default instance from the using instance.
@@ -1060,6 +1061,7 @@
  df: A DataFrame to save labels from.
  organism: The organism name.
  dtype: The type of the feature.
+ source: The source record.
  kwargs: Additional keyword arguments to pass to the registry model to create new records.
  """
  from lamindb._save import save as ln_save
@@ -1067,6 +1069,7 @@

  registry = field.field.model
  filter_kwargs = check_registry_organism(registry, organism)
+ filter_kwargs.update({"source": source} if source else {})

  verbosity = settings.verbosity
  try:
@@ -1098,6 +1101,10 @@
  if non_validated_labels
  else []
  )
+ # here we check to only save the public records if they are from the specified source
+ # TODO: this if shouldn't be needed
+ if source:
+ public_records = [r for r in public_records if r.source == source]
  ln_save(public_records)
  labels_saved["from public"] = [
  getattr(r, field.field.name) for r in public_records
@@ -1119,7 +1126,11 @@
  if registry == Feature:
  init_kwargs["dtype"] = "cat" if dtype is None else dtype
  non_validated_records.append(
- registry(**init_kwargs, **filter_kwargs, **kwargs)
+ registry(
+ **init_kwargs,
+ **{k: v for k, v in filter_kwargs.items() if k != "source"},
+ **{k: v for k, v in kwargs.items() if k != "sources"},
+ )
  )
  ln_save(non_validated_records)

@@ -1242,7 +1253,7 @@ def _save_organism(name: str): # pragma: no cover

  organism = bt.Organism.filter(name=name).one_or_none()
  if organism is None:
- organism = bt.Organism.from_public(name=name)
+ organism = bt.Organism.from_source(name=name)
  if organism is None:
  raise ValueError(
  f"Organism '{name}' not found\n"
lamindb/_feature_set.py CHANGED
@@ -118,7 +118,7 @@ def from_values(
  name: str | None = None,
  mute: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  raise_validation_error: bool = True,
  ) -> FeatureSet:
  """{}""" # noqa: D415
@@ -139,7 +139,7 @@ def from_values(
  not_validated_values = values_array[~validated]
  msg = (
  f"These values could not be validated: {not_validated_values.tolist()}\n"
- f"If there are no typos, add them to their registry: {registry}"
+ f"If there are no typos, add them to their registry: {registry.__name__}"
  )
  if raise_validation_error:
  raise ValidationError(msg)
@@ -149,7 +149,7 @@
  validated_values,
  field=field,
  organism=organism,
- public_source=public_source,
+ source=source,
  )
  feature_set = FeatureSet(
  features=validated_features,
@@ -168,7 +168,7 @@ def from_df(
  name: str | None = None,
  mute: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  ) -> FeatureSet | None:
  """{}""" # noqa: D415
  registry = field.field.model
@@ -189,7 +189,7 @@ def from_df(
  df.columns[validated],
  field=field,
  organism=organism,
- public_source=public_source,
+ source=source,
  )
  feature_set = FeatureSet(
  features=validated_features,
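In `FeatureSet.from_values` and `FeatureSet.from_df`, the `public_source` keyword becomes `source`, and the validation-error message now prints the registry class name rather than its full repr. A hedged sketch of the renamed keyword in use (the DataFrame `df` and the Source lookup are assumptions, not taken from this diff):

import bionty as bt
import lamindb as ln

# Illustrative: pin the gene annotation source when building a feature set
# from DataFrame columns; the filter fields depend on your registered sources.
gene_source = bt.Source.filter(entity="Gene", organism="human").last()
feature_set = ln.FeatureSet.from_df(
    df,
    field=bt.Gene.symbol,
    organism="human",
    source=gene_source,  # was `public_source` in 0.74.x
)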
lamindb/_filter.py CHANGED
@@ -21,9 +21,9 @@ def filter(Record: type[Record], **expressions) -> QuerySet:
  ):
  visibility = "visibility"
  if not any(e.startswith(visibility) for e in expressions):
- expressions[
- visibility
- ] = VisibilityChoice.default.value # default visibility
+ expressions[visibility] = (
+ VisibilityChoice.default.value
+ ) # default visibility
  # if visibility is None, do not apply a filter
  # otherwise, it would mean filtering for NULL values, which doesn't make
  # sense for a non-NULLABLE column
lamindb/_finish.py CHANGED
@@ -80,8 +80,8 @@ def save_run_context_core(

  # for scripts, things are easy
  is_consecutive = True
- is_notebook = transform.type == TransformType.notebook
- source_code_path = filepath
+ is_notebook = transform.type == "notebook"
+ _source_code_artifact_path = filepath
  # for notebooks, we need more work
  if is_notebook:
  try:
@@ -134,12 +134,12 @@ def save_run_context_core(
  )
  # strip the output from the notebook to create the source code file
  # first, copy the notebook file to a temporary file in the cache
- source_code_path = ln_setup.settings.storage.cache_dir / filepath.name
- shutil.copy2(filepath, source_code_path) # copy
+ _source_code_artifact_path = ln_setup.settings.storage.cache_dir / filepath.name
+ shutil.copy2(filepath, _source_code_artifact_path) # copy
  subprocess.run(
  [
  "nbstripout",
- source_code_path,
+ _source_code_artifact_path,
  "--extra-keys",
  "metadata.version metadata.kernelspec metadata.language_info metadata.pygments_lexer metadata.name metadata.file_extension",
  ],
@@ -152,31 +152,34 @@ def save_run_context_core(
  transform_family = transform.versions
  if len(transform_family) > 0:
  for prev_transform in transform_family.order_by("-created_at"):
- if prev_transform.latest_report_id is not None:
- prev_report = prev_transform.latest_report
- if prev_transform.source_code_id is not None:
- prev_source = prev_transform.source_code
+ if (
+ prev_transform.latest_run is not None
+ and prev_transform.latest_run.report_id is not None
+ ):
+ prev_report = prev_transform.latest_run.report
+ if prev_transform._source_code_artifact_id is not None:
+ prev_source = prev_transform._source_code_artifact
  ln.settings.creation.artifact_silence_missing_run_warning = True

  # track source code
- if transform.source_code_id is not None:
+ if transform._source_code_artifact_id is not None:
  # check if the hash of the transform source code matches
  # (for scripts, we already run the same logic in track() - we can deduplicate the call at some point)
- hash, _ = hash_file(source_code_path) # ignore hash_type for now
- if hash != transform.source_code.hash:
+ hash, _ = hash_file(_source_code_artifact_path) # ignore hash_type for now
+ if hash != transform._source_code_artifact.hash:
  if os.getenv("LAMIN_TESTING") is None:
  # in test, auto-confirm overwrite
  response = input(
- f"You are about to replace (overwrite) existing source code (hash '{transform.source_code.hash}') for transform version"
+ f"You are about to replace (overwrite) existing source code (hash '{transform._source_code_artifact.hash}') for transform version"
  f" '{transform.version}'. Proceed? (y/n)"
  )
  else:
  response = "y"
  if response == "y":
- transform.source_code.replace(source_code_path)
- transform.source_code.save(upload=True)
+ transform._source_code_artifact.replace(_source_code_artifact_path)
+ transform._source_code_artifact.save(upload=True)
  logger.success(
- f"replaced transform.source_code: {transform.source_code}"
+ f"replaced transform._source_code_artifact: {transform._source_code_artifact}"
  )
  else:
  logger.warning("Please re-run `ln.track()` to make a new version")
@@ -184,17 +187,19 @@ def save_run_context_core(
  else:
  logger.important("source code is already saved")
  else:
- source_code = ln.Artifact(
- source_code_path,
+ _source_code_artifact = ln.Artifact(
+ _source_code_artifact_path,
  description=f"Source of transform {transform.uid}",
  version=transform.version,
  is_new_version_of=prev_source,
  visibility=0, # hidden file
  run=False,
  )
- source_code.save(upload=True, print_progress=False)
- transform.source_code = source_code
- logger.debug(f"saved transform.source_code: {transform.source_code}")
+ _source_code_artifact.save(upload=True, print_progress=False)
+ transform._source_code_artifact = _source_code_artifact
+ logger.debug(
+ f"saved transform._source_code_artifact: {transform._source_code_artifact}"
+ )

  # track environment
  env_path = ln_setup.settings.storage.cache_dir / f"run_env_pip_{run.uid}.txt"
@@ -257,8 +262,9 @@ def save_run_context_core(
  run.report = report_file
  run.is_consecutive = is_consecutive
  run.save()
- transform.latest_report = run.report
- logger.debug(f"saved transform.latest_report: {transform.latest_report}")
+ logger.debug(
+ f"saved transform.latest_run.report: {transform.latest_run.report}"
+ )
  transform.save()

  # finalize
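The `_finish.py` changes stop writing the report and source code onto the transform itself: the report is now reached through `transform.latest_run.report`, and the source-code file is stored in the private `_source_code_artifact` field. A hedged sketch of fetching a notebook report after this change (the name filter is illustrative):

import lamindb as ln

transform = ln.Transform.filter(name="My analysis").one_or_none()
if transform is not None and transform.latest_run is not None:
    report = transform.latest_run.report  # was transform.latest_report in 0.74.x
    if report is not None:
        print(report.path)  # path of the saved report artifact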
lamindb/_from_values.py CHANGED
@@ -1,6 +1,6 @@
  from __future__ import annotations

- from typing import TYPE_CHECKING, Any, Iterable
+ from typing import TYPE_CHECKING, Iterable

  import pandas as pd
  from django.core.exceptions import FieldDoesNotExist
@@ -19,9 +19,9 @@ def get_or_create_records(
  field: StrField,
  *,
  create: bool = False,
- from_public: bool = False,
+ from_source: bool = False,
  organism: Record | str | None = None,
- public_source: Record | None = None,
+ source: Record | None = None,
  mute: bool = False,
  ) -> list[Record]:
  """Get or create records from iterables."""
@@ -34,8 +34,8 @@ def get_or_create_records(
  kwargs: dict = {}
  if organism is not None:
  kwargs["organism"] = organism
- if public_source is not None:
- kwargs["public_source"] = public_source
+ if source is not None:
+ kwargs["source"] = source
  settings.creation.search_names = False
  try:
  iterable_idx = index_iterable(iterable)
@@ -47,8 +47,17 @@ def get_or_create_records(

  # new records to be created based on new values
  if len(nonexist_values) > 0:
- if from_public:
- records_bionty, unmapped_values = create_records_from_public(
+ if source:
+ from_source = not source.in_db
+ elif (
+ records
+ and hasattr(records[0], "source_id")
+ and records[0].source_id
+ and records[0].source.in_db
+ ):
+ from_source = False
+ if from_source:
+ records_bionty, unmapped_values = create_records_from_source(
  iterable_idx=nonexist_values,
  field=field,
  msg=msg,
@@ -58,7 +67,7 @@
  if len(records_bionty) > 0:
  msg = ""
  for record in records_bionty:
- record._from_public = True
+ record._from_source = True
  records += records_bionty
  else:
  unmapped_values = nonexist_values
@@ -75,7 +84,7 @@
  f"{colors.red('did not create')} {name} record{s} for "
  f"{n_nonval} {colors.italic(f'{field.field.name}{s}')}: {print_values}"
  )
- if Record.__module__.startswith("lnschema_bionty.") or Record == ULabel:
+ if Record.__module__.startswith("bionty.") or Record == ULabel:
  if isinstance(iterable, pd.Series):
  feature = iterable.name
  feature_name = None
@@ -100,8 +109,8 @@ def get_existing_records(
  model = field.field.model
  condition: dict = {} if len(kwargs) == 0 else kwargs.copy()
  # existing records matching is agnostic to the bionty source
- if "public_source" in condition:
- condition.pop("public_source")
+ if "source" in condition:
+ condition.pop("source")

  # standardize based on the DB reference
  # log synonyms mapped terms
@@ -109,7 +118,7 @@
  iterable_idx,
  field=field,
  organism=kwargs.get("organism"),
- public_source=kwargs.get("public_source"),
+ source=kwargs.get("source"),
  mute=True,
  )
  syn_mapper = result.synonyms_mapper
@@ -174,7 +183,7 @@ def get_existing_records(
  return records, nonexist_values, msg


- def create_records_from_public(
+ def create_records_from_source(
  iterable_idx: pd.Index,
  field: StrField,
  msg: str = "",
@@ -184,7 +193,7 @@ def create_records_from_public(
  model = field.field.model
  records: list = []
  # populate additional fields from bionty
- from lnschema_bionty._bionty import get_public_source_record
+ from bionty._bionty import get_source_record
+ from bionty.core._bionty import filter_bionty_df_columns

  # create the corresponding bionty object from model
  try:
@@ -195,17 +205,20 @@
  organism = "human"
  elif iterable_idx[0].startswith("ENSMUSG"):
  organism = "mouse"
- public_ontology = model.public(
- organism=organism, public_source=kwargs.get("public_source")
- )
+ public_ontology = model.public(organism=organism, source=kwargs.get("source"))
  except Exception:
  # for custom records that are not created from public sources
  return records, iterable_idx
- # add public_source record to the kwargs
- kwargs.update({"public_source": get_public_source_record(public_ontology)})
+ # add source record to the kwargs
+ source_record = get_source_record(public_ontology)
+ if source_record is not None and source_record.in_db:
+ # skips the creation of records from public if the source is already in the db
+ return records, iterable_idx
+
+ kwargs.update({"source": source_record})

  # filter the columns in bionty df based on fields
- bionty_df = _filter_bionty_df_columns(model=model, public_ontology=public_ontology)
+ bionty_df = filter_bionty_df_columns(model=model, public_ontology=public_ontology)

  # standardize in the bionty reference
  result = public_ontology.inspect(iterable_idx, field=field.field.name, mute=True)
@@ -301,43 +314,6 @@ def _print_values(names: Iterable, n: int = 20, quotes: bool = True) -> str:
  return print_values


- def _filter_bionty_df_columns(model: Record, public_ontology: Any) -> pd.DataFrame:
- bionty_df = pd.DataFrame()
- if public_ontology is not None:
- model_field_names = {i.name for i in model._meta.fields}
- # parents needs to be added here as relationships aren't in fields
- model_field_names.add("parents")
- bionty_df = public_ontology.df().reset_index()
- if model.__name__ == "Gene":
- # groupby ensembl_gene_id and concat ncbi_gene_ids
- groupby_id_col = (
- "ensembl_gene_id" if "ensembl_gene_id" in bionty_df else "stable_id"
- )
- bionty_df.drop(
- columns=["hgnc_id", "mgi_id", "index"], errors="ignore", inplace=True
- )
- bionty_df.drop_duplicates([groupby_id_col, "ncbi_gene_id"], inplace=True)
- bionty_df["ncbi_gene_id"] = bionty_df["ncbi_gene_id"].fillna("")
- bionty_df = (
- bionty_df.groupby(groupby_id_col)
- .agg(
- {
- "symbol": "first",
- "ncbi_gene_id": "|".join,
- "biotype": "first",
- "description": "first",
- "synonyms": "first",
- }
- )
- .reset_index()
- )
- bionty_df.rename(columns={"ncbi_gene_id": "ncbi_gene_ids"}, inplace=True)
- # rename definition to description for the lnschema_bionty
- bionty_df.rename(columns={"definition": "description"}, inplace=True)
- bionty_df = bionty_df.loc[:, bionty_df.columns.isin(model_field_names)]
- return bionty_df
-
-
  def _bulk_create_dicts_from_df(
  keys: set | list, column_name: str, df: pd.DataFrame
  ) -> tuple[dict, str]:
@@ -359,7 +335,7 @@ def _bulk_create_dicts_from_df(
  return df.reset_index().to_dict(orient="records"), multi_msg


- def _has_organism_field(orm: Record) -> bool:
+ def _has_organism_field(orm: type[Record]) -> bool:
  try:
  orm._meta.get_field("organism")
  return True
@@ -371,10 +347,15 @@ def _get_organism_record(
  field: StrField, organism: str | Record, force: bool = False
  ) -> Record:
  model = field.field.model
- check = True if force else field.field.name != "ensembl_gene_id"
+ check = True
+ if not force and hasattr(model, "_ontology_id_field"):
+ check = field.field.name != model._ontology_id_field
+ # e.g. bionty.CellMarker has "name" as _ontology_id_field
+ if not model._ontology_id_field.endswith("id"):
+ check = True

  if _has_organism_field(model) and check:
- from lnschema_bionty._bionty import create_or_get_organism_record
+ from bionty._bionty import create_or_get_organism_record

  organism_record = create_or_get_organism_record(organism=organism, orm=model)
  if organism_record is not None:
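Throughout `_from_values.py`, imports move from `lnschema_bionty` to the `bionty` package, the `public_source`/`from_public` terminology becomes `source`/`from_source`, the Gene-specific bionty-dataframe filtering is delegated to `bionty.core`, and record creation from a public ontology is now skipped when the resolved Source is already in the database. A hedged sketch of what the rename means in user code (the ontology term and the Source filter are illustrative, not taken from this diff):

import bionty as bt

# 0.74.x: cell_type = bt.CellType.from_public(name="T cell")
# 0.75.0 onwards, the same lookup goes through from_source:
cell_type = bt.CellType.from_source(name="T cell")

# an explicit Source record can be passed to pin the ontology version
cl_source = bt.Source.filter(entity="CellType").last()
cell_type_pinned = bt.CellType.from_source(name="T cell", source=cl_source)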