lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +203 -102
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/_tiledbsoma.py +29 -13
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3349
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1534 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +382 -287
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +177 -106
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/collection.py +73 -52
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +51 -17
- lamindb/models/has_parents.py +69 -14
- lamindb/models/project.py +1 -1
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +247 -172
- lamindb/models/record.py +65 -247
- lamindb/models/run.py +4 -4
- lamindb/models/save.py +8 -2
- lamindb/models/schema.py +456 -184
- lamindb/models/transform.py +2 -2
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
lamindb/core/datasets/_core.py
CHANGED
@@ -322,8 +322,6 @@ def anndata_human_immune_cells(
 
     import lamindb as ln
 
-    verbosity = ln.settings.verbosity
-    ln.settings.verbosity = "error"
     ln.save(
         bt.Gene.from_values(
             adata.var.index, field="ensembl_gene_id", organism="human"
@@ -339,7 +337,6 @@ def anndata_human_immune_cells(
     ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
-    ln.settings.verbosity = verbosity
     return adata
 
 
@@ -560,11 +557,14 @@ def spatialdata_blobs() -> SpatialData:
     from spatialdata.datasets import blobs
 
     sdata = blobs()
-    sdata.attrs["
-        "assay": "Visium Spatial Gene Expression",
+    sdata.attrs["bio"] = {
         "disease": "Alzheimer disease",
         "developmental_stage": "adult stage",
     }
+    sdata.attrs["tech"] = {
+        "assay": "Visium Spatial Gene Expression",
+    }
+    sdata.attrs["random_int"] = 20
     sdata.tables["table"].var.index = [
         "ENSG00000139618",  # BRCA2
         "ENSG00000157764",  # BRAF
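
The net effect for `spatialdata_blobs` is that dataset-level metadata now lives under separate `bio` and `tech` keys, plus a loose `random_int`. A minimal sketch of what downstream code sees, assuming `spatialdata_blobs` is still re-exported from `lamindb.core.datasets`:

from lamindb.core.datasets import spatialdata_blobs

sdata = spatialdata_blobs()
# biological and technical metadata are now separate dicts in attrs
assert sdata.attrs["bio"]["disease"] == "Alzheimer disease"
assert sdata.attrs["tech"]["assay"] == "Visium Spatial Gene Expression"
assert sdata.attrs["random_int"] == 20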
lamindb/core/datasets/_small.py
CHANGED
@@ -7,99 +7,6 @@ import numpy as np
 import pandas as pd
 
 
-def small_dataset1(
-    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
-    gene_symbols_in_index: bool = False,
-    with_typo: bool = False,
-    with_cell_type_synonym: bool = False,
-    with_cell_type_typo: bool = False,
-) -> pd.DataFrame | ad.AnnData:
-    # define the data in the dataset
-    # it's a mix of numerical measurements and observation-level metadata
-    ifng = "IFNJ" if with_typo else "IFNG"
-    if gene_symbols_in_index:
-        var_ids = ["CD8A", "CD4", "CD14"]
-    else:
-        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
-    abt_cell = (
-        "CD8-pos alpha-beta T cell"
-        if with_cell_type_typo
-        else "CD8-positive, alpha-beta T cell"
-    )
-    dataset_dict = {
-        var_ids[0]: [1, 2, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [5, 6, 7],
-        "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
-        "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": pd.Categorical(
-            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
-        ),
-        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
-        "concentration": ["0.1%", "200 nM", "0.1%"],
-        "treatment_time_h": [24, 24, 6],
-        "donor": ["D0001", "D0002", None],
-    }
-    # define the dataset-level metadata
-    metadata = {
-        "temperature": 21.6,
-        "experiment": "Experiment 1",
-        "date_of_study": "2024-12-01",
-        "study_note": "We had a great time performing this study and the results look compelling.",
-    }
-    # the dataset as DataFrame
-    dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-    if otype == "DataFrame":
-        for key, value in metadata.items():
-            dataset_df.attrs[key] = value
-        return dataset_df
-    else:
-        dataset_ad = ad.AnnData(
-            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-        )
-        return dataset_ad
-
-
-def small_dataset2(
-    otype: Literal["DataFrame", "AnnData"],
-    gene_symbols_in_index: bool = False,
-) -> pd.DataFrame | ad.AnnData:
-    if gene_symbols_in_index:
-        var_ids = ["CD8A", "CD4", "CD38"]
-    else:
-        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
-    dataset_dict = {
-        var_ids[0]: [2, 3, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [4, 2, 3],
-        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
-        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-    }
-    metadata = {
-        "temperature": 22.6,
-        "experiment": "Experiment 2",
-        "date_of_study": "2025-02-13",
-    }
-    dataset_df = pd.DataFrame(
-        dataset_dict,
-        index=["sample4", "sample5", "sample6"],
-    )
-    ad.AnnData(
-        dataset_df[var_ids],
-        obs=dataset_df[["perturbation", "cell_type_by_model"]],
-    )
-    if otype == "DataFrame":
-        for key, value in metadata.items():
-            dataset_df.attrs[key] = value
-        return dataset_df
-    else:
-        dataset_ad = ad.AnnData(
-            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-        )
-        return dataset_ad
-
-
 def small_dataset3_cellxgene(
     otype: Literal["DataFrame", "AnnData"] = "AnnData",
 ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
lamindb/core/datasets/mini_immuno.py
ADDED
@@ -0,0 +1,172 @@
+"""The mini immuno dataset.
+
+.. autosummary::
+   :toctree: .
+
+   define_features_labels
+   get_dataset1
+   get_dataset2
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import anndata as ad
+import pandas as pd
+
+if TYPE_CHECKING:
+    from lamindb.models import Schema
+
+
+def define_features_labels() -> None:
+    """Features & labels to validate the mini immuno datasets.
+
+    .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+        :language: python
+    """
+    import sys
+    from pathlib import Path
+
+    docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+    if str(docs_path) not in sys.path:
+        sys.path.append(str(docs_path))
+
+    import define_mini_immuno_features_labels  # noqa
+
+
+def define_mini_immuno_schema_flexible() -> Schema:
+    """Features & labels to validate the mini immuno datasets.
+
+    .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
+        :language: python
+    """
+    import sys
+    from pathlib import Path
+
+    from lamindb.models import Schema
+
+    docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+    if str(docs_path) not in sys.path:
+        sys.path.append(str(docs_path))
+
+    define_features_labels()
+    import define_mini_immuno_schema_flexible  # noqa
+
+    return Schema.get(name="Mini immuno schema")
+
+
+def get_dataset1(
+    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
+    gene_symbols_in_index: bool = False,
+    with_typo: bool = False,
+    with_cell_type_synonym: bool = False,
+    with_cell_type_typo: bool = False,
+    with_gene_typo: bool = False,
+    with_outdated_gene: bool = False,
+    with_wrong_subtype: bool = False,
+    with_index_type_mismatch: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    """A small tabular dataset measuring expression & metadata."""
+    # define the data in the dataset
+    # it's a mix of numerical measurements and observation-level metadata
+    ifng = "IFNJ" if with_typo else "IFNG"
+    thing = "ulabel_but_not_perturbation" if with_wrong_subtype else "DMSO"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14" if not with_gene_typo else "GeneTypo"]
+    else:
+        var_ids = [
+            "ENSG00000153563",
+            "ENSG00000010610",
+            "ENSG00000170458"
+            if not with_gene_typo
+            else "GeneTypo"
+            if not with_outdated_gene
+            else "ENSG00000278198",
+        ]
+    abt_cell = (
+        "CD8-pos alpha-beta T cell"
+        if with_cell_type_typo
+        else "CD8-positive, alpha-beta T cell"
+    )
+    dataset_dict = {
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "perturbation": pd.Categorical(["DMSO", ifng, thing]),
+        "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
+        "cell_type_by_expert": pd.Categorical(
+            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
+        ),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0001", "D0002", None],
+    }
+    # define the dataset-level metadata
+    metadata = {
+        "temperature": 21.6,
+        "experiment": "Experiment 1",
+        "date_of_study": "2024-12-01",
+        "study_note": "We had a great time performing this study and the results look compelling.",
+    }
+    # the dataset as DataFrame
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample1", "sample2", 0]  # type: ignore
+        if with_index_type_mismatch
+        else ["sample1", "sample2", "sample3"],
+    )
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(
+            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+        )
+        return dataset_ad
+
+
+def get_dataset2(
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD38"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
+    dataset_dict = {
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0003", "D0003", "D0004"],
+    }
+    metadata = {
+        "temperature": 22.6,
+        "experiment": "Experiment 2",
+        "date_of_study": "2025-02-13",
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample4", "sample5", "sample6"],
+    )
+    ad.AnnData(
+        dataset_df[var_ids],
+        obs=dataset_df[["perturbation", "cell_type_by_model"]],
+    )
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(
+            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+        )
+        return dataset_ad
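
The removed `small_dataset1`/`small_dataset2` live on here as `get_dataset1`/`get_dataset2`, now with extra failure-mode flags (`with_gene_typo`, `with_outdated_gene`, `with_wrong_subtype`, `with_index_type_mismatch`) for exercising curation errors. A hedged usage sketch, assuming the module is importable at the path listed in the summary above:

from lamindb.core.datasets import mini_immuno

df = mini_immuno.get_dataset1(otype="DataFrame")  # expression values + obs-level metadata
print(df.attrs["experiment"])  # dataset-level metadata lives in DataFrame.attrs

# an AnnData variant seeded with a gene typo, e.g. for testing curation
adata = mini_immuno.get_dataset1(otype="AnnData", with_gene_typo=True)
df2 = mini_immuno.get_dataset2(otype="DataFrame")  # a second, compatible batch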
lamindb/core/loaders.py
CHANGED
@@ -44,7 +44,7 @@ try:
 except ImportError:
 
     def load_zarr(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr<=2.18.4")
+        raise ImportError("Please install zarr: pip install 'zarr<=2.18.4'")
 
 
 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
lamindb/core/storage/_backed_access.py
CHANGED
@@ -1,20 +1,26 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
 from anndata._io.specs.registry import get_spec
 
 from ._anndata_accessor import AnnDataAccessor, StorageType, registry
-from .
+from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df
+from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset
 from ._tiledbsoma import _open_tiledbsoma
 from .paths import filepath_from_artifact
 
 if TYPE_CHECKING:
+    from collections.abc import Iterator
+
     from fsspec.core import OpenFile
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement
     from upath import UPath
 
     from lamindb.models.artifact import Artifact
@@ -69,10 +75,17 @@ class BackedAccessor:
 def backed_access(
     artifact_or_filepath: Artifact | UPath,
     mode: str = "r",
+    engine: Literal["pyarrow", "polars"] = "pyarrow",
     using_key: str | None = None,
     **kwargs,
 ) -> (
-    AnnDataAccessor
+    AnnDataAccessor
+    | BackedAccessor
+    | SOMACollection
+    | SOMAExperiment
+    | SOMAMeasurement
+    | PyArrowDataset
+    | Iterator[PolarsLazyFrame]
 ):
     from lamindb.models import Artifact
 
@@ -97,12 +110,15 @@ def backed_access(
         conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
         conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
-    elif
-
+    elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
+        df_suffix := df_suffixes.pop()
+    ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
+        return _open_dataframe(objectpath, df_suffix, engine, **kwargs)
     else:
         raise ValueError(
             "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
-            f"
+            f"be compatible with pyarrow.dataset.dataset or polars.scan_* functions, "
+            f"instead of being {suffix} object."
         )
 
     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
@@ -112,3 +128,81 @@ def backed_access(
         return AnnDataAccessor(conn, storage, name)
     else:
         return BackedAccessor(conn, storage)
+
+
+def _flat_suffixes(paths: UPath | list[UPath]) -> set[str]:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    path_list = []
+    if isinstance(paths, Path):
+        paths = [paths]
+    for path in paths:
+        # assume http is always a file
+        if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+            path_list += [p for p in path.rglob("*") if p.suffix != ""]
+        else:
+            path_list.append(path)
+
+    suffixes = set()
+    for path in path_list:
+        path_suffixes = path.suffixes
+        # this doesn't work for externally gzipped files, REMOVE LATER
+        path_suffix = (
+            path_suffixes[-2]
+            if len(path_suffixes) > 1 and ".gz" in path_suffixes
+            else path.suffix
+        )
+        suffixes.add(path_suffix)
+    return suffixes
+
+
+def _open_dataframe(
+    paths: UPath | list[UPath],
+    suffix: str | None = None,
+    engine: Literal["pyarrow", "polars"] = "pyarrow",
+    **kwargs,
+) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+    df_suffix: str
+    if suffix is None:
+        df_suffixes = _flat_suffixes(paths)
+        if len(df_suffixes) > 1:
+            raise ValueError(
+                f"The artifacts in the collection have different file formats: {', '.join(df_suffixes)}.\n"
+                "It is not possible to open such stores with pyarrow or polars."
+            )
+        df_suffix = df_suffixes.pop()
+    else:
+        df_suffix = suffix
+
+    if engine == "pyarrow":
+        if df_suffix not in PYARROW_SUFFIXES:
+            raise ValueError(
+                f"{df_suffix} files are not supported by pyarrow, "
+                f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
+            )
+        # this checks that the filesystem is the same for all paths
+        # this is a requirement of pyarrow.dataset.dataset
+        if not isinstance(paths, Path):  # is a list then
+            fs = getattr(paths[0], "fs", None)
+            for path in paths[1:]:
+                # this assumes that the filesystems are cached by fsspec
+                if getattr(path, "fs", None) is not fs:
+                    raise ValueError(
+                        "The collection has artifacts with different filesystems, "
+                        "this is not supported by pyarrow."
+                    )
+        dataframe = _open_pyarrow_dataset(paths, **kwargs)
+    elif engine == "polars":
+        if df_suffix not in POLARS_SUFFIXES:
+            raise ValueError(
+                f"{df_suffix} files are not supported by polars, "
+                f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
+            )
+        dataframe = _open_polars_lazy_df(paths, **kwargs)
+    else:
+        raise ValueError(
+            f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+        )
+
+    return dataframe
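
`backed_access` now routes single-suffix tabular stores to `_open_dataframe`, which returns a `pyarrow.dataset.Dataset` directly, while the polars path returns a context manager (file handles are opened for the scan and closed in its `finally` clause, so the `LazyFrame` is only valid inside the `with` block). A sketch of the two call patterns; the local file name and column are illustrative:

from pathlib import Path
from lamindb.core.storage._backed_access import _open_dataframe

path = Path("measurements.parquet")  # hypothetical local file

ds = _open_dataframe(path, engine="pyarrow")  # PyArrowDataset, usable after return
print(ds.head(5))

with _open_dataframe(path, engine="polars") as lf:
    print(lf.select("perturbation").collect())  # collect while the handles are open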
lamindb/core/storage/_polars_lazy_df.py
ADDED
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from polars import LazyFrame as PolarsLazyFrame
+    from upath import UPath
+
+POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc")
+
+
+@contextmanager
+def _open_polars_lazy_df(
+    paths: UPath | list[UPath], **kwargs
+) -> Iterator[PolarsLazyFrame]:
+    try:
+        import polars as pl
+    except ImportError as ie:
+        raise ImportError("Please install polars: pip install polars") from ie
+
+    scans = {
+        ".parquet": pl.scan_parquet,
+        ".csv": pl.scan_csv,
+        ".ndjson": pl.scan_ndjson,
+        ".ipc": pl.scan_ipc,
+    }
+
+    path_list = []
+    if isinstance(paths, Path):
+        paths = [paths]
+    for path in paths:
+        # assume http is always a file
+        if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+            path_list += [p for p in path.rglob("*") if p.suffix != ""]
+        else:
+            path_list.append(path)
+
+    open_files = []
+
+    try:
+        for path in path_list:
+            open_files.append(path.open(mode="rb"))
+
+        yield scans[path_list[0].suffix](open_files, **kwargs)
+    finally:
+        for open_file in open_files:
+            open_file.close()
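
Because the polars scan functions here receive open file objects rather than paths, `_open_polars_lazy_df` is written as a `@contextmanager`: the handles must outlive the lazy scan and are closed deterministically on exit. A minimal direct-use sketch with hypothetical remote paths:

from upath import UPath
from lamindb.core.storage._polars_lazy_df import _open_polars_lazy_df

# hypothetical S3 locations
paths = [UPath("s3://bucket/part-0.parquet"), UPath("s3://bucket/part-1.parquet")]

with _open_polars_lazy_df(paths) as lf:
    result = lf.head(10).collect()  # materialize before the handles close
print(result)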
lamindb/core/storage/_pyarrow_dataset.py
CHANGED
@@ -13,41 +13,26 @@ if TYPE_CHECKING:
 PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc")
 
 
-def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
-    # it is assumed here that the paths exist
-    # we don't check here that the filesystem is the same
-    # but this is a requirement for pyarrow.dataset.dataset
-    if isinstance(paths, list):
-        path_list = paths
-    elif paths.is_dir():
-        path_list = [path for path in paths.rglob("*") if path.suffix != ""]
-    else:
-        path_list = [paths]
-    suffix = None
-    for path in path_list:
-        path_suffixes = path.suffixes
-        # this doesn't work for externally gzipped files, REMOVE LATER
-        path_suffix = (
-            path_suffixes[-2]
-            if len(path_suffixes) > 1 and ".gz" in path_suffixes
-            else path.suffix
-        )
-        if path_suffix not in PYARROW_SUFFIXES:
-            return False
-        elif suffix is None:
-            suffix = path_suffix
-        elif path_suffix != suffix:
-            return False
-    return True
-
-
 def _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset:
     if isinstance(paths, list):
+        # a single path can be a directory, but a list of paths
+        # has to be a flat list of files
+        paths_str = []
         path0 = paths[0]
         if isinstance(path0, LocalPathClasses):
-
+            path_to_str = lambda p: p.as_posix()
+            filesystem = None
         else:
-
+            path_to_str = lambda p: p.path
+            filesystem = path0.fs
+        for path in paths:
+            if (
+                getattr(path, "protocol", None) not in {"http", "https"}
+                and path.is_dir()
+            ):
+                paths_str += [path_to_str(p) for p in path.rglob("*") if p.suffix != ""]
+            else:
+                paths_str.append(path_to_str(path))
     elif isinstance(paths, LocalPathClasses):
         paths_str, filesystem = paths.as_posix(), None
     else:
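
The suffix validation that `_is_pyarrow_dataset` used to perform now happens upstream in `_flat_suffixes`/`_open_dataframe` in `_backed_access.py`; `_open_pyarrow_dataset` itself instead gains directory expansion, so a list of paths may mix flat files and directories. A sketch under the assumption of local paths:

from upath import UPath
from lamindb.core.storage._pyarrow_dataset import _open_pyarrow_dataset

# hypothetical layout: one flat file plus a directory of part files
paths = [UPath("data/batch1.parquet"), UPath("data/batch2/")]
dataset = _open_pyarrow_dataset(paths)  # directories are expanded via rglob
print(dataset.count_rows())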
lamindb/core/storage/_tiledbsoma.py
CHANGED
@@ -110,7 +110,7 @@ def save_tiledbsoma_experiment(
 ) -> Artifact:
     """Write `AnnData` to `tiledbsoma.Experiment`.
 
-    Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an
+    Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an :class:`~lamindb.Artifact`.
 
     Populates a column `lamin_run_uid` column in `obs` with the current `run.uid`.
 
@@ -202,28 +202,44 @@ def save_tiledbsoma_experiment(
         context=ctx,
     )
 
+    prepare_experiment = False
     resize_experiment = False
     if registration_mapping is not None:
-
+        soma_version_parsed = version.parse(soma.__version__)
+        if soma_version_parsed < version.parse("1.15.0rc4"):
             n_observations = len(registration_mapping.obs_axis.data)
         else:
             n_observations = registration_mapping.get_obs_shape()
-
+        prepare_experiment = soma_version_parsed >= version.parse("1.16.2")
+        resize_experiment = not prepare_experiment
     else:  # happens only if not appending and only one adata passed
         assert len(adata_objects) == 1  # noqa: S101
        n_observations = adata_objects[0].n_obs
 
     logger.important(f"Writing the tiledbsoma store to {storepath_str}")
+    experiment_exists: bool | None = None
     for adata_obj in adata_objects:
-
-
-
-
-
-
-
-
-
+        # do not recheck if True
+        if not experiment_exists and (resize_experiment or prepare_experiment):
+            experiment_exists = soma.Experiment.exists(storepath_str, context=ctx)
+        if experiment_exists:
+            # both can only happen if registration_mapping is not None
+            if resize_experiment:
+                soma_io.resize_experiment(
+                    storepath_str,
+                    nobs=n_observations,
+                    nvars=registration_mapping.get_var_shapes(),
+                    context=ctx,
+                )
+                resize_experiment = False
+            elif prepare_experiment:
+                registration_mapping.prepare_experiment(storepath_str, context=ctx)
+                prepare_experiment = False
+        registration_mapping_write = (
+            registration_mapping.subset_for_anndata(adata_obj)
+            if hasattr(registration_mapping, "subset_for_anndata")
+            else registration_mapping
+        )
         soma_io.from_anndata(
             storepath_str,
             adata_obj,
@@ -231,7 +247,7 @@ def save_tiledbsoma_experiment(
             context=ctx,
             obs_id_name=obs_id_name,
             var_id_name=var_id_name,
-            registration_mapping=
+            registration_mapping=registration_mapping_write,
             **kwargs,
         )
 
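
The version gates read: tiledbsoma older than 1.15.0rc4 sizes the store from `obs_axis.data`, newer versions use `get_obs_shape()`, and from 1.16.2 on the experiment is prepared via `registration_mapping.prepare_experiment(...)` instead of resized. The caller-facing workflow is unchanged; a hedged sketch in which the import path and the `key` parameter are assumptions:

import lamindb as ln
from lamindb.core.storage import save_tiledbsoma_experiment  # assumed export path

adata = ln.core.datasets.mini_immuno.get_dataset1(otype="AnnData")
artifact = save_tiledbsoma_experiment(
    [adata],
    key="scrna/experiment.tiledbsoma",  # assumed parameter, analogous to Artifact's key
    measurement_name="RNA",
)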
lamindb/core/storage/objects.py
CHANGED
@@ -21,6 +21,7 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
     """Infer LaminDB storage file suffix from a data object."""
     if isinstance(dmem, AnnData):
         if format is not None:
+            # should be `.h5ad`, `.zarr`, or `.anndata.zarr`
             if format not in {"h5ad", "zarr", "anndata.zarr"}:
                 raise ValueError(
                     "Error when specifying AnnData storage format, it should be"
@@ -31,6 +32,8 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
         return ".h5ad"
 
     if isinstance(dmem, DataFrame):
+        if format == ".csv":
+            return ".csv"
         return ".parquet"
 
     if with_package_obj(
@@ -79,6 +82,9 @@ def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
         raise NotImplementedError
 
     if isinstance(dmem, DataFrame):
+        if filepath.suffix == ".csv":
+            dmem.to_csv(filepath)
+            return
         dmem.to_parquet(filepath)
         return
 
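
With both halves in place, a `DataFrame` can round-trip through CSV: `infer_suffix` honors `format=".csv"` (note the leading dot, unlike the dot-less AnnData formats above) and `write_to_disk` branches on the resulting suffix. A small local sketch:

from pathlib import Path
import pandas as pd
from lamindb.core.storage.objects import infer_suffix, write_to_disk

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
suffix = infer_suffix(df, format=".csv")  # ".csv" instead of the ".parquet" default
write_to_disk(df, Path(f"table{suffix}"))  # dispatches to DataFrame.to_csv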