lamindb 1.3.2__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff compares the content of publicly available package versions as they appear in their respective public registries and is provided for informational purposes only.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +216 -133
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3559
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1546 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/models/__init__.py +12 -2
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +384 -301
- lamindb/models/_from_values.py +1 -1
- lamindb/models/_is_versioned.py +5 -15
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +354 -177
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/can_curate.py +4 -1
- lamindb/models/collection.py +79 -56
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +78 -47
- lamindb/models/has_parents.py +24 -9
- lamindb/models/project.py +3 -3
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +251 -206
- lamindb/models/record.py +211 -344
- lamindb/models/run.py +59 -5
- lamindb/models/save.py +9 -5
- lamindb/models/schema.py +673 -196
- lamindb/models/transform.py +5 -14
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/METADATA +8 -7
- lamindb-1.5.0.dist-info/RECORD +108 -0
- lamindb-1.3.2.dist-info/RECORD +0 -95
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/LICENSE +0 -0
- {lamindb-1.3.2.dist-info → lamindb-1.5.0.dist-info}/WHEEL +0 -0
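Headline changes visible from the file list alone: the monolithic `curators/__init__.py` (−3559 lines) is split into `curators/core.py` and `curators/_legacy.py`; a new `examples` package and a `mini_immuno` dataset module are added; most of `base/ids.py` moves into a new `base/uids.py`; storage gains a polars backend (`_polars_lazy_df.py`) alongside pyarrow; and two new migrations (0091, 0092) ship with the release. The per-file diffs below cover a subset of these files.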
lamindb/core/_settings.py
CHANGED
@@ -9,6 +9,7 @@ from lamindb_setup._set_managed_storage import set_managed_storage
 from lamindb_setup.core._settings import settings as setup_settings
 from lamindb_setup.core._settings_instance import sanitize_git_repo_url
 
+from .subsettings._annotation_settings import AnnotationSettings, annotation_settings
 from .subsettings._creation_settings import CreationSettings, creation_settings
 
 if TYPE_CHECKING:
@@ -34,13 +35,13 @@ VERBOSITY_TO_STR: dict[int, str] = dict(
 class Settings:
     """Settings.
 
-    Use
+    Use `lamindb.settings` instead of instantiating this class yourself.
     """
 
-    def __init__(self
+    def __init__(self):
         self._verbosity_int: int = 1  # warning-level logging
         logger.set_verbosity(self._verbosity_int)
-        self._sync_git_repo: str | None =
+        self._sync_git_repo: str | None = None
 
     @property
     def creation(self) -> CreationSettings:
@@ -51,6 +52,15 @@ class Settings:
         """
         return creation_settings
 
+    @property
+    def annotation(self) -> AnnotationSettings:
+        """Artifact annotation settings.
+
+        For example, `ln.settings.creation.search_names = False` will disable
+        searching for records with similar names during creation.
+        """
+        return annotation_settings
+
     track_run_inputs: bool = True
     """Track files as input upon `.load()`, `.cache()` and `.open()`.
 
@@ -85,13 +95,18 @@ class Settings:
 
         Provide the full git repo URL.
         """
-
+        if self._sync_git_repo is not None:
+            return self._sync_git_repo
+        elif os.environ.get("LAMINDB_MULTI_INSTANCE") == "true":
+            return None
+        else:
+            return setup_settings.instance.git_repo
 
     @sync_git_repo.setter
     def sync_git_repo(self, value) -> None:
         """Sync transforms with scripts in git repository.
 
-        For example: `ln.sync_git_repo = https://github.com/laminlabs/redun-lamin`
+        For example: `ln.settings.sync_git_repo = https://github.com/laminlabs/redun-lamin`
         """
         self._sync_git_repo = sanitize_git_repo_url(value)
         if not self._sync_git_repo.startswith("https://"):  # pragma: nocover
@@ -99,28 +114,31 @@ class Settings:
 
     @property
     def storage(self) -> StorageSettings:
-        """
+        """Current default storage location for writes.
 
         Examples:
 
-
-
+            Retrieve the storage settings::
+
+                ln.settings.storage
+                #> StorageSettings(root='s3://my-bucket')
 
-
-            UPath('s3://my-bucket')
+            Retrieve the storage root::
 
-
-
+                ln.settings.storage.root
+                #> UPath('s3://my-bucket')
 
-
+            You can write artifacts to other storage locations by switching the current default storage location::
 
-
+                ln.settings.storage = "s3://some-bucket"
 
-
-
-
-
-
+            You can also pass additional fsspec kwargs via::
+
+                kwargs = dict(
+                    profile="some_profile",  # fsspec arg
+                    cache_regions=True  # fsspec arg for s3
+                )
+                ln.settings.storage = "s3://some-bucket", kwargs
         """
         return self._storage_settings
 
@@ -174,9 +192,4 @@ class Settings:
         logger.set_verbosity(verbosity_int)
 
 
-
-    git_repo = None
-else:
-    git_repo = setup_settings.instance.git_repo
-
-settings = Settings(git_repo=git_repo)
+settings = Settings()
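Taken together, these hunks add an `annotation` sub-settings namespace and turn `sync_git_repo` into a lazily resolved property, which is why the module-level `git_repo` plumbing disappears. A minimal usage sketch, assuming a connected lamindb instance (only names that appear in the diff above are used):

import lamindb as ln

# new in 1.5.0: annotation sub-settings, analogous to ln.settings.creation
ln.settings.annotation  # -> AnnotationSettings

# sync_git_repo now resolves lazily: an explicitly set value wins,
# LAMINDB_MULTI_INSTANCE="true" yields None, and otherwise the value
# configured on the instance via lamindb_setup is returned
ln.settings.sync_git_repo = "https://github.com/laminlabs/redun-lamin"
assert ln.settings.sync_git_repo.startswith("https://")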
lamindb/core/datasets/__init__.py
CHANGED
@@ -1,12 +1,17 @@
 """Test datasets.
 
+The mini immuno dataset.
+
+.. autosummary::
+   :toctree: .
+
+   mini_immuno
+
 Small in-memory datasets.
 
 .. autosummary::
    :toctree: .
 
-   small_dataset1
-   small_dataset2
    anndata_with_obs
 
 Files.
@@ -59,6 +64,7 @@ Other.
    fake_bio_notebook_titles
 """
 
+from . import mini_immuno
 from ._core import (
     anndata_file_pbmc68k_test,
     anndata_human_immune_cells,
@@ -88,7 +94,8 @@ from ._core import (
 from ._fake import fake_bio_notebook_titles
 from ._small import (
     anndata_with_obs,
-    small_dataset1,
-    small_dataset2,
     small_dataset3_cellxgene,
 )
+
+small_dataset1 = mini_immuno.get_dataset1  # backward compat
+small_dataset2 = mini_immuno.get_dataset2  # backward compat
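Because `small_dataset1` and `small_dataset2` survive only as aliases of the new `mini_immuno` accessors, old and new call sites return the same data. A sketch, assuming a working lamindb installation:

from lamindb.core import datasets

df_old = datasets.small_dataset1(otype="DataFrame")  # backward-compat alias
df_new = datasets.mini_immuno.get_dataset1(otype="DataFrame")
assert df_old.equals(df_new)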
lamindb/core/datasets/_core.py
CHANGED
@@ -322,8 +322,6 @@ def anndata_human_immune_cells(
 
     import lamindb as ln
 
-    verbosity = ln.settings.verbosity
-    ln.settings.verbosity = "error"
     ln.save(
         bt.Gene.from_values(
             adata.var.index, field="ensembl_gene_id", organism="human"
@@ -339,7 +337,6 @@ def anndata_human_immune_cells(
     ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
-    ln.settings.verbosity = verbosity
     return adata
 
 
@@ -560,11 +557,14 @@ def spatialdata_blobs() -> SpatialData:
     from spatialdata.datasets import blobs
 
     sdata = blobs()
-    sdata.attrs["
-        "assay": "Visium Spatial Gene Expression",
+    sdata.attrs["bio"] = {
         "disease": "Alzheimer disease",
         "developmental_stage": "adult stage",
     }
+    sdata.attrs["tech"] = {
+        "assay": "Visium Spatial Gene Expression",
+    }
+    sdata.attrs["random_int"] = 20
     sdata.tables["table"].var.index = [
         "ENSG00000139618",  # BRCA2
         "ENSG00000157764",  # BRAF
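The `spatialdata_blobs` change restructures flat metadata into separate biological and technical namespaces plus a scalar. After the hunk above, `sdata.attrs` has this shape (values taken verbatim from the diff):

# resulting sdata.attrs layout
{
    "bio": {
        "disease": "Alzheimer disease",
        "developmental_stage": "adult stage",
    },
    "tech": {
        "assay": "Visium Spatial Gene Expression",
    },
    "random_int": 20,
}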
lamindb/core/datasets/_small.py
CHANGED
@@ -7,99 +7,6 @@ import numpy as np
 import pandas as pd
 
 
-def small_dataset1(
-    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
-    gene_symbols_in_index: bool = False,
-    with_typo: bool = False,
-    with_cell_type_synonym: bool = False,
-    with_cell_type_typo: bool = False,
-) -> pd.DataFrame | ad.AnnData:
-    # define the data in the dataset
-    # it's a mix of numerical measurements and observation-level metadata
-    ifng = "IFNJ" if with_typo else "IFNG"
-    if gene_symbols_in_index:
-        var_ids = ["CD8A", "CD4", "CD14"]
-    else:
-        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
-    abt_cell = (
-        "CD8-pos alpha-beta T cell"
-        if with_cell_type_typo
-        else "CD8-positive, alpha-beta T cell"
-    )
-    dataset_dict = {
-        var_ids[0]: [1, 2, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [5, 6, 7],
-        "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
-        "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": pd.Categorical(
-            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
-        ),
-        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
-        "concentration": ["0.1%", "200 nM", "0.1%"],
-        "treatment_time_h": [24, 24, 6],
-        "donor": ["D0001", "D0002", None],
-    }
-    # define the dataset-level metadata
-    metadata = {
-        "temperature": 21.6,
-        "study": "Candidate marker study 1",
-        "date_of_study": "2024-12-01",
-        "study_note": "We had a great time performing this study and the results look compelling.",
-    }
-    # the dataset as DataFrame
-    dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-    if otype == "DataFrame":
-        for key, value in metadata.items():
-            dataset_df.attrs[key] = value
-        return dataset_df
-    else:
-        dataset_ad = ad.AnnData(
-            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-        )
-        return dataset_ad
-
-
-def small_dataset2(
-    otype: Literal["DataFrame", "AnnData"],
-    gene_symbols_in_index: bool = False,
-) -> pd.DataFrame | ad.AnnData:
-    if gene_symbols_in_index:
-        var_ids = ["CD8A", "CD4", "CD38"]
-    else:
-        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
-    dataset_dict = {
-        var_ids[0]: [2, 3, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [4, 2, 3],
-        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
-        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-    }
-    metadata = {
-        "temperature": 22.6,
-        "study": "Candidate marker study 2",
-        "date_of_study": "2025-02-13",
-    }
-    dataset_df = pd.DataFrame(
-        dataset_dict,
-        index=["sample4", "sample5", "sample6"],
-    )
-    ad.AnnData(
-        dataset_df[var_ids],
-        obs=dataset_df[["perturbation", "cell_type_by_model"]],
-    )
-    if otype == "DataFrame":
-        for key, value in metadata.items():
-            dataset_df.attrs[key] = value
-        return dataset_df
-    else:
-        dataset_ad = ad.AnnData(
-            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-        )
-        return dataset_ad
-
-
 def small_dataset3_cellxgene(
     otype: Literal["DataFrame", "AnnData"] = "AnnData",
 ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
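The two functions removed here are not dropped from the API: they reappear in the new `mini_immuno` module below as `get_dataset1` and `get_dataset2`, extended with flags that deliberately inject curation errors, and they remain importable under their old names via the aliases added in `datasets/__init__.py` above.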
lamindb/core/datasets/mini_immuno.py
ADDED
@@ -0,0 +1,172 @@
+"""The mini immuno dataset.
+
+.. autosummary::
+   :toctree: .
+
+   define_features_labels
+   get_dataset1
+   get_dataset2
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import anndata as ad
+import pandas as pd
+
+if TYPE_CHECKING:
+    from lamindb.models import Schema
+
+
+def define_features_labels() -> None:
+    """Features & labels to validate the mini immuno datasets.
+
+    .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+       :language: python
+    """
+    import sys
+    from pathlib import Path
+
+    docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+    if str(docs_path) not in sys.path:
+        sys.path.append(str(docs_path))
+
+    import define_mini_immuno_features_labels  # noqa
+
+
+def define_mini_immuno_schema_flexible() -> Schema:
+    """Features & labels to validate the mini immuno datasets.
+
+    .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
+       :language: python
+    """
+    import sys
+    from pathlib import Path
+
+    from lamindb.models import Schema
+
+    docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+    if str(docs_path) not in sys.path:
+        sys.path.append(str(docs_path))
+
+    define_features_labels()
+    import define_mini_immuno_schema_flexible  # noqa
+
+    return Schema.get(name="Mini immuno schema")
+
+
+def get_dataset1(
+    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
+    gene_symbols_in_index: bool = False,
+    with_typo: bool = False,
+    with_cell_type_synonym: bool = False,
+    with_cell_type_typo: bool = False,
+    with_gene_typo: bool = False,
+    with_outdated_gene: bool = False,
+    with_wrong_subtype: bool = False,
+    with_index_type_mismatch: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    """A small tabular dataset measuring expression & metadata."""
+    # define the data in the dataset
+    # it's a mix of numerical measurements and observation-level metadata
+    ifng = "IFNJ" if with_typo else "IFNG"
+    thing = "ulabel_but_not_perturbation" if with_wrong_subtype else "DMSO"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14" if not with_gene_typo else "GeneTypo"]
+    else:
+        var_ids = [
+            "ENSG00000153563",
+            "ENSG00000010610",
+            "ENSG00000170458"
+            if not with_gene_typo
+            else "GeneTypo"
+            if not with_outdated_gene
+            else "ENSG00000278198",
+        ]
+    abt_cell = (
+        "CD8-pos alpha-beta T cell"
+        if with_cell_type_typo
+        else "CD8-positive, alpha-beta T cell"
+    )
+    dataset_dict = {
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "perturbation": pd.Categorical(["DMSO", ifng, thing]),
+        "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
+        "cell_type_by_expert": pd.Categorical(
+            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
+        ),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0001", "D0002", None],
+    }
+    # define the dataset-level metadata
+    metadata = {
+        "temperature": 21.6,
+        "experiment": "Experiment 1",
+        "date_of_study": "2024-12-01",
+        "study_note": "We had a great time performing this study and the results look compelling.",
+    }
+    # the dataset as DataFrame
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample1", "sample2", 0]  # type: ignore
+        if with_index_type_mismatch
+        else ["sample1", "sample2", "sample3"],
+    )
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(
+            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+        )
+        return dataset_ad
+
+
+def get_dataset2(
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD38"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
+    dataset_dict = {
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0003", "D0003", "D0004"],
+    }
+    metadata = {
+        "temperature": 22.6,
+        "experiment": "Experiment 2",
+        "date_of_study": "2025-02-13",
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample4", "sample5", "sample6"],
+    )
+    ad.AnnData(
+        dataset_df[var_ids],
+        obs=dataset_df[["perturbation", "cell_type_by_model"]],
+    )
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(
+            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+        )
+        return dataset_ad
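The `with_*` flags on `get_dataset1` each inject one specific validation problem, which makes the module convenient for exercising curators. A minimal sketch:

from lamindb.core.datasets import mini_immuno

df = mini_immuno.get_dataset1(otype="DataFrame")     # clean DataFrame
adata = mini_immuno.get_dataset1(otype="AnnData")    # same data as AnnData
df_typo = mini_immuno.get_dataset1(with_typo=True)   # "IFNJ" instead of "IFNG"
assert (df_typo["perturbation"] == "IFNJ").any()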
lamindb/core/loaders.py
CHANGED
@@ -44,7 +44,7 @@ try:
 except ImportError:
 
     def load_zarr(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr<=2.18.4")
+        raise ImportError("Please install zarr: pip install 'zarr<=2.18.4'")
 
 
 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
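The added quotes are a real fix: in a POSIX shell, the unquoted `<` in `pip install zarr<=2.18.4` is parsed as input redirection from a file named `=2.18.4`, so the version constraint would never reach pip.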
lamindb/core/storage/_backed_access.py
CHANGED
@@ -1,20 +1,26 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
 from anndata._io.specs.registry import get_spec
 
 from ._anndata_accessor import AnnDataAccessor, StorageType, registry
-from .
+from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df
+from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset
 from ._tiledbsoma import _open_tiledbsoma
 from .paths import filepath_from_artifact
 
 if TYPE_CHECKING:
+    from collections.abc import Iterator
+
     from fsspec.core import OpenFile
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement
     from upath import UPath
 
     from lamindb.models.artifact import Artifact
@@ -69,10 +75,17 @@ class BackedAccessor:
 def backed_access(
     artifact_or_filepath: Artifact | UPath,
     mode: str = "r",
+    engine: Literal["pyarrow", "polars"] = "pyarrow",
    using_key: str | None = None,
    **kwargs,
 ) -> (
-    AnnDataAccessor
+    AnnDataAccessor
+    | BackedAccessor
+    | SOMACollection
+    | SOMAExperiment
+    | SOMAMeasurement
+    | PyArrowDataset
+    | Iterator[PolarsLazyFrame]
 ):
     from lamindb.models import Artifact
 
@@ -97,12 +110,15 @@ def backed_access(
         conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
         conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
-    elif
-
+    elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
+        df_suffix := df_suffixes.pop()
+    ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
+        return _open_dataframe(objectpath, df_suffix, engine, **kwargs)
     else:
         raise ValueError(
             "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
-            f"
+            f"be compatible with pyarrow.dataset.dataset or polars.scan_* functions, "
+            f"instead of being {suffix} object."
         )
 
     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
@@ -112,3 +128,81 @@ def backed_access(
         return AnnDataAccessor(conn, storage, name)
     else:
         return BackedAccessor(conn, storage)
+
+
+def _flat_suffixes(paths: UPath | list[UPath]) -> set[str]:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    path_list = []
+    if isinstance(paths, Path):
+        paths = [paths]
+    for path in paths:
+        # assume http is always a file
+        if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+            path_list += [p for p in path.rglob("*") if p.suffix != ""]
+        else:
+            path_list.append(path)
+
+    suffixes = set()
+    for path in path_list:
+        path_suffixes = path.suffixes
+        # this doesn't work for externally gzipped files, REMOVE LATER
+        path_suffix = (
+            path_suffixes[-2]
+            if len(path_suffixes) > 1 and ".gz" in path_suffixes
+            else path.suffix
+        )
+        suffixes.add(path_suffix)
+    return suffixes
+
+
+def _open_dataframe(
+    paths: UPath | list[UPath],
+    suffix: str | None = None,
+    engine: Literal["pyarrow", "polars"] = "pyarrow",
+    **kwargs,
+) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+    df_suffix: str
+    if suffix is None:
+        df_suffixes = _flat_suffixes(paths)
+        if len(df_suffixes) > 1:
+            raise ValueError(
+                f"The artifacts in the collection have different file formats: {', '.join(df_suffixes)}.\n"
+                "It is not possible to open such stores with pyarrow or polars."
+            )
+        df_suffix = df_suffixes.pop()
+    else:
+        df_suffix = suffix
+
+    if engine == "pyarrow":
+        if df_suffix not in PYARROW_SUFFIXES:
+            raise ValueError(
+                f"{df_suffix} files are not supported by pyarrow, "
+                f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
+            )
+        # this checks that the filesystem is the same for all paths
+        # this is a requirement of pyarrow.dataset.dataset
+        if not isinstance(paths, Path):  # is a list then
+            fs = getattr(paths[0], "fs", None)
+            for path in paths[1:]:
+                # this assumes that the filesystems are cached by fsspec
+                if getattr(path, "fs", None) is not fs:
+                    raise ValueError(
+                        "The collection has artifacts with different filesystems, "
+                        "this is not supported by pyarrow."
+                    )
+        dataframe = _open_pyarrow_dataset(paths, **kwargs)
+    elif engine == "polars":
+        if df_suffix not in POLARS_SUFFIXES:
+            raise ValueError(
+                f"{df_suffix} files are not supported by polars, "
+                f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
+            )
+        dataframe = _open_polars_lazy_df(paths, **kwargs)
+    else:
+        raise ValueError(
+            f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+        )
+
+    return dataframe
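How the new `engine` parameter surfaces to a caller of `backed_access`, per the signatures above — a sketch with a hypothetical parquet path; the polars branch additionally requires the optional `polars` dependency:

from upath import UPath

from lamindb.core.storage._backed_access import backed_access

path = UPath("s3://my-bucket/measurements.parquet")  # hypothetical path

# the default engine returns a pyarrow.dataset.Dataset directly
dataset = backed_access(path, engine="pyarrow")

# polars instead returns a context manager yielding a polars.LazyFrame,
# since the file handles opened for scanning must stay alive until collection
with backed_access(path, engine="polars") as lazy_df:
    result = lazy_df.collect()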
lamindb/core/storage/_polars_lazy_df.py
ADDED
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from polars import LazyFrame as PolarsLazyFrame
+    from upath import UPath
+
+POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc")
+
+
+@contextmanager
+def _open_polars_lazy_df(
+    paths: UPath | list[UPath], **kwargs
+) -> Iterator[PolarsLazyFrame]:
+    try:
+        import polars as pl
+    except ImportError as ie:
+        raise ImportError("Please install polars: pip install polars") from ie
+
+    scans = {
+        ".parquet": pl.scan_parquet,
+        ".csv": pl.scan_csv,
+        ".ndjson": pl.scan_ndjson,
+        ".ipc": pl.scan_ipc,
+    }
+
+    path_list = []
+    if isinstance(paths, Path):
+        paths = [paths]
+    for path in paths:
+        # assume http is always a file
+        if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+            path_list += [p for p in path.rglob("*") if p.suffix != ""]
+        else:
+            path_list.append(path)
+
+    open_files = []
+
+    try:
+        for path in path_list:
+            open_files.append(path.open(mode="rb"))
+
+        yield scans[path_list[0].suffix](open_files, **kwargs)
+    finally:
+        for open_file in open_files:
+            open_file.close()
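`_open_polars_lazy_df` is written as a generator-based context manager because `pl.scan_parquet` and friends read lazily from the file objects passed in: the handles opened in the `try` block must outlive the yielded `LazyFrame` until it is collected, and the `finally` block guarantees they are closed afterwards. A usage sketch with a hypothetical local file:

from pathlib import Path

from lamindb.core.storage._polars_lazy_df import _open_polars_lazy_df

with _open_polars_lazy_df(Path("data/table.parquet")) as lf:  # hypothetical file
    df = lf.collect()  # materialize while the underlying handles are still open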