lamindb 1.0.4__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. lamindb/__init__.py +14 -5
  2. lamindb/_artifact.py +174 -57
  3. lamindb/_can_curate.py +27 -8
  4. lamindb/_collection.py +85 -51
  5. lamindb/_feature.py +177 -41
  6. lamindb/_finish.py +222 -81
  7. lamindb/_from_values.py +83 -98
  8. lamindb/_parents.py +4 -4
  9. lamindb/_query_set.py +59 -17
  10. lamindb/_record.py +171 -53
  11. lamindb/_run.py +4 -4
  12. lamindb/_save.py +33 -10
  13. lamindb/_schema.py +135 -38
  14. lamindb/_storage.py +1 -1
  15. lamindb/_tracked.py +106 -0
  16. lamindb/_transform.py +21 -8
  17. lamindb/_ulabel.py +5 -14
  18. lamindb/base/validation.py +2 -6
  19. lamindb/core/__init__.py +13 -14
  20. lamindb/core/_context.py +39 -36
  21. lamindb/core/_data.py +29 -25
  22. lamindb/core/_describe.py +1 -1
  23. lamindb/core/_django.py +1 -1
  24. lamindb/core/_feature_manager.py +54 -44
  25. lamindb/core/_label_manager.py +4 -4
  26. lamindb/core/_mapped_collection.py +20 -7
  27. lamindb/core/datasets/__init__.py +6 -1
  28. lamindb/core/datasets/_core.py +12 -11
  29. lamindb/core/datasets/_small.py +66 -20
  30. lamindb/core/exceptions.py +1 -90
  31. lamindb/core/loaders.py +7 -13
  32. lamindb/core/relations.py +6 -4
  33. lamindb/core/storage/_anndata_accessor.py +41 -0
  34. lamindb/core/storage/_backed_access.py +2 -2
  35. lamindb/core/storage/_pyarrow_dataset.py +25 -15
  36. lamindb/core/storage/_tiledbsoma.py +56 -12
  37. lamindb/core/storage/paths.py +41 -22
  38. lamindb/core/subsettings/_creation_settings.py +4 -16
  39. lamindb/curators/__init__.py +2168 -833
  40. lamindb/curators/_cellxgene_schemas/__init__.py +26 -0
  41. lamindb/curators/_cellxgene_schemas/schema_versions.yml +104 -0
  42. lamindb/errors.py +96 -0
  43. lamindb/integrations/_vitessce.py +3 -3
  44. lamindb/migrations/0069_squashed.py +76 -75
  45. lamindb/migrations/0075_lamindbv1_part5.py +4 -5
  46. lamindb/migrations/0082_alter_feature_dtype.py +21 -0
  47. lamindb/migrations/0083_alter_feature_is_type_alter_flextable_is_type_and_more.py +94 -0
  48. lamindb/migrations/0084_alter_schemafeature_feature_and_more.py +35 -0
  49. lamindb/migrations/0085_alter_feature_is_type_alter_flextable_is_type_and_more.py +63 -0
  50. lamindb/migrations/0086_various.py +95 -0
  51. lamindb/migrations/0087_rename__schemas_m2m_artifact_feature_sets_and_more.py +41 -0
  52. lamindb/migrations/0088_schema_components.py +273 -0
  53. lamindb/migrations/0088_squashed.py +4372 -0
  54. lamindb/models.py +423 -156
  55. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/METADATA +10 -7
  56. lamindb-1.1.0.dist-info/RECORD +95 -0
  57. lamindb/curators/_spatial.py +0 -528
  58. lamindb/migrations/0052_squashed.py +0 -1261
  59. lamindb/migrations/0053_alter_featureset_hash_alter_paramvalue_created_by_and_more.py +0 -57
  60. lamindb/migrations/0054_alter_feature_previous_runs_and_more.py +0 -35
  61. lamindb/migrations/0055_artifact_type_artifactparamvalue_and_more.py +0 -61
  62. lamindb/migrations/0056_rename_ulabel_ref_is_name_artifactulabel_label_ref_is_name_and_more.py +0 -22
  63. lamindb/migrations/0057_link_models_latest_report_and_others.py +0 -356
  64. lamindb/migrations/0058_artifact__actions_collection__actions.py +0 -22
  65. lamindb/migrations/0059_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -31
  66. lamindb/migrations/0060_alter_artifact__actions.py +0 -22
  67. lamindb/migrations/0061_alter_collection_meta_artifact_alter_run_environment_and_more.py +0 -45
  68. lamindb/migrations/0062_add_is_latest_field.py +0 -32
  69. lamindb/migrations/0063_populate_latest_field.py +0 -45
  70. lamindb/migrations/0064_alter_artifact_version_alter_collection_version_and_more.py +0 -33
  71. lamindb/migrations/0065_remove_collection_feature_sets_and_more.py +0 -22
  72. lamindb/migrations/0066_alter_artifact__feature_values_and_more.py +0 -352
  73. lamindb/migrations/0067_alter_featurevalue_unique_together_and_more.py +0 -20
  74. lamindb/migrations/0068_alter_artifactulabel_unique_together_and_more.py +0 -20
  75. lamindb/migrations/0069_alter_artifact__accessor_alter_artifact__hash_type_and_more.py +0 -1294
  76. lamindb-1.0.4.dist-info/RECORD +0 -102
  77. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/LICENSE +0 -0
  78. {lamindb-1.0.4.dist-info → lamindb-1.1.0.dist-info}/WHEEL +0 -0
lamindb/core/_mapped_collection.py CHANGED
@@ -87,7 +87,7 @@ class MappedCollection:
         obs_keys: Keys from the ``.obs`` slots.
         obs_filter: Select only observations with these values for the given obs columns.
             Should be a dictionary with obs column names as keys
-            and filtering values (a string or a tuple of strings) as values.
+            and filtering values (a string or a list of strings) as values.
         join: `"inner"` or `"outer"` virtual joins. If ``None`` is passed,
             does not join.
         encode_labels: Encode labels into integers.
@@ -106,7 +106,7 @@ class MappedCollection:
         layers_keys: str | list[str] | None = None,
         obs_keys: str | list[str] | None = None,
         obsm_keys: str | list[str] | None = None,
-        obs_filter: dict[str, str | tuple[str, ...]] | None = None,
+        obs_filter: dict[str, str | list[str]] | None = None,
         join: Literal["inner", "outer"] | None = "inner",
         encode_labels: bool | list[str] = True,
         unknown_label: str | dict[str, str] | None = None,
@@ -184,9 +184,14 @@ class MappedCollection:
         if self.filtered:
             indices_storage_mask = None
             for obs_filter_key, obs_filter_values in obs_filter.items():
-                obs_filter_mask = np.isin(
-                    self._get_labels(store, obs_filter_key), obs_filter_values
-                )
+                if isinstance(obs_filter_values, tuple):
+                    obs_filter_values = list(obs_filter_values)
+                elif not isinstance(obs_filter_values, list):
+                    obs_filter_values = [obs_filter_values]
+                obs_labels = self._get_labels(store, obs_filter_key)
+                obs_filter_mask = np.isin(obs_labels, obs_filter_values)
+                if pd.isna(obs_filter_values).any():
+                    obs_filter_mask |= pd.isna(obs_labels)
                 if indices_storage_mask is None:
                     indices_storage_mask = obs_filter_mask
                 else:
@@ -296,7 +301,7 @@ class MappedCollection:
         self.var_joint = reduce(pd.Index.intersection, self.var_list)
         if len(self.var_joint) == 0:
             raise ValueError(
-                "The provided AnnData objects don't have shared varibales.\n"
+                "The provided AnnData objects don't have shared variables.\n"
                 "Use join='outer'."
             )
         self.var_indices = [
@@ -389,7 +394,7 @@ class MappedCollection:
             else:
                 cats = None
             label_idx = self._get_obs_idx(store, obs_idx, label, cats)
-            if label in self.encoders:
+            if label in self.encoders and label_idx is not np.nan:
                label_idx = self.encoders[label][label_idx]
            out[label] = label_idx
        return out
@@ -453,6 +458,8 @@ class MappedCollection:
            label = labels[idx]
        else:
            label = labels["codes"][idx]
+            if label == -1:
+                return np.nan
        if categories is not None:
            cats = categories
        else:
@@ -589,7 +596,13 @@ class MappedCollection:
        cats = self._get_categories(storage, label_key)
        if cats is not None:
            cats = _decode(cats) if isinstance(cats[0], bytes) else cats
+            # NaN is coded as -1
+            nans = labels == -1
            labels = cats[labels]
+            # detect and replace nans
+            if nans.any():
+                labels[nans] = np.nan
+
        return labels

    def close(self):
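In practice, the obs_filter changes above mean filter values can now be passed as plain lists (tuples are still accepted and converted), and labels that decode to NaN are no longer pushed through the encoders. A minimal usage sketch, assuming a saved Collection of AnnData artifacts with a hypothetical key "my-anndata-collection":

```python
import lamindb as ln

collection = ln.Collection.get(key="my-anndata-collection")  # hypothetical key

# lists (not only tuples) are now accepted as obs_filter values
with collection.mapped(
    obs_keys=["cell_medium"],
    obs_filter={"cell_medium": ["DMSO", "IFNG"]},
    join="inner",
) as dataset:
    print(len(dataset))       # number of observations passing the filter
    print(dataset[0].keys())  # the expression row plus the requested obs keys
```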
lamindb/core/datasets/__init__.py CHANGED
@@ -85,4 +85,9 @@ from ._core import (
     schmidt22_perturbseq,
 )
 from ._fake import fake_bio_notebook_titles
-from ._small import anndata_with_obs, small_dataset1, small_dataset2
+from ._small import (
+    anndata_with_obs,
+    small_dataset1,
+    small_dataset2,
+    small_dataset3_cellxgene,
+)
lamindb/core/datasets/_core.py CHANGED
@@ -18,7 +18,8 @@ if TYPE_CHECKING:
 def file_fcs() -> Path:
     """Example FCS artifact."""
     filepath, _ = urlretrieve(
-        "https://lamindb-test.s3.amazonaws.com/example.fcs", "example.fcs"
+        "https://lamindb-dev-datasets.s3.amazonaws.com/.lamindb/DBNEczSgBui0bbzBXMGH.fcs",
+        "example.fcs",
     )
     return Path(filepath)

@@ -48,8 +49,8 @@ def file_fcs_alpert19(populate_registries: bool = False) -> Path:  # pragma: no
             bt.CellMarker.public().inspect(std, "name").validated, "name"
         )
     )
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
     ln.settings.verbosity = verbosity
     return Path(filepath)

@@ -84,8 +85,8 @@ def file_tsv_rnaseq_nfcore_salmon_merged_gene_counts(

     verbosity = ln.settings.verbosity
     ln.settings.verbosity = "error"
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008896").save()
     ln.settings.verbosity = verbosity

@@ -207,7 +208,7 @@ def anndata_mouse_sc_lymph_node(
     # cell types
     ln.save(bt.CellType.from_values(["CL:0000115", "CL:0000738"], "ontology_id"))
     # assays
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     # genes
     validated = bt.Gene.public(organism="mouse").validate(
@@ -330,11 +331,11 @@ def anndata_human_immune_cells(
     ln.save(bt.CellType.from_values(adata.obs.cell_type, field="name"))
     ln.save(bt.ExperimentalFactor.from_values(adata.obs.assay, field="name"))
     ln.save(bt.Tissue.from_values(adata.obs.tissue, field="name"))
-    ln.Feature(name="cell_type", dtype=[bt.CellType]).save()
-    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()
-    ln.Feature(name="tissue", dtype=[bt.Tissue]).save()
-    ln.Feature(name="organism", dtype=[bt.Organism]).save()
-    ln.Feature(name="donor", dtype=[ln.ULabel]).save()
+    ln.Feature(name="cell_type", dtype=[bt.CellType]).save()  # type: ignore
+    ln.Feature(name="assay", dtype=[bt.ExperimentalFactor]).save()  # type: ignore
+    ln.Feature(name="tissue", dtype=[bt.Tissue]).save()  # type: ignore
+    ln.Feature(name="organism", dtype=[bt.Organism]).save()  # type: ignore
+    ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
     ln.settings.verbosity = verbosity
lamindb/core/datasets/_small.py CHANGED
@@ -8,20 +8,25 @@ import pandas as pd


 def small_dataset1(
-    format: Literal["df", "anndata"],
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
     with_typo: bool = False,
-) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+) -> pd.DataFrame | ad.AnnData:
     # define the data in the dataset
     # it's a mix of numerical measurements and observation-level metadata
     ifng = "IFNJ" if with_typo else "IFNG"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
     dataset_dict = {
-        "CD8A": [1, 2, 3],
-        "CD4": [3, 4, 5],
-        "CD14": [5, 6, 7],
-        "cell_medium": ["DMSO", ifng, "DMSO"],
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "cell_medium": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": ["B cell", "T cell", "T cell"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     # define the dataset-level metadata
     metadata = {
@@ -32,8 +37,10 @@ def small_dataset1(
     }
     # the dataset as DataFrame
     dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-    if format == "df":
-        return dataset_df, metadata
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
     else:
         dataset_ad = ad.AnnData(
             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
@@ -42,14 +49,19 @@ def small_dataset1(


 def small_dataset2(
-    format: Literal["df", "anndata"],
-) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD38"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
     dataset_dict = {
-        "CD8A": [2, 3, 3],
-        "CD4": [3, 4, 5],
-        "CD38": [4, 2, 3],
-        "cell_medium": ["DMSO", "IFNG", "IFNG"],
-        "cell_type_by_model": ["B cell", "T cell", "T cell"],
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "cell_medium": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     metadata = {
         "temperature": 22.6,
@@ -61,11 +73,13 @@ def small_dataset2(
         index=["sample4", "sample5", "sample6"],
     )
     ad.AnnData(
-        dataset_df[["CD8A", "CD4", "CD38"]],
+        dataset_df[var_ids],
         obs=dataset_df[["cell_medium", "cell_type_by_model"]],
     )
-    if format == "df":
-        return dataset_df, metadata
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
     else:
         dataset_ad = ad.AnnData(
             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
@@ -73,6 +87,38 @@ def small_dataset2(
         return dataset_ad


+def small_dataset3_cellxgene(
+    otype: Literal["DataFrame", "AnnData"] = "AnnData",
+) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
+    # TODO: consider other ids for other organisms
+    # "ENSMUSG00002076988"
+    var_ids = ["invalid_ensembl_id", "ENSG00000000419", "ENSG00000139618"]
+    dataset_dict = {
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "disease_ontology_term_id": ["MONDO:0004975", "MONDO:0004980", "MONDO:0004980"],
+        "organism": ["human", "human", "human"],
+        "sex": ["female", "male", "unknown"],
+        "tissue": ["lungg", "lungg", "heart"],
+        "donor": ["-1", "1", "2"],
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["barcode1", "barcode2", "barcode3"],
+    )
+    dataset_df["tissue"] = dataset_df["tissue"].astype("category")
+    ad.AnnData(
+        dataset_df[var_ids],
+        obs=dataset_df[[key for key in dataset_dict if key not in var_ids]],
+    )
+    if otype == "DataFrame":
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:])
+        return dataset_ad
+
+
 def anndata_with_obs() -> ad.AnnData:
     """Create a mini anndata with cell_type, disease and tissue."""
     import anndata as ad
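For orientation (not part of the diff itself): the reworked generators take `otype` instead of `format`, return a bare DataFrame with the dataset-level metadata tucked into `.attrs`, and can optionally use gene symbols rather than Ensembl ids as variable names. A sketch, assuming the documented access path `ln.core.datasets`:

```python
import lamindb as ln

# DataFrame flavor: metadata now travels in df.attrs rather than a separate dict
df = ln.core.datasets.small_dataset1(otype="DataFrame")
print(df.attrs)        # dataset-level metadata as defined in the module
print(df.columns[:3])  # Ensembl gene ids by default

# AnnData flavor with gene symbols in the var index
adata = ln.core.datasets.small_dataset2(otype="AnnData", gene_symbols_in_index=True)
print(adata.var_names.tolist())  # ["CD8A", "CD4", "CD38"]

# the new CELLxGENE-flavored dataset carries intentionally invalid values
adata3 = ln.core.datasets.small_dataset3_cellxgene()
print(adata3.obs["tissue"].unique())  # includes the misspelled term "lungg"
```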
lamindb/core/exceptions.py CHANGED
@@ -1,90 +1 @@
-"""Exceptions.
-
-.. autosummary::
-   :toctree: .
-
-   InvalidArgument
-   DoesNotExist
-   ValidationError
-   NotebookNotSaved
-   MissingContextUID
-   UpdateContext
-   IntegrityError
-   RecordNameChangeIntegrityError
-
-"""
-
-# inheriting from SystemExit has the sole purpose of suppressing
-# the traceback - this isn't optimal but the current best solution
-# https://laminlabs.slack.com/archives/C04A0RMA0SC/p1726856875597489
-
-
-class InvalidArgument(SystemExit):
-    """Invalid method or function argument."""
-
-    pass
-
-
-class TrackNotCalled(SystemExit):
-    """`ln.track()` wasn't called."""
-
-    pass
-
-
-class NotebookNotSaved(SystemExit):
-    """Notebook wasn't saved."""
-
-    pass
-
-
-class ValidationError(SystemExit):
-    """Validation error: not mapped in registry."""
-
-    pass
-
-
-# inspired by Django's DoesNotExist
-# equivalent to SQLAlchemy's NoResultFound
-class DoesNotExist(SystemExit):
-    """No record found."""
-
-    pass
-
-
-class InconsistentKey(Exception):
-    """Inconsistent transform or artifact `key`."""
-
-    pass
-
-
-class RecordNameChangeIntegrityError(SystemExit):
-    """Custom exception for name change errors."""
-
-    pass
-
-
-# -------------------------------------------------------------------------------------
-# run context
-# -------------------------------------------------------------------------------------
-
-
-class IntegrityError(Exception):
-    """Integrity error.
-
-    For instance, it's not allowed to delete artifacts outside managed storage
-    locations.
-    """
-
-    pass
-
-
-class MissingContextUID(SystemExit):
-    """User didn't define transform settings."""
-
-    pass
-
-
-class UpdateContext(SystemExit):
-    """Transform settings require update."""
-
-    pass
+from ..errors import *  # noqa: F403 backward compat
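The net effect of this hunk: the exception classes now live in the new top-level `lamindb/errors.py` (added in the file list above), and `lamindb.core.exceptions` only star-re-exports them for backward compatibility. A sketch of the two import paths, assuming names such as `ValidationError` and `DoesNotExist` are defined unchanged in `lamindb.errors`:

```python
# new canonical location
from lamindb.errors import DoesNotExist, ValidationError

# old location keeps working through the star re-export shown above
from lamindb.core.exceptions import ValidationError as LegacyValidationError

# both names refer to the same class object
assert ValidationError is LegacyValidationError
```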
lamindb/core/loaders.py CHANGED
@@ -40,7 +40,7 @@ try:
 except ImportError:

     def load_anndata_zarr(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr")
+        raise ImportError("Please install zarr: pip install zarr<=2.18.4")


 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
@@ -109,19 +109,13 @@ def load_json(path: UPathStr) -> dict:
     return data


-def load_yaml(path: UPathStr) -> dict | UPathStr:
+def load_yaml(path: UPathStr) -> dict:
     """Load `.yaml` to `dict`."""
-    try:
-        import yaml  # type: ignore
-
-        with open(path) as f:
-            data = yaml.safe_load(f)
-        return data
-    except ImportError:
-        logger.warning(
-            "Please install PyYAML (`pip install PyYAML`) to load `.yaml` files."
-        )
-        return path
+    import yaml  # type: ignore
+
+    with open(path) as f:
+        data = yaml.safe_load(f)
+    return data


 def load_image(path: UPathStr) -> None | UPathStr:
lamindb/core/relations.py CHANGED
@@ -8,7 +8,7 @@ from lamindb_setup._connect_instance import (
 )
 from lamindb_setup.core._settings_store import instance_settings_file

-from lamindb.models import LinkORM, Record, Schema
+from lamindb.models import LinkORM, Record, Registry, Schema


 def get_schema_modules(instance: str | None) -> set[str]:
@@ -35,9 +35,11 @@ def get_schema_modules(instance: str | None) -> set[str]:
     return shared_schema_modules


+# this function here should likely be renamed
+# it maps the __get_name_with_module__() onto the actual model
 def dict_module_name_to_model_name(
-    registry: type[Record], instance: str | None = None
-) -> dict[str, Record]:
+    registry: Registry, instance: str | None = None
+) -> dict[str, Registry]:
     schema_modules = get_schema_modules(instance)
     d: dict = {
         i.related_model.__get_name_with_module__(): i.related_model
@@ -92,7 +94,7 @@ def get_related_name(features_type: type[Record]) -> str:
         f"Can't create feature sets from {features_type.__name__} because it's not"
         " related to it!\nYou need to create a link model between Schema and"
         " your Record in your custom module.\nTo do so, add a"
-        " line:\n_schemas_m2m = models.ManyToMany(Schema,"
+        " line:\n_feature_sets = models.ManyToMany(Schema,"
         " related_name='mythings')\n"
     )
     return candidates[0]
lamindb/core/storage/_anndata_accessor.py CHANGED
@@ -19,6 +19,7 @@ from fsspec.implementations.local import LocalFileSystem
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
+from upath import UPath

 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -741,3 +742,43 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
         return AnnDataRawAccessor(
             self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
         )
+
+
+# get the number of observations in an anndata object or file fast and safely
+def _anndata_n_observations(object: UPathStr | AnnData) -> int | None:
+    if isinstance(object, AnnData):
+        return object.n_obs
+
+    try:
+        objectpath = UPath(object)
+        suffix = objectpath.suffix
+        conn_module = {".h5ad": "h5py", ".zarr": "zarr"}.get(suffix, suffix[1:])
+        conn, storage = registry.open(conn_module, objectpath, mode="r")
+    except Exception as e:
+        logger.warning(f"Could not open {object} to read n_observations: {e}")
+        return None
+
+    n_observations: int | None = None
+    try:
+        obs = storage["obs"]
+        if isinstance(obs, GroupTypes):  # type: ignore
+            if "_index" in obs.attrs:
+                elem_key = _read_attr(obs.attrs, "_index")
+            else:
+                elem_key = next(iter(obs))
+            elem = obs[elem_key]
+            if isinstance(elem, ArrayTypes):  # type: ignore
+                n_observations = elem.shape[0]
+            else:
+                # assume standard obs group
+                n_observations = elem["codes"].shape[0]
+        else:
+            n_observations = obs.shape[0]
+    except Exception as e:
+        logger.warning(f"Could not read n_observations from anndata {object}: {e}")
+    finally:
+        if hasattr(storage, "close"):
+            storage.close()
+        if hasattr(conn, "close"):
+            conn.close()
+    return n_observations
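The helper added above reads only the `obs` group rather than loading the whole object: it takes the length of the index column, or of the `codes` array when that column is categorical. A rough standalone sketch of the same idea for a local `.h5ad` file using `h5py` directly (hypothetical file name; the real helper goes through the connection registry and therefore also covers zarr and remote stores):

```python
import h5py

def n_obs_h5ad(path: str) -> int:
    """Count observations without loading the AnnData object into memory."""
    with h5py.File(path, mode="r") as f:
        obs = f["obs"]
        # anndata records the name of the obs index column in the "_index" attribute
        index_key = obs.attrs.get("_index", next(iter(obs)))
        if isinstance(index_key, bytes):
            index_key = index_key.decode()
        elem = obs[index_key]
        if isinstance(elem, h5py.Group):
            # categorical columns are groups holding "codes" and "categories"
            return elem["codes"].shape[0]
        return elem.shape[0]

print(n_obs_h5ad("example.h5ad"))  # hypothetical path
```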
lamindb/core/storage/_backed_access.py CHANGED
@@ -94,8 +94,8 @@ def backed_access(
         return _open_pyarrow_dataset(objectpath)
     else:
         raise ValueError(
-            "object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix, not"
-            f" {suffix}."
+            "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
+            f"or be compatible with pyarrow.dataset.dataset, instead of being {suffix} object."
         )

     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
lamindb/core/storage/_pyarrow_dataset.py CHANGED
@@ -6,26 +6,36 @@ import pyarrow.dataset
 from lamindb_setup.core.upath import LocalPathClasses

 if TYPE_CHECKING:
+    from pyarrow.dataset import Dataset as PyArrowDataset
     from upath import UPath


-PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather")
+PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc")


-def _is_pyarrow_dataset(path: UPath) -> bool:
-    # it is assumed here that path exists
-    if path.is_file():
-        return path.suffix in PYARROW_SUFFIXES
+def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    if isinstance(paths, list):
+        suffixes = {path.suffix for path in paths}
+    elif paths.is_file():
+        suffixes = {paths.suffix}
     else:
-        objects = path.rglob("*")
-        suffixes = {object.suffix for object in objects if object.suffix != ""}
-        return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
-
-
-def _open_pyarrow_dataset(path: UPath) -> pyarrow.dataset.Dataset:
-    if isinstance(path, LocalPathClasses):
-        path_str, filesystem = path.as_posix(), None
+        suffixes = {path.suffix for path in paths.rglob("*") if path.suffix != ""}
+    return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
+
+
+def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
+    if isinstance(paths, list):
+        path0 = paths[0]
+        if isinstance(path0, LocalPathClasses):
+            paths_str, filesystem = [path.as_posix() for path in paths], None
+        else:
+            paths_str, filesystem = [path.path for path in paths], path0.fs
+    elif isinstance(paths, LocalPathClasses):
+        paths_str, filesystem = paths.as_posix(), None
     else:
-        path_str, filesystem = path.path, path.fs
+        paths_str, filesystem = paths.path, paths.fs

-    return pyarrow.dataset.dataset(path_str, filesystem=filesystem)
+    return pyarrow.dataset.dataset(paths_str, filesystem=filesystem)
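The reworked helpers lean on the fact that `pyarrow.dataset.dataset` accepts a list of file paths on a single filesystem. A small sketch of that underlying behavior with local parquet files (hypothetical file names):

```python
import pyarrow as pa
import pyarrow.dataset
import pyarrow.parquet as pq

# write two small parquet shards (hypothetical files)
pq.write_table(pa.table({"x": [1, 2]}), "shard1.parquet")
pq.write_table(pa.table({"x": [3, 4]}), "shard2.parquet")

# one logical dataset over several files, as _open_pyarrow_dataset now builds for lists
ds = pyarrow.dataset.dataset(["shard1.parquet", "shard2.parquet"], format="parquet")
print(ds.to_table().to_pandas())  # four rows drawn from both shards
```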