lamindb 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/__init__.py CHANGED
@@ -1,12 +1,13 @@
 """A data framework for biology.
 
-Tracking notebooks & scripts.
+Tracking notebooks, scripts & functions.
 
 .. autosummary::
    :toctree: .
 
    track
    finish
+   tracked
 
 Registries.
 
@@ -57,11 +58,12 @@ Backward compatibility.
    :toctree: .
 
    FeatureSet
+   Curator
 
 """
 
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.1.0"
+__version__ = "1.1.1"
 
 from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
 from lamindb_setup._check_setup import _check_instance_setup
lamindb/_artifact.py CHANGED
@@ -152,6 +152,7 @@ def process_data(
     default_storage: Storage,
     using_key: str | None,
     skip_existence_check: bool = False,
+    is_replace: bool = False,
 ) -> tuple[Any, Path | UPath, str, Storage, bool]:
     """Serialize a data object that's provided as file or in memory."""
     # if not overwritten, data gets stored in default storage
@@ -161,14 +162,24 @@ def process_data(
         data_types = (pd.DataFrame, AnnData, MuData)
     else:
         data_types = (pd.DataFrame, AnnData)  # type:ignore
-
+    if key is not None:
+        key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
+        # use suffix as the (adata) format if the format is not provided
+        if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
+            format = key_suffix[1:]
+    else:
+        key_suffix = None
     if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
         access_token = (
             default_storage._access_token
             if hasattr(default_storage, "_access_token")
             else None
         )
-        path = create_path(data, access_token=access_token).resolve()
+        path = create_path(data, access_token=access_token)
+        # we don't resolve http links because they can resolve into a different domain
+        # for example into a temporary url
+        if path.protocol not in {"http", "https"}:
+            path = path.resolve()
         storage, use_existing_storage_key = process_pathlike(
             path,
             default_storage=default_storage,
@@ -180,30 +191,23 @@ def process_data(
     elif isinstance(data, data_types):
         storage = default_storage
         memory_rep = data
-        if key is not None:
-            key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
-            # use suffix as the (adata) format if the format is not provided
-            if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
-                format = key_suffix[1:]
-        else:
-            key_suffix = None
         suffix = infer_suffix(data, format)
-        if key_suffix is not None and key_suffix != suffix:
-            raise InvalidArgument(
-                f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
-                f" be '{suffix}'."
-            )
-        cache_name = f"{provisional_uid}{suffix}"
-        path = settings.cache_dir / cache_name
-        # Alex: I don't understand the line below
-        if path.suffixes == []:
-            path = path.with_suffix(suffix)
-        write_to_disk(data, path)
-        use_existing_storage_key = False
     else:
         raise NotImplementedError(
             f"Do not know how to create a artifact object from {data}, pass a path instead!"
         )
+    if key_suffix is not None and key_suffix != suffix and not is_replace:
+        # consciously omitting a trailing period
+        if isinstance(data, (str, Path, UPath)):
+            message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
+        else:
+            message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
+        raise InvalidArgument(message)
+    # in case we have an in-memory representation, we need to write it to disk
+    if isinstance(data, data_types):
+        path = settings.cache_dir / f"{provisional_uid}{suffix}"
+        write_to_disk(data, path)
+        use_existing_storage_key = False
     return memory_rep, path, suffix, storage, use_existing_storage_key
 
 
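
Net effect of the reorganized checks above: the key-vs-suffix consistency validation now runs for paths as well as in-memory objects, with a direction-aware error message, and replace() is exempted via is_replace. A minimal sketch of the user-facing behavior, assuming a configured instance and hypothetical keys:

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    ln.Artifact.from_df(df, key="data.parquet")  # ok: '.parquet' matches the inferred suffix
    ln.Artifact.from_df(df, key="data.csv")      # raises InvalidArgument: '.csv' vs '.parquet'
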
@@ -321,6 +325,7 @@ def get_artifact_kwargs_from_data(
         default_storage,
         using_key,
         skip_check_exists,
+        is_replace=is_replace,
     )
     stat_or_artifact = get_stat_or_artifact(
         path=path,
@@ -453,7 +458,7 @@ def data_is_anndata(data: AnnData | UPathStr) -> bool:
         return True
     if isinstance(data, (str, Path, UPath)):
         data_path = UPath(data)
-        if data_path.suffix == ".h5ad":
+        if ".h5ad" in data_path.suffixes:  # ".h5ad.gz" is a valid suffix
             return True
         elif data_path.suffix == ".zarr":
             # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
@@ -689,6 +694,7 @@ def from_df(
         kind="dataset",
         **kwargs,
     )
+    artifact.n_observations = len(df)
     return artifact
 
 
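
A short sketch of the new bookkeeping (hypothetical key): the row count is now recorded on the artifact before saving:

    import lamindb as ln
    import pandas as pd

    artifact = ln.Artifact.from_df(pd.DataFrame({"a": [1, 2, 3]}), key="demo.parquet")
    assert artifact.n_observations == 3  # populated from len(df)
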
@@ -973,7 +979,7 @@ inconsistent_state_msg = (
 
 # docstring handled through attach_func_to_class_method
 def open(
-    self, mode: str = "r", is_run_input: bool | None = None
+    self, mode: str = "r", is_run_input: bool | None = None, **kwargs
 ) -> (
     AnnDataAccessor
     | BackedAccessor
@@ -984,16 +990,23 @@ def open(
 ):
     if self._overwrite_versions and not self.is_latest:
         raise ValueError(inconsistent_state_msg)
+    # all hdf5 suffixes including gzipped
+    h5_suffixes = [".h5", ".hdf5", ".h5ad"]
+    h5_suffixes += [s + ".gz" for s in h5_suffixes]
     # ignore empty suffix for now
     suffixes = (
-        "",
-        ".h5",
-        ".hdf5",
-        ".h5ad",
-        ".zarr",
-        ".anndata.zarr",
-        ".tiledbsoma",
-    ) + PYARROW_SUFFIXES
+        (
+            "",
+            ".zarr",
+            ".anndata.zarr",
+            ".tiledbsoma",
+        )
+        + tuple(h5_suffixes)
+        + PYARROW_SUFFIXES
+        + tuple(
+            s + ".gz" for s in PYARROW_SUFFIXES
+        )  # this doesn't work for externally gzipped files, REMOVE LATER
+    )
     if self.suffix not in suffixes:
         raise ValueError(
             "Artifact should have a zarr, h5, tiledbsoma object"
@@ -1011,7 +1024,7 @@ def open(
     using_key = settings._using_key
     filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
     is_tiledbsoma_w = (
-        filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
+        filepath.name == "soma" or self.suffix == ".tiledbsoma"
     ) and mode == "w"
     # consider the case where an object is already locally cached
     localpath = setup_settings.paths.cloud_to_local_no_update(
@@ -1025,14 +1038,14 @@ def open(
     ) and not filepath.synchronize(localpath, just_check=True)
     if open_cache:
         try:
-            access = backed_access(localpath, mode, using_key)
+            access = backed_access(localpath, mode, using_key, **kwargs)
         except Exception as e:
             if isinstance(filepath, LocalPathClasses):
                 raise e
             logger.warning(
                 f"The cache might be corrupted: {e}. Trying to open directly."
             )
-            access = backed_access(filepath, mode, using_key)
+            access = backed_access(filepath, mode, using_key, **kwargs)
             # happens only if backed_access has been successful
             # delete the corrupted cache
             if localpath.is_dir():
@@ -1040,7 +1053,7 @@ def open(
             else:
                 localpath.unlink(missing_ok=True)
     else:
-        access = backed_access(filepath, mode, using_key)
+        access = backed_access(filepath, mode, using_key, **kwargs)
     if is_tiledbsoma_w:
 
         def finalize():
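
The **kwargs threaded through open() above reach backed_access and, from there, the storage openers, e.g. the new compression parameter of the h5py opener further below. A hedged sketch for an hdf5-backed artifact (the artifact itself is hypothetical):

    access = artifact.open()                  # unchanged for existing callers
    access = artifact.open(compression=None)  # forwarded to registry.open("h5py", ...)
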
@@ -1237,6 +1250,7 @@ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
 def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     state_was_adding = self._state.adding
     print_progress = kwargs.pop("print_progress", True)
+    store_kwargs = kwargs.pop("store_kwargs", {})  # kwargs for .upload_from in the end
     access_token = kwargs.pop("access_token", None)
     local_path = None
     if upload and setup_settings.instance.keep_artifacts_local:
@@ -1258,7 +1272,11 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     if "using" in kwargs:
         using_key = kwargs["using"]
     exception_upload = check_and_attempt_upload(
-        self, using_key, access_token=access_token, print_progress=print_progress
+        self,
+        using_key,
+        access_token=access_token,
+        print_progress=print_progress,
+        **store_kwargs,
     )
     if exception_upload is not None:
         # we do not want to raise file not found on cleanup if upload of a file failed
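
A sketch of the new store_kwargs pass-through: the dictionary is handed down to the final .upload_from call, so valid keys depend on the storage backend; the option shown is an assumption, not a documented flag:

    artifact.save(upload=True, store_kwargs={"use_threads": False})  # hypothetical upload option
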
lamindb/_collection.py CHANGED
@@ -273,7 +273,7 @@ def mapped(
     else:
         artifacts = self.ordered_artifacts.all()
     for artifact in artifacts:
-        if artifact.suffix not in {".h5ad", ".zarr"}:
+        if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
             logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
             continue
         elif not stream:
lamindb/_feature.py CHANGED
@@ -218,7 +218,7 @@ def __init__(self, *args, **kwargs):
         return None
     dtype = kwargs.get("dtype", None)
     default_value = kwargs.pop("default_value", None)
-    nullable = kwargs.pop("nullable", None)
+    nullable = kwargs.pop("nullable", True)  # default value of nullable
     cat_filters = kwargs.pop("cat_filters", None)
     kwargs = process_init_feature_param(args, kwargs)
     super(Feature, self).__init__(*args, **kwargs)
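
With the default flipped to True, features are now nullable unless stated otherwise; a brief sketch:

    import lamindb as ln

    ln.Feature(name="perturbation", dtype="cat")                  # nullable by default
    ln.Feature(name="perturbation", dtype="cat", nullable=False)  # opt in to required values
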
lamindb/_finish.py CHANGED
@@ -436,7 +436,15 @@ def save_context_core(
     # save both run & transform records if we arrive here
     if run is not None:
         run.save()
-    transform.save()
+    transform_id_prior_to_save = transform.id
+    transform.save()  # this in-place updates the state of transform upon hash collision
+    if transform.id != transform_id_prior_to_save:
+        # the hash existed and we're actually back to the previous version;
+        # hence, this was in fact a run of the previous transform rather than of
+        # the new transform
+        # this can happen in interactive notebooks if the user makes no change to the notebook
+        run.transform = transform
+        run.save()
 
     # finalize
     if not from_cli and run is not None:
lamindb/_query_set.py CHANGED
@@ -214,10 +214,27 @@ def get(
     else:
         assert idlike is None  # noqa: S101
         expressions = process_expressions(qs, expressions)
+        # don't want _branch_code here in .get(), only in .filter()
+        expressions.pop("_branch_code", None)
         # inject is_latest for consistency with idlike
-        if issubclass(registry, IsVersioned) and "is_latest" not in expressions:
+        is_latest_was_not_in_expressions = "is_latest" not in expressions
+        if issubclass(registry, IsVersioned) and is_latest_was_not_in_expressions:
             expressions["is_latest"] = True
-        return registry.objects.using(qs.db).get(**expressions)
+        try:
+            return registry.objects.using(qs.db).get(**expressions)
+        except registry.DoesNotExist:
+            # handle the case in which the is_latest injection led to a missed query
+            if "is_latest" in expressions and is_latest_was_not_in_expressions:
+                expressions.pop("is_latest")
+                result = (
+                    registry.objects.using(qs.db)
+                    .filter(**expressions)
+                    .order_by("-created_at")
+                    .first()
+                )
+                if result is not None:
+                    return result
+            raise registry.DoesNotExist from registry.DoesNotExist
 
 
 class RecordList(UserList, Generic[T]):
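
What the fallback buys: a .get() that previously raised because the only matching record is not the latest version now returns the newest matching record instead (hypothetical key):

    artifact = ln.Artifact.get(key="data.parquet")
    # before: DoesNotExist if the record matching the key has is_latest=False
    # now: falls back to .filter(...).order_by("-created_at").first()
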
@@ -641,11 +658,12 @@ class QuerySet(models.QuerySet):
                 and value.strip("-").isalpha()
                 and "__" not in field
                 and hasattr(self.model, field)
-                and getattr(self.model, field).field.related_model
             ):
-                raise FieldError(
-                    f"Invalid lookup '{value}' for {field}. Did you mean {field}__name?"
-                )
+                field_attr = getattr(self.model, field)
+                if hasattr(field_attr, "field") and field_attr.field.related_model:
+                    raise FieldError(
+                        f"Invalid lookup '{value}' for {field}. Did you mean {field}__name?"
+                    )
 
         expressions = process_expressions(self, expressions)
         if len(expressions) > 0:
lamindb/_record.py CHANGED
@@ -248,11 +248,10 @@ def __init__(record: Record, *args, **kwargs):
             f" {name_field}{version_comment}: '{kwargs[name_field]}'"
         )
         if isinstance(record, Schema):
-            if Artifact.filter(schema=record).exists():
-                if record.hash != kwargs["hash"]:
-                    raise ValueError(
-                        "Schema is already in use, can't be changed."
-                    )
+            if existing_record.hash != kwargs["hash"]:
+                raise ValueError(
+                    f"Schema name is already in use by schema with uid '{existing_record.uid}', please choose a different name."
+                )
         init_self_from_db(record, existing_record)
         update_attributes(record, kwargs)
         return None
lamindb/_save.py CHANGED
@@ -133,7 +133,9 @@ def check_and_attempt_upload(
     using_key: str | None = None,
     access_token: str | None = None,
     print_progress: bool = True,
+    **kwargs,
 ) -> Exception | None:
+    # kwargs are propagated to .upload_from in the end
     # if Artifact object is either newly instantiated or replace() was called on
     # a local env it will have a _local_filepath and needs to be uploaded
     if hasattr(artifact, "_local_filepath"):
@@ -143,6 +145,7 @@ def check_and_attempt_upload(
             using_key,
             access_token=access_token,
             print_progress=print_progress,
+            **kwargs,
         )
     except Exception as exception:
         logger.warning(f"could not upload artifact: {artifact}")
@@ -316,8 +319,10 @@ def upload_artifact(
     using_key: str | None = None,
     access_token: str | None = None,
     print_progress: bool = True,
+    **kwargs,
 ) -> tuple[UPath, UPath | None]:
     """Store and add file and its linked entries."""
+    # kwargs are propagated to .upload_from in the end
     # can't currently use filepath_from_artifact here because it resolves to ._local_filepath
     storage_key = auto_storage_key_from_artifact(artifact)
     storage_path, storage_settings = attempt_accessing_path(
@@ -326,7 +331,10 @@ def upload_artifact(
     if hasattr(artifact, "_to_store") and artifact._to_store:
         logger.save(f"storing artifact '{artifact.uid}' at '{storage_path}'")
         store_file_or_folder(
-            artifact._local_filepath, storage_path, print_progress=print_progress
+            artifact._local_filepath,
+            storage_path,
+            print_progress=print_progress,
+            **kwargs,
         )
 
     if isinstance(storage_path, LocalPathClasses):
lamindb/_tracked.py CHANGED
@@ -26,10 +26,33 @@ def get_current_tracked_run() -> Run | None:
 
 
 def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]:
-    """Decorator that tracks function execution.
+    """Mark a function as tracked with this decorator.
+
+    You will be able to see inputs, outputs, and parameters of the function in the data lineage graph.
+
+    Guide: :doc:`/track`
+
+    .. versionadded:: 1.1.0
+        This is still in beta and will be refined in future releases.
 
     Args:
-        uid: Optional unique identifier for the transform
+        uid: Persist the uid to identify this transform across renames.
+
+    Example::
+
+        import lamindb as ln
+
+        @ln.tracked()
+        def subset_dataframe(
+            input_artifact_key: str,  # all arguments tracked as parameters of the function run
+            output_artifact_key: str,
+            subset_rows: int = 2,
+            subset_cols: int = 2,
+        ) -> None:
+            artifact = ln.Artifact.get(key=input_artifact_key)
+            df = artifact.load()  # auto-tracked as input
+            new_df = df.iloc[:subset_rows, :subset_cols]
+            ln.Artifact.from_df(new_df, key=output_artifact_key).save()  # auto-tracked as output
     """
 
     def decorator_tracked(func: Callable[P, R]) -> Callable[P, R]:
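
A usage follow-up to the docstring example above: calling the decorated function (with hypothetical artifact keys) creates a function-level transform plus a run that captures the arguments as parameters:

    subset_dataframe(
        "my_datasets/dataset1.parquet",            # tracked as input_artifact_key
        "my_datasets/dataset1_subsetted.parquet",  # tracked as output_artifact_key
    )
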
lamindb/base/users.py CHANGED
@@ -12,12 +12,9 @@ def current_user_id() -> int:
     if ln_setup.core.django.IS_MIGRATING:
         return 1
     else:
-        exc_attr = (
-            "DoesNotExist" if hasattr(User, "DoesNotExist") else "_DoesNotExist"
-        )
         try:
             user_id = User.objects.get(uid=settings.user.uid).id
-        except getattr(User, exc_attr):
+        except User.DoesNotExist:
             register_user(settings.user)
             user_id = User.objects.get(uid=settings.user.uid).id
         return user_id
lamindb/core/_context.py CHANGED
@@ -13,6 +13,7 @@ from typing import TYPE_CHECKING
 import lamindb_setup as ln_setup
 from django.db.models import Func, IntegerField
 from lamin_utils import logger
+from lamindb_setup.core import deprecated
 from lamindb_setup.core.hashing import hash_file
 
 from lamindb.base import ids
@@ -217,8 +218,8 @@ class Context:
         self._description = value
 
     @property
+    @deprecated(new_name="description")
     def name(self) -> str | None:
-        """Deprecated. Populates `description` argument for `context.transform`."""
         return self._description
 
     @name.setter
@@ -257,7 +258,7 @@ class Context:
         path: str | None = None,
         log_to_file: bool | None = None,
     ) -> None:
-        """Initiate a run with tracked data lineage.
+        """Track a global run of your Python session.
 
         - sets :attr:`~lamindb.core.Context.transform` &
           :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
@@ -284,6 +285,10 @@ class Context:
 
         >>> ln.track()
 
+        If you want to ensure a single version history across renames of the notebook or script, pass the auto-generated `uid` that you'll find in the logs:
+
+        >>> ln.track("Onv04I53OgtT0000")  # example uid, the last four characters encode the version of the transform
+
         """
         self._logging_message_track = ""
         self._logging_message_imports = ""
@@ -27,7 +27,8 @@ if TYPE_CHECKING:
 class _Connect:
     def __init__(self, storage):
         if isinstance(storage, UPath):
-            self.conn, self.store = registry.open("h5py", storage)
+            # force no external compression even for files with .gz extension. REMOVE LATER
+            self.conn, self.store = registry.open("h5py", storage, compression=None)
             self.to_close = True
         else:
             self.conn, self.store = None, storage
@@ -246,7 +247,8 @@ class MappedCollection:
             if parallel:
                 conn, storage = None, path
             else:
-                conn, storage = registry.open("h5py", path)
+                # force no external compression even for files with .gz extension. REMOVE LATER
+                conn, storage = registry.open("h5py", path, compression=None)
         else:
             conn, storage = registry.open("zarr", path)
         self.conns.append(conn)
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import subprocess
+import sys
 from typing import TYPE_CHECKING
 
 import lamindb_setup as ln_setup
@@ -17,7 +18,7 @@ def track_environment(run: Run) -> None:
     try:
         with open(filepath, "w") as f:
             result = subprocess.run(
-                ["pip", "freeze"],
+                [sys.executable, "-m", "pip", "freeze"],
                 stdout=f,
             )
     except OSError as e:
@@ -23,7 +23,7 @@ def small_dataset1(
         var_ids[0]: [1, 2, 3],
         var_ids[1]: [3, 4, 5],
         var_ids[2]: [5, 6, 7],
-        "cell_medium": pd.Categorical(["DMSO", ifng, "DMSO"]),
+        "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
         "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
@@ -60,7 +60,7 @@ def small_dataset2(
         var_ids[0]: [2, 3, 3],
         var_ids[1]: [3, 4, 5],
         var_ids[2]: [4, 2, 3],
-        "cell_medium": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     metadata = {
@@ -74,7 +74,7 @@ def small_dataset2(
         )
         ad.AnnData(
             dataset_df[var_ids],
-            obs=dataset_df[["cell_medium", "cell_type_by_model"]],
+            obs=dataset_df[["perturbation", "cell_type_by_model"]],
         )
     if otype == "DataFrame":
         for key, value in metadata.items():
lamindb/core/loaders.py CHANGED
@@ -65,8 +65,8 @@ def load_tsv(path: UPathStr, **kwargs) -> pd.DataFrame:
 def load_h5ad(filepath, **kwargs) -> ad.AnnData:
     """Load an `.h5ad` file to `AnnData`."""
     fs, filepath = infer_filesystem(filepath)
-
-    with fs.open(filepath, mode="rb") as file:
+    compression = kwargs.pop("compression", "infer")
+    with fs.open(filepath, mode="rb", compression=compression) as file:
         adata = ad.read_h5ad(file, backed=False, **kwargs)
         return adata
 
@@ -148,9 +148,13 @@ def load_rds(path: UPathStr) -> UPathStr:
 
 FILE_LOADERS = {
     ".csv": pd.read_csv,
+    ".csv.gz": pd.read_csv,
     ".tsv": load_tsv,
+    ".tsv.gz": load_tsv,
     ".h5ad": load_h5ad,
+    ".h5ad.gz": load_h5ad,
     ".parquet": pd.read_parquet,
+    ".parquet.gz": pd.read_parquet,  # this doesn't work for externally gzipped files, REMOVE LATER
     ".fcs": load_fcs,
     ".zarr": load_anndata_zarr,
     ".html": load_html,
@@ -177,7 +181,15 @@ def load_to_memory(filepath: UPathStr, **kwargs):
 
     filepath = settings._storage_settings.cloud_to_local(filepath, print_progress=True)
 
-    loader = FILE_LOADERS.get(filepath.suffix)
+    # infer the correct suffix when .gz is present
+    suffixes = filepath.suffixes
+    suffix = (
+        "".join(suffixes[-2:])
+        if len(suffixes) > 1 and ".gz" in suffixes
+        else filepath.suffix
+    )
+
+    loader = FILE_LOADERS.get(suffix)
     if loader is None:
         return filepath
     else:
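
The suffix reconstruction in plain Python, showing how a gzipped file now resolves to a compound key that exists in FILE_LOADERS:

    from pathlib import PurePosixPath

    suffixes = PurePosixPath("table.csv.gz").suffixes  # ['.csv', '.gz']
    suffix = "".join(suffixes[-2:]) if len(suffixes) > 1 and ".gz" in suffixes else suffixes[-1]
    assert suffix == ".csv.gz"  # hits the new ".csv.gz" loader entry
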
@@ -16,6 +16,7 @@ from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
 from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
 from anndata.compat import _read_attr
 from fsspec.implementations.local import LocalFileSystem
+from fsspec.utils import infer_compression
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
@@ -152,9 +153,13 @@ registry = AccessRegistry()
 
 
 @registry.register_open("h5py")
-def open(filepath: UPathStr, mode: str = "r"):
+def open(filepath: UPathStr, mode: str = "r", compression: str | None = "infer"):
     fs, file_path_str = infer_filesystem(filepath)
-    if isinstance(fs, LocalFileSystem):
+    # we don't open compressed files directly because we need fsspec to uncompress on .open
+    compression = (
+        infer_compression(file_path_str) if compression == "infer" else compression
+    )
+    if isinstance(fs, LocalFileSystem) and compression is None:
         assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  # noqa: S101
         return None, h5py.File(file_path_str, mode=mode)
     if mode == "r":
@@ -165,7 +170,7 @@ def open(filepath: UPathStr, mode: str = "r"):
         conn_mode = "ab"
     else:
         raise ValueError(f"Unknown mode {mode}! Should be 'r', 'w' or 'a'.")
-    conn = fs.open(file_path_str, mode=conn_mode)
+    conn = fs.open(file_path_str, mode=conn_mode, compression=compression)
     try:
         storage = h5py.File(conn, mode=mode)
     except Exception as e:
@@ -70,6 +70,7 @@ def backed_access(
     artifact_or_filepath: Artifact | UPath,
     mode: str = "r",
     using_key: str | None = None,
+    **kwargs,
 ) -> (
     AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
 ):
@@ -80,18 +81,22 @@ def backed_access(
     else:
         objectpath = artifact_or_filepath
     name = objectpath.name
-    suffix = objectpath.suffix
+    # ignore .gz, only check the real suffix
+    suffixes = objectpath.suffixes
+    suffix = (
+        suffixes[-2] if len(suffixes) > 1 and ".gz" in suffixes else objectpath.suffix
+    )
 
     if name == "soma" or suffix == ".tiledbsoma":
         if mode not in {"r", "w"}:
             raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.")
-        return _open_tiledbsoma(objectpath, mode=mode)  # type: ignore
+        return _open_tiledbsoma(objectpath, mode=mode, **kwargs)  # type: ignore
     elif suffix in {".h5", ".hdf5", ".h5ad"}:
-        conn, storage = registry.open("h5py", objectpath, mode=mode)
+        conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
-        conn, storage = registry.open("zarr", objectpath, mode=mode)
+        conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
     elif _is_pyarrow_dataset(objectpath):
-        return _open_pyarrow_dataset(objectpath)
+        return _open_pyarrow_dataset(objectpath, **kwargs)
     else:
         raise ValueError(
             "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "