lamindb 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lamindb/__init__.py CHANGED
@@ -1,12 +1,13 @@
 """A data framework for biology.
 
-Tracking notebooks & scripts.
+Tracking notebooks, scripts & functions.
 
 .. autosummary::
    :toctree: .
 
    track
    finish
+   tracked
 
 Registries.
 
@@ -57,11 +58,12 @@ Backward compatibility.
    :toctree: .
 
    FeatureSet
+   Curator
 
 """
 
 # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
-__version__ = "1.1.0"
+__version__ = "1.1.1"
 
 from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
 from lamindb_setup._check_setup import _check_instance_setup
lamindb/_artifact.py CHANGED
@@ -152,6 +152,7 @@ def process_data(
     default_storage: Storage,
     using_key: str | None,
     skip_existence_check: bool = False,
+    is_replace: bool = False,
 ) -> tuple[Any, Path | UPath, str, Storage, bool]:
     """Serialize a data object that's provided as file or in memory."""
     # if not overwritten, data gets stored in default storage
@@ -161,14 +162,24 @@ def process_data(
         data_types = (pd.DataFrame, AnnData, MuData)
     else:
         data_types = (pd.DataFrame, AnnData)  # type:ignore
-
+    if key is not None:
+        key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
+        # use suffix as the (adata) format if the format is not provided
+        if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
+            format = key_suffix[1:]
+    else:
+        key_suffix = None
     if isinstance(data, (str, Path, UPath)):  # UPathStr, spelled out
         access_token = (
             default_storage._access_token
             if hasattr(default_storage, "_access_token")
             else None
         )
-        path = create_path(data, access_token=access_token).resolve()
+        path = create_path(data, access_token=access_token)
+        # we don't resolve http links because they can resolve into a different domain
+        # for example into a temporary url
+        if path.protocol not in {"http", "https"}:
+            path = path.resolve()
         storage, use_existing_storage_key = process_pathlike(
             path,
             default_storage=default_storage,
@@ -180,30 +191,23 @@ def process_data(
     elif isinstance(data, data_types):
         storage = default_storage
         memory_rep = data
-        if key is not None:
-            key_suffix = extract_suffix_from_path(PurePosixPath(key), arg_name="key")
-            # use suffix as the (adata) format if the format is not provided
-            if isinstance(data, AnnData) and format is None and len(key_suffix) > 0:
-                format = key_suffix[1:]
-        else:
-            key_suffix = None
         suffix = infer_suffix(data, format)
-        if key_suffix is not None and key_suffix != suffix:
-            raise InvalidArgument(
-                f"The suffix '{key_suffix}' of the provided key is incorrect, it should"
-                f" be '{suffix}'."
-            )
-        cache_name = f"{provisional_uid}{suffix}"
-        path = settings.cache_dir / cache_name
-        # Alex: I don't understand the line below
-        if path.suffixes == []:
-            path = path.with_suffix(suffix)
-        write_to_disk(data, path)
-        use_existing_storage_key = False
     else:
         raise NotImplementedError(
             f"Do not know how to create a artifact object from {data}, pass a path instead!"
         )
+    if key_suffix is not None and key_suffix != suffix and not is_replace:
+        # consciously omitting a trailing period
+        if isinstance(data, (str, Path, UPath)):
+            message = f"The suffix '{suffix}' of the provided path is inconsistent, it should be '{key_suffix}'"
+        else:
+            message = f"The suffix '{key_suffix}' of the provided key is inconsistent, it should be '{suffix}'"
+        raise InvalidArgument(message)
+    # in case we have an in-memory representation, we need to write it to disk
+    if isinstance(data, data_types):
+        path = settings.cache_dir / f"{provisional_uid}{suffix}"
+        write_to_disk(data, path)
+        use_existing_storage_key = False
     return memory_rep, path, suffix, storage, use_existing_storage_key
 
 
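
Net effect of the reorganized checks above: the key-vs-suffix consistency validation now runs for paths as well as in-memory objects, with a direction-aware error message, and replace() is exempted via is_replace. A minimal sketch of the user-facing behavior, assuming a configured instance and hypothetical keys:

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    ln.Artifact.from_df(df, key="data.parquet")  # ok: '.parquet' matches the inferred suffix
    ln.Artifact.from_df(df, key="data.csv")      # raises InvalidArgument: '.csv' vs '.parquet'
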
@@ -321,6 +325,7 @@ def get_artifact_kwargs_from_data(
         default_storage,
         using_key,
         skip_check_exists,
+        is_replace=is_replace,
     )
     stat_or_artifact = get_stat_or_artifact(
         path=path,
@@ -453,7 +458,7 @@ def data_is_anndata(data: AnnData | UPathStr) -> bool:
         return True
     if isinstance(data, (str, Path, UPath)):
         data_path = UPath(data)
-        if data_path.suffix == ".h5ad":
+        if ".h5ad" in data_path.suffixes:  # ".h5ad.gz" is a valid suffix
             return True
         elif data_path.suffix == ".zarr":
             # ".anndata.zarr" is a valid suffix (core.storage._valid_suffixes)
@@ -689,6 +694,7 @@ def from_df(
         kind="dataset",
         **kwargs,
     )
+    artifact.n_observations = len(df)
     return artifact
 
 
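
A short sketch of the new bookkeeping (hypothetical key): the row count is now recorded on the artifact before saving:

    import lamindb as ln
    import pandas as pd

    artifact = ln.Artifact.from_df(pd.DataFrame({"a": [1, 2, 3]}), key="demo.parquet")
    assert artifact.n_observations == 3  # populated from len(df)
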
@@ -973,7 +979,7 @@ inconsistent_state_msg = (
 
 # docstring handled through attach_func_to_class_method
 def open(
-    self, mode: str = "r", is_run_input: bool | None = None
+    self, mode: str = "r", is_run_input: bool | None = None, **kwargs
 ) -> (
     AnnDataAccessor
     | BackedAccessor
@@ -984,16 +990,23 @@ def open(
 ):
     if self._overwrite_versions and not self.is_latest:
         raise ValueError(inconsistent_state_msg)
+    # all hdf5 suffixes including gzipped
+    h5_suffixes = [".h5", ".hdf5", ".h5ad"]
+    h5_suffixes += [s + ".gz" for s in h5_suffixes]
     # ignore empty suffix for now
     suffixes = (
-        "",
-        ".h5",
-        ".hdf5",
-        ".h5ad",
-        ".zarr",
-        ".anndata.zarr",
-        ".tiledbsoma",
-    ) + PYARROW_SUFFIXES
+        (
+            "",
+            ".zarr",
+            ".anndata.zarr",
+            ".tiledbsoma",
+        )
+        + tuple(h5_suffixes)
+        + PYARROW_SUFFIXES
+        + tuple(
+            s + ".gz" for s in PYARROW_SUFFIXES
+        )  # this doesn't work for externally gzipped files, REMOVE LATER
+    )
     if self.suffix not in suffixes:
         raise ValueError(
             "Artifact should have a zarr, h5, tiledbsoma object"
@@ -1011,7 +1024,7 @@ def open(
     using_key = settings._using_key
     filepath, cache_key = filepath_cache_key_from_artifact(self, using_key=using_key)
     is_tiledbsoma_w = (
-        filepath.name == "soma" or filepath.suffix == ".tiledbsoma"
+        filepath.name == "soma" or self.suffix == ".tiledbsoma"
     ) and mode == "w"
     # consider the case where an object is already locally cached
     localpath = setup_settings.paths.cloud_to_local_no_update(
@@ -1025,14 +1038,14 @@ def open(
     ) and not filepath.synchronize(localpath, just_check=True)
     if open_cache:
         try:
-            access = backed_access(localpath, mode, using_key)
+            access = backed_access(localpath, mode, using_key, **kwargs)
         except Exception as e:
             if isinstance(filepath, LocalPathClasses):
                 raise e
             logger.warning(
                 f"The cache might be corrupted: {e}. Trying to open directly."
             )
-            access = backed_access(filepath, mode, using_key)
+            access = backed_access(filepath, mode, using_key, **kwargs)
             # happens only if backed_access has been successful
             # delete the corrupted cache
             if localpath.is_dir():
@@ -1040,7 +1053,7 @@ def open(
             else:
                 localpath.unlink(missing_ok=True)
     else:
-        access = backed_access(filepath, mode, using_key)
+        access = backed_access(filepath, mode, using_key, **kwargs)
     if is_tiledbsoma_w:
 
         def finalize():
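
The **kwargs threaded through open() above reach backed_access and, from there, the storage openers, e.g. the new compression parameter of the h5py opener further below. A hedged sketch for an hdf5-backed artifact (the artifact itself is hypothetical):

    access = artifact.open()                  # unchanged for existing callers
    access = artifact.open(compression=None)  # forwarded to registry.open("h5py", ...)
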
@@ -1237,6 +1250,7 @@ def _delete_skip_storage(artifact, *args, **kwargs) -> None:
 def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     state_was_adding = self._state.adding
     print_progress = kwargs.pop("print_progress", True)
+    store_kwargs = kwargs.pop("store_kwargs", {})  # kwargs for .upload_from in the end
     access_token = kwargs.pop("access_token", None)
     local_path = None
     if upload and setup_settings.instance.keep_artifacts_local:
@@ -1258,7 +1272,11 @@ def save(self, upload: bool | None = None, **kwargs) -> Artifact:
     if "using" in kwargs:
         using_key = kwargs["using"]
     exception_upload = check_and_attempt_upload(
-        self, using_key, access_token=access_token, print_progress=print_progress
+        self,
+        using_key,
+        access_token=access_token,
+        print_progress=print_progress,
+        **store_kwargs,
     )
     if exception_upload is not None:
         # we do not want to raise file not found on cleanup if upload of a file failed
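
A sketch of the new store_kwargs pass-through: the dictionary is handed down to the final .upload_from call, so valid keys depend on the storage backend; the option shown is an assumption, not a documented flag:

    artifact.save(upload=True, store_kwargs={"use_threads": False})  # hypothetical upload option
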
lamindb/_collection.py CHANGED
@@ -273,7 +273,7 @@ def mapped(
     else:
         artifacts = self.ordered_artifacts.all()
     for artifact in artifacts:
-        if artifact.suffix not in {".h5ad", ".zarr"}:
+        if ".h5ad" not in artifact.suffix and ".zarr" not in artifact.suffix:
             logger.warning(f"ignoring artifact with suffix {artifact.suffix}")
             continue
         elif not stream:
lamindb/_feature.py CHANGED
@@ -218,7 +218,7 @@ def __init__(self, *args, **kwargs):
         return None
     dtype = kwargs.get("dtype", None)
     default_value = kwargs.pop("default_value", None)
-    nullable = kwargs.pop("nullable", None)
+    nullable = kwargs.pop("nullable", True)  # default value of nullable
     cat_filters = kwargs.pop("cat_filters", None)
     kwargs = process_init_feature_param(args, kwargs)
     super(Feature, self).__init__(*args, **kwargs)
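
With the default flipped to True, features are now nullable unless stated otherwise; a brief sketch:

    import lamindb as ln

    ln.Feature(name="perturbation", dtype="cat")                  # nullable by default
    ln.Feature(name="perturbation", dtype="cat", nullable=False)  # opt in to required values
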
lamindb/_finish.py CHANGED
@@ -436,7 +436,15 @@ def save_context_core(
     # save both run & transform records if we arrive here
     if run is not None:
         run.save()
-    transform.save()
+    transform_id_prior_to_save = transform.id
+    transform.save()  # this in-place updates the state of transform upon hash collision
+    if transform.id != transform_id_prior_to_save:
+        # the hash existed and we're actually back to the previous version;
+        # hence, this was in fact a run of the previous transform rather than of
+        # the new transform
+        # this can happen in interactive notebooks if the user makes no change to the notebook
+        run.transform = transform
+        run.save()
 
     # finalize
     if not from_cli and run is not None:
lamindb/_query_set.py CHANGED
@@ -214,10 +214,27 @@ def get(
     else:
         assert idlike is None  # noqa: S101
         expressions = process_expressions(qs, expressions)
+        # don't want _branch_code here in .get(), only in .filter()
+        expressions.pop("_branch_code", None)
         # inject is_latest for consistency with idlike
-        if issubclass(registry, IsVersioned) and "is_latest" not in expressions:
+        is_latest_was_not_in_expressions = "is_latest" not in expressions
+        if issubclass(registry, IsVersioned) and is_latest_was_not_in_expressions:
             expressions["is_latest"] = True
-        return registry.objects.using(qs.db).get(**expressions)
+        try:
+            return registry.objects.using(qs.db).get(**expressions)
+        except registry.DoesNotExist:
+            # handle the case in which the is_latest injection led to a missed query
+            if "is_latest" in expressions and is_latest_was_not_in_expressions:
+                expressions.pop("is_latest")
+                result = (
+                    registry.objects.using(qs.db)
+                    .filter(**expressions)
+                    .order_by("-created_at")
+                    .first()
+                )
+                if result is not None:
+                    return result
+            raise registry.DoesNotExist from registry.DoesNotExist
 
 
 class RecordList(UserList, Generic[T]):
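
What the fallback buys: a .get() that previously raised because the only matching record is not the latest version now returns the newest matching record instead (hypothetical key):

    artifact = ln.Artifact.get(key="data.parquet")
    # before: DoesNotExist if the record matching the key has is_latest=False
    # now: falls back to .filter(...).order_by("-created_at").first()
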
@@ -641,11 +658,12 @@ class QuerySet(models.QuerySet):
                 and value.strip("-").isalpha()
                 and "__" not in field
                 and hasattr(self.model, field)
-                and getattr(self.model, field).field.related_model
             ):
-                raise FieldError(
-                    f"Invalid lookup '{value}' for {field}. Did you mean {field}__name?"
-                )
+                field_attr = getattr(self.model, field)
+                if hasattr(field_attr, "field") and field_attr.field.related_model:
+                    raise FieldError(
+                        f"Invalid lookup '{value}' for {field}. Did you mean {field}__name?"
+                    )
 
         expressions = process_expressions(self, expressions)
         if len(expressions) > 0:
lamindb/_record.py CHANGED
@@ -248,11 +248,10 @@ def __init__(record: Record, *args, **kwargs):
             f" {name_field}{version_comment}: '{kwargs[name_field]}'"
         )
         if isinstance(record, Schema):
-            if Artifact.filter(schema=record).exists():
-                if record.hash != kwargs["hash"]:
-                    raise ValueError(
-                        "Schema is already in use, can't be changed."
-                    )
+            if existing_record.hash != kwargs["hash"]:
+                raise ValueError(
+                    f"Schema name is already in use by schema with uid '{existing_record.uid}', please choose a different name."
+                )
         init_self_from_db(record, existing_record)
         update_attributes(record, kwargs)
         return None
lamindb/_save.py CHANGED
@@ -133,7 +133,9 @@ def check_and_attempt_upload(
     using_key: str | None = None,
     access_token: str | None = None,
     print_progress: bool = True,
+    **kwargs,
 ) -> Exception | None:
+    # kwargs are propagated to .upload_from in the end
     # if Artifact object is either newly instantiated or replace() was called on
     # a local env it will have a _local_filepath and needs to be uploaded
     if hasattr(artifact, "_local_filepath"):
@@ -143,6 +145,7 @@ def check_and_attempt_upload(
             using_key,
             access_token=access_token,
             print_progress=print_progress,
+            **kwargs,
         )
     except Exception as exception:
         logger.warning(f"could not upload artifact: {artifact}")
@@ -316,8 +319,10 @@ def upload_artifact(
     using_key: str | None = None,
     access_token: str | None = None,
     print_progress: bool = True,
+    **kwargs,
 ) -> tuple[UPath, UPath | None]:
     """Store and add file and its linked entries."""
+    # kwargs are propagated to .upload_from in the end
     # can't currently use filepath_from_artifact here because it resolves to ._local_filepath
     storage_key = auto_storage_key_from_artifact(artifact)
     storage_path, storage_settings = attempt_accessing_path(
@@ -326,7 +331,10 @@ def upload_artifact(
     if hasattr(artifact, "_to_store") and artifact._to_store:
         logger.save(f"storing artifact '{artifact.uid}' at '{storage_path}'")
         store_file_or_folder(
-            artifact._local_filepath, storage_path, print_progress=print_progress
+            artifact._local_filepath,
+            storage_path,
+            print_progress=print_progress,
+            **kwargs,
         )
 
     if isinstance(storage_path, LocalPathClasses):
lamindb/_tracked.py CHANGED
@@ -26,10 +26,33 @@ def get_current_tracked_run() -> Run | None:
 
 
 def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]:
-    """Decorator that tracks function execution.
+    """Mark a function as tracked with this decorator.
+
+    You will be able to see inputs, outputs, and parameters of the function in the data lineage graph.
+
+    Guide: :doc:`/track`
+
+    .. versionadded:: 1.1.0
+        This is still in beta and will be refined in future releases.
 
     Args:
-        uid: Optional unique identifier for the transform
+        uid: Persist the uid to identify this transform across renames.
+
+    Example::
+
+        import lamindb as ln
+
+        @ln.tracked()
+        def subset_dataframe(
+            input_artifact_key: str,  # all arguments tracked as parameters of the function run
+            output_artifact_key: str,
+            subset_rows: int = 2,
+            subset_cols: int = 2,
+        ) -> None:
+            artifact = ln.Artifact.get(key=input_artifact_key)
+            df = artifact.load()  # auto-tracked as input
+            new_df = df.iloc[:subset_rows, :subset_cols]
+            ln.Artifact.from_df(new_df, key=output_artifact_key).save()  # auto-tracked as output
     """
 
     def decorator_tracked(func: Callable[P, R]) -> Callable[P, R]:
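
A usage follow-up to the docstring example above: calling the decorated function (with hypothetical artifact keys) creates a function-level transform plus a run that captures the arguments as parameters:

    subset_dataframe(
        "my_datasets/dataset1.parquet",            # tracked as input_artifact_key
        "my_datasets/dataset1_subsetted.parquet",  # tracked as output_artifact_key
    )
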
lamindb/base/users.py CHANGED
@@ -12,12 +12,9 @@ def current_user_id() -> int:
     if ln_setup.core.django.IS_MIGRATING:
         return 1
     else:
-        exc_attr = (
-            "DoesNotExist" if hasattr(User, "DoesNotExist") else "_DoesNotExist"
-        )
         try:
             user_id = User.objects.get(uid=settings.user.uid).id
-        except getattr(User, exc_attr):
+        except User.DoesNotExist:
             register_user(settings.user)
             user_id = User.objects.get(uid=settings.user.uid).id
         return user_id
lamindb/core/_context.py CHANGED
@@ -13,6 +13,7 @@ from typing import TYPE_CHECKING
 import lamindb_setup as ln_setup
 from django.db.models import Func, IntegerField
 from lamin_utils import logger
+from lamindb_setup.core import deprecated
 from lamindb_setup.core.hashing import hash_file
 
 from lamindb.base import ids
@@ -217,8 +218,8 @@ class Context:
         self._description = value
 
     @property
+    @deprecated(new_name="description")
     def name(self) -> str | None:
-        """Deprecated. Populates `description` argument for `context.transform`."""
         return self._description
 
     @name.setter
@@ -257,7 +258,7 @@ class Context:
         path: str | None = None,
         log_to_file: bool | None = None,
     ) -> None:
-        """Initiate a run with tracked data lineage.
+        """Track a global run of your Python session.
 
         - sets :attr:`~lamindb.core.Context.transform` &
           :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
@@ -284,6 +285,10 @@ class Context:
 
         >>> ln.track()
 
+        If you want to ensure a single version history across renames of the notebook or script, pass the auto-generated `uid` that you'll find in the logs:
+
+        >>> ln.track("Onv04I53OgtT0000")  # example uid, the last four characters encode the version of the transform
+
         """
         self._logging_message_track = ""
         self._logging_message_imports = ""
@@ -27,7 +27,8 @@ if TYPE_CHECKING:
 class _Connect:
     def __init__(self, storage):
         if isinstance(storage, UPath):
-            self.conn, self.store = registry.open("h5py", storage)
+            # force no external compression even for files with .gz extension. REMOVE LATER
+            self.conn, self.store = registry.open("h5py", storage, compression=None)
             self.to_close = True
         else:
             self.conn, self.store = None, storage
@@ -246,7 +247,8 @@ class MappedCollection:
             if parallel:
                 conn, storage = None, path
             else:
-                conn, storage = registry.open("h5py", path)
+                # force no external compression even for files with .gz extension. REMOVE LATER
+                conn, storage = registry.open("h5py", path, compression=None)
         else:
             conn, storage = registry.open("zarr", path)
         self.conns.append(conn)
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import subprocess
+import sys
 from typing import TYPE_CHECKING
 
 import lamindb_setup as ln_setup
@@ -17,7 +18,7 @@ def track_environment(run: Run) -> None:
     try:
         with open(filepath, "w") as f:
             result = subprocess.run(
-                ["pip", "freeze"],
+                [sys.executable, "-m", "pip", "freeze"],
                 stdout=f,
             )
     except OSError as e:
@@ -23,7 +23,7 @@ def small_dataset1(
         var_ids[0]: [1, 2, 3],
         var_ids[1]: [3, 4, 5],
         var_ids[2]: [5, 6, 7],
-        "cell_medium": pd.Categorical(["DMSO", ifng, "DMSO"]),
+        "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
         "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
@@ -60,7 +60,7 @@ def small_dataset2(
         var_ids[0]: [2, 3, 3],
         var_ids[1]: [3, 4, 5],
         var_ids[2]: [4, 2, 3],
-        "cell_medium": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
     }
     metadata = {
@@ -74,7 +74,7 @@ def small_dataset2(
         )
         ad.AnnData(
             dataset_df[var_ids],
-            obs=dataset_df[["cell_medium", "cell_type_by_model"]],
+            obs=dataset_df[["perturbation", "cell_type_by_model"]],
         )
     if otype == "DataFrame":
         for key, value in metadata.items():
lamindb/core/loaders.py CHANGED
@@ -65,8 +65,8 @@ def load_tsv(path: UPathStr, **kwargs) -> pd.DataFrame:
 def load_h5ad(filepath, **kwargs) -> ad.AnnData:
     """Load an `.h5ad` file to `AnnData`."""
     fs, filepath = infer_filesystem(filepath)
-
-    with fs.open(filepath, mode="rb") as file:
+    compression = kwargs.pop("compression", "infer")
+    with fs.open(filepath, mode="rb", compression=compression) as file:
         adata = ad.read_h5ad(file, backed=False, **kwargs)
         return adata
 
@@ -148,9 +148,13 @@ def load_rds(path: UPathStr) -> UPathStr:
 
 FILE_LOADERS = {
     ".csv": pd.read_csv,
+    ".csv.gz": pd.read_csv,
     ".tsv": load_tsv,
+    ".tsv.gz": load_tsv,
     ".h5ad": load_h5ad,
+    ".h5ad.gz": load_h5ad,
     ".parquet": pd.read_parquet,
+    ".parquet.gz": pd.read_parquet,  # this doesn't work for externally gzipped files, REMOVE LATER
     ".fcs": load_fcs,
     ".zarr": load_anndata_zarr,
     ".html": load_html,
@@ -177,7 +181,15 @@ def load_to_memory(filepath: UPathStr, **kwargs):
 
     filepath = settings._storage_settings.cloud_to_local(filepath, print_progress=True)
 
-    loader = FILE_LOADERS.get(filepath.suffix)
+    # infer the correct suffix when .gz is present
+    suffixes = filepath.suffixes
+    suffix = (
+        "".join(suffixes[-2:])
+        if len(suffixes) > 1 and ".gz" in suffixes
+        else filepath.suffix
+    )
+
+    loader = FILE_LOADERS.get(suffix)
     if loader is None:
         return filepath
     else:
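
The suffix reconstruction in plain Python, showing how a gzipped file now resolves to a compound key that exists in FILE_LOADERS:

    from pathlib import PurePosixPath

    suffixes = PurePosixPath("table.csv.gz").suffixes  # ['.csv', '.gz']
    suffix = "".join(suffixes[-2:]) if len(suffixes) > 1 and ".gz" in suffixes else suffixes[-1]
    assert suffix == ".csv.gz"  # hits the new ".csv.gz" loader entry
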
@@ -16,6 +16,7 @@ from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
 from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
 from anndata.compat import _read_attr
 from fsspec.implementations.local import LocalFileSystem
+from fsspec.utils import infer_compression
 from lamin_utils import logger
 from lamindb_setup.core.upath import create_mapper, infer_filesystem
 from packaging import version
@@ -152,9 +153,13 @@ registry = AccessRegistry()
 
 
 @registry.register_open("h5py")
-def open(filepath: UPathStr, mode: str = "r"):
+def open(filepath: UPathStr, mode: str = "r", compression: str | None = "infer"):
     fs, file_path_str = infer_filesystem(filepath)
-    if isinstance(fs, LocalFileSystem):
+    # we don't open compressed files directly because we need fsspec to uncompress on .open
+    compression = (
+        infer_compression(file_path_str) if compression == "infer" else compression
+    )
+    if isinstance(fs, LocalFileSystem) and compression is None:
         assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!"  # noqa: S101
         return None, h5py.File(file_path_str, mode=mode)
     if mode == "r":
@@ -165,7 +170,7 @@ def open(filepath: UPathStr, mode: str = "r"):
         conn_mode = "ab"
     else:
         raise ValueError(f"Unknown mode {mode}! Should be 'r', 'w' or 'a'.")
-    conn = fs.open(file_path_str, mode=conn_mode)
+    conn = fs.open(file_path_str, mode=conn_mode, compression=compression)
     try:
         storage = h5py.File(conn, mode=mode)
     except Exception as e:
@@ -70,6 +70,7 @@ def backed_access(
     artifact_or_filepath: Artifact | UPath,
     mode: str = "r",
     using_key: str | None = None,
+    **kwargs,
 ) -> (
     AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
 ):
@@ -80,18 +81,22 @@ def backed_access(
     else:
         objectpath = artifact_or_filepath
     name = objectpath.name
-    suffix = objectpath.suffix
+    # ignore .gz, only check the real suffix
+    suffixes = objectpath.suffixes
+    suffix = (
+        suffixes[-2] if len(suffixes) > 1 and ".gz" in suffixes else objectpath.suffix
+    )
 
     if name == "soma" or suffix == ".tiledbsoma":
         if mode not in {"r", "w"}:
             raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.")
-        return _open_tiledbsoma(objectpath, mode=mode)  # type: ignore
+        return _open_tiledbsoma(objectpath, mode=mode, **kwargs)  # type: ignore
     elif suffix in {".h5", ".hdf5", ".h5ad"}:
-        conn, storage = registry.open("h5py", objectpath, mode=mode)
+        conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
-        conn, storage = registry.open("zarr", objectpath, mode=mode)
+        conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
     elif _is_pyarrow_dataset(objectpath):
-        return _open_pyarrow_dataset(objectpath)
+        return _open_pyarrow_dataset(objectpath, **kwargs)
     else:
         raise ValueError(
             "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "