lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. lamindb/__init__.py +31 -26
  2. lamindb/_finish.py +9 -1
  3. lamindb/_tracked.py +26 -3
  4. lamindb/_view.py +2 -3
  5. lamindb/base/__init__.py +1 -1
  6. lamindb/base/ids.py +1 -10
  7. lamindb/base/users.py +1 -4
  8. lamindb/core/__init__.py +7 -65
  9. lamindb/core/_context.py +41 -10
  10. lamindb/core/_mapped_collection.py +4 -2
  11. lamindb/core/_settings.py +6 -6
  12. lamindb/core/_sync_git.py +1 -1
  13. lamindb/core/_track_environment.py +2 -1
  14. lamindb/core/datasets/_small.py +3 -3
  15. lamindb/core/loaders.py +22 -9
  16. lamindb/core/storage/_anndata_accessor.py +8 -3
  17. lamindb/core/storage/_backed_access.py +14 -7
  18. lamindb/core/storage/_pyarrow_dataset.py +24 -9
  19. lamindb/core/storage/_tiledbsoma.py +6 -4
  20. lamindb/core/storage/_zarr.py +32 -11
  21. lamindb/core/storage/objects.py +59 -26
  22. lamindb/core/storage/paths.py +16 -13
  23. lamindb/curators/__init__.py +173 -145
  24. lamindb/errors.py +1 -1
  25. lamindb/integrations/_vitessce.py +4 -4
  26. lamindb/migrations/0089_subsequent_runs.py +159 -0
  27. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  28. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  29. lamindb/models/__init__.py +79 -0
  30. lamindb/{core → models}/_describe.py +3 -3
  31. lamindb/{core → models}/_django.py +8 -5
  32. lamindb/{core → models}/_feature_manager.py +103 -87
  33. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  34. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  35. lamindb/{core → models}/_label_manager.py +10 -17
  36. lamindb/{core/relations.py → models/_relations.py} +8 -1
  37. lamindb/models/artifact.py +2601 -0
  38. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  39. lamindb/models/collection.py +683 -0
  40. lamindb/models/core.py +135 -0
  41. lamindb/models/feature.py +643 -0
  42. lamindb/models/flextable.py +163 -0
  43. lamindb/{_parents.py → models/has_parents.py} +55 -49
  44. lamindb/models/project.py +384 -0
  45. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  46. lamindb/{_query_set.py → models/query_set.py} +52 -30
  47. lamindb/models/record.py +1757 -0
  48. lamindb/models/run.py +563 -0
  49. lamindb/{_save.py → models/save.py} +18 -8
  50. lamindb/models/schema.py +732 -0
  51. lamindb/models/transform.py +360 -0
  52. lamindb/models/ulabel.py +249 -0
  53. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
  54. lamindb-1.2a2.dist-info/RECORD +94 -0
  55. lamindb/_artifact.py +0 -1361
  56. lamindb/_collection.py +0 -440
  57. lamindb/_feature.py +0 -316
  58. lamindb/_is_versioned.py +0 -40
  59. lamindb/_record.py +0 -1065
  60. lamindb/_run.py +0 -60
  61. lamindb/_schema.py +0 -347
  62. lamindb/_storage.py +0 -15
  63. lamindb/_transform.py +0 -170
  64. lamindb/_ulabel.py +0 -56
  65. lamindb/_utils.py +0 -9
  66. lamindb/base/validation.py +0 -63
  67. lamindb/core/_data.py +0 -491
  68. lamindb/core/fields.py +0 -12
  69. lamindb/models.py +0 -4435
  70. lamindb-1.1.0.dist-info/RECORD +0 -95
  71. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
  72. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
lamindb/__init__.py CHANGED
@@ -1,12 +1,13 @@
1
1
  """A data framework for biology.
2
2
 
3
- Tracking notebooks & scripts.
3
+ Tracking notebooks, scripts & functions.
4
4
 
5
5
  .. autosummary::
6
6
  :toctree: .
7
7
 
8
8
  track
9
9
  finish
10
+ tracked
10
11
 
11
12
  Registries.
12
13
 
@@ -15,8 +16,8 @@ Registries.
15
16
 
16
17
  Artifact
17
18
  Transform
18
- ULabel
19
19
  Run
20
+ ULabel
20
21
  User
21
22
  Storage
22
23
  Feature
@@ -24,9 +25,17 @@ Registries.
24
25
  Param
25
26
  Collection
26
27
  Project
28
+ Space
27
29
  Reference
28
30
  Person
29
31
 
32
+ Curators & integrations.
33
+
34
+ .. autosummary::
35
+
36
+ curators
37
+ integrations
38
+
30
39
  Key functionality.
31
40
 
32
41
  .. autosummary::
@@ -35,20 +44,19 @@ Key functionality.
35
44
  connect
36
45
  view
37
46
  save
47
+ UPath
48
+ settings
38
49
 
39
- Modules and settings.
50
+ Low-level functionality.
40
51
 
41
52
  .. autosummary::
42
53
  :toctree: .
43
54
 
44
- integrations
45
55
  context
46
- curators
47
- settings
48
56
  errors
49
57
  setup
50
- UPath
51
58
  base
59
+ models
52
60
  core
53
61
 
54
62
  Backward compatibility.
@@ -57,11 +65,20 @@ Backward compatibility.
57
65
  :toctree: .
58
66
 
59
67
  FeatureSet
68
+ Curator
60
69
 
61
70
  """
62
71
 
72
+ # ruff: noqa: I001
63
73
  # denote a release candidate for 0.1.0 with 0.1rc1, 0.1a1, 0.1b1, etc.
64
- __version__ = "1.1.0"
74
+ __version__ = "1.2a2"
75
+
76
+ import warnings
77
+
78
+ # through SpatialData
79
+ warnings.filterwarnings(
80
+ "ignore", message="The legacy Dask DataFrame implementation is deprecated"
81
+ )
65
82
 
66
83
  from lamindb_setup._check_setup import InstanceNotSetupError as _InstanceNotSetupError
67
84
  from lamindb_setup._check_setup import _check_instance_setup
@@ -77,23 +94,7 @@ def __getattr__(name):
77
94
 
78
95
  if _check_instance_setup(from_module="lamindb"):
79
96
  del __getattr__ # so that imports work out
80
- from . import core # isort: split
81
- from . import (
82
- _artifact,
83
- _can_curate,
84
- _collection,
85
- _feature,
86
- _is_versioned,
87
- _parents,
88
- _record,
89
- _run,
90
- _schema,
91
- _storage,
92
- _transform,
93
- _ulabel,
94
- integrations,
95
- )
96
- from ._save import save
97
+ from . import base
97
98
  from ._tracked import tracked
98
99
  from ._view import view
99
100
  from .core._context import context
@@ -109,12 +110,16 @@ if _check_instance_setup(from_module="lamindb"):
109
110
  Project,
110
111
  Reference,
111
112
  Run,
112
- Schema, # forward compat
113
+ Schema,
113
114
  Storage,
114
115
  Transform,
115
116
  ULabel,
116
117
  User,
118
+ Space,
117
119
  )
120
+ from .models.save import save
121
+ from . import core
122
+ from . import integrations
118
123
 
119
124
  track = context.track # simple access
120
125
  finish = context.finish # simple access
lamindb/_finish.py CHANGED
@@ -436,7 +436,15 @@ def save_context_core(
436
436
  # save both run & transform records if we arrive here
437
437
  if run is not None:
438
438
  run.save()
439
- transform.save()
439
+ transform_id_prior_to_save = transform.id
440
+ transform.save() # this in-place updates the state of transform upon hash collision
441
+ if transform.id != transform_id_prior_to_save:
442
+ # the hash existed and we're actually back to the previous version
443
+ # hence, this was in fact a run of the previous transform rather than of
444
+ # the new transform
445
+ # this can happen in interactive notebooks if the user makes no change to the notebook
446
+ run.transform = transform
447
+ run.save()
440
448
 
441
449
  # finalize
442
450
  if not from_cli and run is not None:
lamindb/_tracked.py CHANGED
@@ -5,8 +5,8 @@ from datetime import datetime, timezone
5
5
  from typing import Callable, ParamSpec, TypeVar
6
6
 
7
7
  from .core._context import context
8
- from .core._feature_manager import infer_feature_type_convert_json
9
8
  from .models import Run, Transform
9
+ from .models._feature_manager import infer_feature_type_convert_json
10
10
 
11
11
  P = ParamSpec("P")
12
12
  R = TypeVar("R")
@@ -26,10 +26,33 @@ def get_current_tracked_run() -> Run | None:
26
26
 
27
27
 
28
28
  def tracked(uid: str | None = None) -> Callable[[Callable[P, R]], Callable[P, R]]:
29
- """Decorator that tracks function execution.
29
+ """Mark a function as tracked with this decorator.
30
+
31
+ You will be able to see inputs, outputs, and parameters of the function in the data lineage graph.
32
+
33
+ Guide: :doc:`/track`
34
+
35
+ .. versionadded:: 1.1.0
36
+ This is still in beta and will be refined in future releases.
30
37
 
31
38
  Args:
32
- uid: Optional unique identifier for the transform
39
+ uid: Persist the uid to identify this transform across renames.
40
+
41
+ Example::
42
+
43
+ import lamindb as ln
44
+
45
+ @ln.tracked()
46
+ def subset_dataframe(
47
+ input_artifact_key: str, # all arguments tracked as parameters of the function run
48
+ output_artifact_key: str,
49
+ subset_rows: int = 2,
50
+ subset_cols: int = 2,
51
+ ) -> None:
52
+ artifact = ln.Artifact.get(key=input_artifact_key)
53
+ df = artifact.load() # auto-tracked as input
54
+ new_df = df.iloc[:subset_rows, :subset_cols]
55
+ ln.Artifact.from_df(new_df, key=output_artifact_key).save() # auto-tracked as output
33
56
  """
34
57
 
35
58
  def decorator_tracked(func: Callable[P, R]) -> Callable[P, R]:
lamindb/_view.py CHANGED
@@ -9,10 +9,9 @@ from lamin_utils import colors, logger
9
9
  from lamindb_setup import settings
10
10
  from lamindb_setup._init_instance import get_schema_module_name
11
11
 
12
- from lamindb.core import FeatureValue, ParamValue
13
- from lamindb.models import Feature, Record
12
+ from lamindb.models import Feature, FeatureValue, ParamValue, Record
14
13
 
15
- from ._feature import convert_pandas_dtype_to_lamin_dtype
14
+ from .models.feature import convert_pandas_dtype_to_lamin_dtype
16
15
 
17
16
  if TYPE_CHECKING:
18
17
  import pandas as pd
lamindb/base/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  """Base library.
2
2
 
3
- Is available also when no instance is connected.
3
+ Is available also when no instance is set up.
4
4
 
5
5
  Modules:
6
6
 
lamindb/base/ids.py CHANGED
@@ -1,4 +1,4 @@
1
- """IDs.
1
+ """Universal IDs.
2
2
 
3
3
  Base generators:
4
4
 
@@ -55,15 +55,6 @@ def base62(n_char: int) -> str:
55
55
  return id
56
56
 
57
57
 
58
- # the following cannot be serialized by Django
59
- # class Base62:
60
- # def __init__(self, n_char: int):
61
- # self.n_char = n_char
62
-
63
- # def __call__(self):
64
- # return base62(self.n_char)
65
-
66
-
67
58
  def base26(n_char: int):
68
59
  """ASCII lowercase."""
69
60
  alphabet = string.ascii_lowercase
lamindb/base/users.py CHANGED
@@ -12,12 +12,9 @@ def current_user_id() -> int:
12
12
  if ln_setup.core.django.IS_MIGRATING:
13
13
  return 1
14
14
  else:
15
- exc_attr = (
16
- "DoesNotExist" if hasattr(User, "DoesNotExist") else "_DoesNotExist"
17
- )
18
15
  try:
19
16
  user_id = User.objects.get(uid=settings.user.uid).id
20
- except getattr(User, exc_attr):
17
+ except User.DoesNotExist:
21
18
  register_user(settings.user)
22
19
  user_id = User.objects.get(uid=settings.user.uid).id
23
20
  return user_id
lamindb/core/__init__.py CHANGED
@@ -1,49 +1,20 @@
1
1
  """Core library.
2
2
 
3
- Registries:
4
-
5
- .. autosummary::
6
- :toctree: .
7
-
8
- BasicRecord
9
- Record
10
- Registry
11
- QuerySet
12
- QueryManager
13
- RecordList
14
- FeatureManager
15
- ParamManager
16
- LabelManager
17
- IsVersioned
18
- CanCurate
19
- HasParents
20
- TracksRun
21
- TracksUpdates
22
- ParamValue
23
- FeatureValue
24
- InspectResult
25
- ValidateFields
26
- fields
27
-
28
- Curators:
3
+ Settings & context:
29
4
 
30
5
  .. autosummary::
31
6
  :toctree: .
32
7
 
33
- CatManager
34
- DataFrameCatManager
35
- AnnDataCatManager
36
- MuDataCatManager
37
- TiledbsomaCatManager
38
- CurateLookup
8
+ Settings
9
+ subsettings
10
+ Context
39
11
 
40
- Settings & context:
12
+ Artifact loaders:
41
13
 
42
14
  .. autosummary::
43
15
  :toctree: .
44
16
 
45
- Settings
46
- Context
17
+ loaders
47
18
 
48
19
  Data loaders:
49
20
 
@@ -57,10 +28,8 @@ Modules:
57
28
  .. autosummary::
58
29
  :toctree: .
59
30
 
60
- loaders
61
31
  datasets
62
32
  storage
63
- subsettings
64
33
  logger
65
34
 
66
35
  """
@@ -68,35 +37,8 @@ Modules:
68
37
  from lamin_utils import logger
69
38
  from lamin_utils._inspect import InspectResult
70
39
 
71
- from lamindb._query_manager import QueryManager
72
- from lamindb._query_set import QuerySet, RecordList
73
- from lamindb.core._feature_manager import FeatureManager, ParamManager
74
- from lamindb.core._label_manager import LabelManager
75
- from lamindb.curators import (
76
- AnnDataCatManager,
77
- CatManager,
78
- CurateLookup,
79
- Curator,
80
- DataFrameCatManager,
81
- MuDataCatManager,
82
- TiledbsomaCatManager,
83
- )
84
- from lamindb.models import (
85
- BasicRecord,
86
- CanCurate,
87
- FeatureValue,
88
- HasParents,
89
- IsVersioned,
90
- ParamValue,
91
- Record,
92
- Registry,
93
- TracksRun,
94
- TracksUpdates,
95
- ValidateFields,
96
- )
97
-
98
40
  from .. import errors as exceptions
99
- from . import _data, datasets, fields, loaders, subsettings, types
41
+ from . import datasets, loaders, subsettings, types
100
42
  from ._context import Context
101
43
  from ._mapped_collection import MappedCollection
102
44
  from ._settings import Settings
lamindb/core/_context.py CHANGED
@@ -11,29 +11,35 @@ from pathlib import Path
11
11
  from typing import TYPE_CHECKING
12
12
 
13
13
  import lamindb_setup as ln_setup
14
- from django.db.models import Func, IntegerField
14
+ from django.db.models import Func, IntegerField, Q
15
15
  from lamin_utils import logger
16
+ from lamindb_setup.core import deprecated
16
17
  from lamindb_setup.core.hashing import hash_file
17
18
 
18
19
  from lamindb.base import ids
19
20
  from lamindb.base.ids import base62_12
20
21
  from lamindb.models import Run, Transform, format_field_value
21
22
 
23
+ from ..core._settings import settings
22
24
  from ..errors import (
23
25
  InconsistentKey,
26
+ InvalidArgument,
24
27
  TrackNotCalled,
25
28
  UpdateContext,
26
29
  )
27
- from ._settings import settings
30
+ from ..models._is_versioned import bump_version as bump_version_function
31
+ from ..models._is_versioned import (
32
+ increment_base62,
33
+ message_update_key_in_version_family,
34
+ )
28
35
  from ._sync_git import get_transform_reference_from_git_repo
29
36
  from ._track_environment import track_environment
30
- from .versioning import bump_version as bump_version_function
31
- from .versioning import increment_base62, message_update_key_in_version_family
32
37
 
33
38
  if TYPE_CHECKING:
34
39
  from lamindb_setup.core.types import UPathStr
35
40
 
36
41
  from lamindb.base.types import TransformType
42
+ from lamindb.models import Project
37
43
 
38
44
  is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
39
45
 
@@ -197,6 +203,7 @@ class Context:
197
203
  self._run: Run | None = None
198
204
  self._path: Path | None = None
199
205
  """A local path to the script that's running."""
206
+ self._project: Project | None = None
200
207
  self._logging_message_track: str = ""
201
208
  self._logging_message_imports: str = ""
202
209
  self._stream_tracker: LogStreamTracker = LogStreamTracker()
@@ -217,8 +224,8 @@ class Context:
217
224
  self._description = value
218
225
 
219
226
  @property
227
+ @deprecated(new_name="description")
220
228
  def name(self) -> str | None:
221
- """Deprecated. Populates `description` argument for `context.transform`."""
222
229
  return self._description
223
230
 
224
231
  @name.setter
@@ -243,6 +250,11 @@ class Context:
243
250
  def version(self, value: str | None):
244
251
  self._version = value
245
252
 
253
+ @property
254
+ def project(self) -> Project | None:
255
+ """Project to label entities created during the run."""
256
+ return self._project
257
+
246
258
  @property
247
259
  def run(self) -> Run | None:
248
260
  """Managed run of context."""
@@ -252,12 +264,12 @@ class Context:
252
264
  self,
253
265
  transform: str | Transform | None = None,
254
266
  *,
267
+ project: str | None = None,
255
268
  params: dict | None = None,
256
269
  new_run: bool | None = None,
257
270
  path: str | None = None,
258
- log_to_file: bool | None = None,
259
271
  ) -> None:
260
- """Initiate a run with tracked data lineage.
272
+ """Track a global run of your Python session.
261
273
 
262
274
  - sets :attr:`~lamindb.core.Context.transform` &
263
275
  :attr:`~lamindb.core.Context.run` by creating or loading `Transform` &
@@ -269,14 +281,12 @@ class Context:
269
281
 
270
282
  Args:
271
283
  transform: A transform `uid` or record. If `None`, creates a `uid`.
284
+ project: A project `name` or `uid` for labeling entities created during the run.
272
285
  params: A dictionary of parameters to track for the run.
273
286
  new_run: If `False`, loads the latest run of transform
274
287
  (default notebook), if `True`, creates new run (default non-notebook).
275
288
  path: Filepath of notebook or script. Only needed if it can't be
276
289
  automatically detected.
277
- log_to_file: If `True`, logs stdout and stderr to a file and
278
- saves the file within the current run (default non-notebook),
279
- if `False`, does not log the output (default notebook).
280
290
 
281
291
  Examples:
282
292
 
@@ -284,7 +294,22 @@ class Context:
284
294
 
285
295
  >>> ln.track()
286
296
 
297
+ If you want to ensure a single version history across renames of the notebook or script, pass the auto-generated `uid` that you'll find in the logs:
298
+
299
+ >>> ln.track("Onv04I53OgtT0000") # example uid, the last four characters encode the version of the transform
300
+
287
301
  """
302
+ from lamindb.models import Project
303
+
304
+ if project is not None:
305
+ project_record = Project.filter(
306
+ Q(name=project) | Q(uid=project)
307
+ ).one_or_none()
308
+ if project_record is None:
309
+ raise InvalidArgument(
310
+ f"Project '{project}' not found, either create it with `ln.Project(name='...').save()` or fix typos."
311
+ )
312
+ self._project = project_record
288
313
  self._logging_message_track = ""
289
314
  self._logging_message_imports = ""
290
315
  if transform is not None and isinstance(transform, str):
@@ -370,6 +395,12 @@ class Context:
370
395
  )
371
396
  self._run = run
372
397
  track_environment(run)
398
+ if self.project is not None:
399
+ # to update a potential project link
400
+ # is only necessary if transform is loaded rather than newly created
401
+ # can be optimized by checking whether the transform is loaded, but it typically is
402
+ self.transform.save()
403
+ log_to_file = None
373
404
  if log_to_file is None:
374
405
  log_to_file = self.transform.type != "notebook"
375
406
  if log_to_file:
@@ -27,7 +27,8 @@ if TYPE_CHECKING:
27
27
  class _Connect:
28
28
  def __init__(self, storage):
29
29
  if isinstance(storage, UPath):
30
- self.conn, self.store = registry.open("h5py", storage)
30
+ # force no external compression even for files with .gz extension. REMOVE LATER
31
+ self.conn, self.store = registry.open("h5py", storage, compression=None)
31
32
  self.to_close = True
32
33
  else:
33
34
  self.conn, self.store = None, storage
@@ -246,7 +247,8 @@ class MappedCollection:
246
247
  if parallel:
247
248
  conn, storage = None, path
248
249
  else:
249
- conn, storage = registry.open("h5py", path)
250
+ # force no external compression even for files with .gz extension. REMOVE LATER
251
+ conn, storage = registry.open("h5py", path, compression=None)
250
252
  else:
251
253
  conn, storage = registry.open("zarr", path)
252
254
  self.conns.append(conn)
lamindb/core/_settings.py CHANGED
@@ -155,12 +155,12 @@ class Settings:
155
155
  def verbosity(self) -> str:
156
156
  """Logger verbosity (default `'warning'`).
157
157
 
158
- - `'error'`: only show error messages
159
- - `'warning'`: also show warning messages
160
- - `'success'`: also show success and save messages
161
- - `'info'`: 💡 also show info messages
162
- - `'hint'`: 💡 also show hint messages
163
- - `'debug'`: 🐛 also show detailed debug messages
158
+ - `'error'`: only show error messages
159
+ - `'warning'`: also show warning messages
160
+ - `'success'`: also show success and save messages
161
+ - `'info'`: also show info messages
162
+ - `'hint'`: also show hint messages
163
+ - `'debug'`: also show detailed debug messages
164
164
  """
165
165
  return VERBOSITY_TO_STR[self._verbosity_int]
166
166
 
lamindb/core/_sync_git.py CHANGED
@@ -7,7 +7,7 @@ from lamin_utils import logger
7
7
  from lamindb_setup import settings as setup_settings
8
8
  from lamindb_setup.core.hashing import hash_code
9
9
 
10
- from ._settings import sanitize_git_repo_url, settings
10
+ from ..core._settings import sanitize_git_repo_url, settings
11
11
 
12
12
 
13
13
  class BlobHashNotFound(SystemExit):
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import subprocess
4
+ import sys
4
5
  from typing import TYPE_CHECKING
5
6
 
6
7
  import lamindb_setup as ln_setup
@@ -17,7 +18,7 @@ def track_environment(run: Run) -> None:
17
18
  try:
18
19
  with open(filepath, "w") as f:
19
20
  result = subprocess.run(
20
- ["pip", "freeze"],
21
+ [sys.executable, "-m", "pip", "freeze"],
21
22
  stdout=f,
22
23
  )
23
24
  except OSError as e:
@@ -23,7 +23,7 @@ def small_dataset1(
23
23
  var_ids[0]: [1, 2, 3],
24
24
  var_ids[1]: [3, 4, 5],
25
25
  var_ids[2]: [5, 6, 7],
26
- "cell_medium": pd.Categorical(["DMSO", ifng, "DMSO"]),
26
+ "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
27
27
  "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
28
28
  "cell_type_by_expert": pd.Categorical(["B cell", "T cell", "T cell"]),
29
29
  "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
@@ -60,7 +60,7 @@ def small_dataset2(
60
60
  var_ids[0]: [2, 3, 3],
61
61
  var_ids[1]: [3, 4, 5],
62
62
  var_ids[2]: [4, 2, 3],
63
- "cell_medium": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
63
+ "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
64
64
  "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
65
65
  }
66
66
  metadata = {
@@ -74,7 +74,7 @@ def small_dataset2(
74
74
  )
75
75
  ad.AnnData(
76
76
  dataset_df[var_ids],
77
- obs=dataset_df[["cell_medium", "cell_type_by_model"]],
77
+ obs=dataset_df[["perturbation", "cell_type_by_model"]],
78
78
  )
79
79
  if otype == "DataFrame":
80
80
  for key, value in metadata.items():
lamindb/core/loaders.py CHANGED
@@ -30,13 +30,13 @@ from lamindb_setup.core.upath import (
30
30
  infer_filesystem,
31
31
  )
32
32
 
33
- from ._settings import settings
33
+ from ..core._settings import settings
34
34
 
35
35
  if TYPE_CHECKING:
36
36
  from lamindb_setup.core.types import UPathStr
37
37
 
38
38
  try:
39
- from .storage._zarr import load_anndata_zarr
39
+ from ..core.storage._zarr import load_anndata_zarr
40
40
  except ImportError:
41
41
 
42
42
  def load_anndata_zarr(storepath): # type: ignore
@@ -65,8 +65,8 @@ def load_tsv(path: UPathStr, **kwargs) -> pd.DataFrame:
65
65
  def load_h5ad(filepath, **kwargs) -> ad.AnnData:
66
66
  """Load an `.h5ad` file to `AnnData`."""
67
67
  fs, filepath = infer_filesystem(filepath)
68
-
69
- with fs.open(filepath, mode="rb") as file:
68
+ compression = kwargs.pop("compression", "infer")
69
+ with fs.open(filepath, mode="rb", compression=compression) as file:
70
70
  adata = ad.read_h5ad(file, backed=False, **kwargs)
71
71
  return adata
72
72
 
@@ -148,9 +148,13 @@ def load_rds(path: UPathStr) -> UPathStr:
148
148
 
149
149
  FILE_LOADERS = {
150
150
  ".csv": pd.read_csv,
151
+ ".csv.gz": pd.read_csv,
151
152
  ".tsv": load_tsv,
153
+ ".tsv.gz": load_tsv,
152
154
  ".h5ad": load_h5ad,
155
+ ".h5ad.gz": load_h5ad,
153
156
  ".parquet": pd.read_parquet,
157
+ ".parquet.gz": pd.read_parquet, # this doesn't work for externally gzipped files, REMOVE LATER
154
158
  ".fcs": load_fcs,
155
159
  ".zarr": load_anndata_zarr,
156
160
  ".html": load_html,
@@ -175,10 +179,19 @@ def load_to_memory(filepath: UPathStr, **kwargs):
175
179
  """
176
180
  filepath = create_path(filepath)
177
181
 
178
- filepath = settings._storage_settings.cloud_to_local(filepath, print_progress=True)
182
+ # infer the correct suffix when .gz is present
183
+ suffixes = filepath.suffixes
184
+ suffix = (
185
+ "".join(suffixes[-2:])
186
+ if len(suffixes) > 1 and ".gz" in suffixes
187
+ else filepath.suffix
188
+ )
179
189
 
180
- loader = FILE_LOADERS.get(filepath.suffix)
190
+ loader = FILE_LOADERS.get(suffix, None)
181
191
  if loader is None:
182
- return filepath
183
- else:
184
- return loader(filepath, **kwargs)
192
+ raise NotImplementedError(
193
+ f"There is no loader for {suffix} files. Use .cache() to get the path."
194
+ )
195
+
196
+ filepath = settings._storage_settings.cloud_to_local(filepath, print_progress=True)
197
+ return loader(filepath, **kwargs)
@@ -16,6 +16,7 @@ from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
16
16
  from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
17
17
  from anndata.compat import _read_attr
18
18
  from fsspec.implementations.local import LocalFileSystem
19
+ from fsspec.utils import infer_compression
19
20
  from lamin_utils import logger
20
21
  from lamindb_setup.core.upath import create_mapper, infer_filesystem
21
22
  from packaging import version
@@ -152,9 +153,13 @@ registry = AccessRegistry()
152
153
 
153
154
 
154
155
  @registry.register_open("h5py")
155
- def open(filepath: UPathStr, mode: str = "r"):
156
+ def open(filepath: UPathStr, mode: str = "r", compression: str | None = "infer"):
156
157
  fs, file_path_str = infer_filesystem(filepath)
157
- if isinstance(fs, LocalFileSystem):
158
+ # we don't open compressed files directly because we need fsspec to uncompress on .open
159
+ compression = (
160
+ infer_compression(file_path_str) if compression == "infer" else compression
161
+ )
162
+ if isinstance(fs, LocalFileSystem) and compression is None:
158
163
  assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!" # noqa: S101
159
164
  return None, h5py.File(file_path_str, mode=mode)
160
165
  if mode == "r":
@@ -165,7 +170,7 @@ def open(filepath: UPathStr, mode: str = "r"):
165
170
  conn_mode = "ab"
166
171
  else:
167
172
  raise ValueError(f"Unknown mode {mode}! Should be 'r', 'w' or 'a'.")
168
- conn = fs.open(file_path_str, mode=conn_mode)
173
+ conn = fs.open(file_path_str, mode=conn_mode, compression=compression)
169
174
  try:
170
175
  storage = h5py.File(conn, mode=mode)
171
176
  except Exception as e: