lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lamindb/__init__.py +89 -49
- lamindb/_finish.py +17 -15
- lamindb/_tracked.py +2 -4
- lamindb/_view.py +1 -1
- lamindb/base/__init__.py +2 -1
- lamindb/base/dtypes.py +76 -0
- lamindb/core/_settings.py +45 -2
- lamindb/core/storage/_anndata_accessor.py +118 -26
- lamindb/core/storage/_backed_access.py +10 -7
- lamindb/core/storage/_spatialdata_accessor.py +15 -4
- lamindb/core/storage/_zarr.py +3 -0
- lamindb/curators/_legacy.py +16 -3
- lamindb/curators/core.py +449 -193
- lamindb/errors.py +6 -0
- lamindb/examples/cellxgene/__init__.py +8 -3
- lamindb/examples/cellxgene/_cellxgene.py +127 -13
- lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
- lamindb/examples/croissant/__init__.py +32 -6
- lamindb/examples/datasets/__init__.py +2 -2
- lamindb/examples/datasets/_core.py +9 -2
- lamindb/examples/datasets/_small.py +66 -22
- lamindb/examples/fixtures/sheets.py +8 -2
- lamindb/integrations/_croissant.py +34 -11
- lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
- lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
- lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
- lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
- lamindb/migrations/0121_recorduser.py +60 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +2 -2
- lamindb/models/_feature_manager.py +131 -71
- lamindb/models/_from_values.py +2 -2
- lamindb/models/_is_versioned.py +4 -4
- lamindb/models/_label_manager.py +4 -4
- lamindb/models/artifact.py +357 -192
- lamindb/models/artifact_set.py +45 -1
- lamindb/models/can_curate.py +1 -2
- lamindb/models/collection.py +3 -34
- lamindb/models/feature.py +111 -7
- lamindb/models/has_parents.py +11 -11
- lamindb/models/project.py +42 -2
- lamindb/models/query_manager.py +16 -7
- lamindb/models/query_set.py +191 -78
- lamindb/models/record.py +30 -5
- lamindb/models/run.py +10 -33
- lamindb/models/save.py +6 -8
- lamindb/models/schema.py +54 -26
- lamindb/models/sqlrecord.py +152 -40
- lamindb/models/storage.py +59 -14
- lamindb/models/transform.py +17 -17
- lamindb/models/ulabel.py +6 -1
- {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/METADATA +11 -16
- {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/RECORD +55 -50
- {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/LICENSE +0 -0
- {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/WHEEL +0 -0
@@ -13,12 +13,17 @@ from anndata import __version__ as anndata_version
|
|
13
13
|
from anndata._core.index import _normalize_indices
|
14
14
|
from anndata._core.views import _resolve_idx
|
15
15
|
from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
|
16
|
-
from anndata._io.specs.registry import
|
16
|
+
from anndata._io.specs.registry import (
|
17
|
+
get_spec,
|
18
|
+
read_elem,
|
19
|
+
read_elem_partial,
|
20
|
+
write_elem,
|
21
|
+
)
|
17
22
|
from anndata.compat import _read_attr
|
18
23
|
from fsspec.implementations.local import LocalFileSystem
|
19
24
|
from fsspec.utils import infer_compression
|
20
25
|
from lamin_utils import logger
|
21
|
-
from lamindb_setup.core.upath import infer_filesystem
|
26
|
+
from lamindb_setup.core.upath import S3FSMap, infer_filesystem
|
22
27
|
from packaging import version
|
23
28
|
from upath import UPath
|
24
29
|
|
@@ -28,6 +33,8 @@ if TYPE_CHECKING:
|
|
28
33
|
from fsspec.core import OpenFile
|
29
34
|
from lamindb_setup.types import UPathStr
|
30
35
|
|
36
|
+
from lamindb import Artifact
|
37
|
+
|
31
38
|
|
32
39
|
anndata_version_parse = version.parse(anndata_version)
|
33
40
|
|
@@ -288,7 +295,7 @@ except ImportError:
|
|
288
295
|
if ZARR_INSTALLED:
|
289
296
|
from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr
|
290
297
|
|
291
|
-
from ._zarr import get_zarr_store
|
298
|
+
from ._zarr import IS_ZARR_V3, get_zarr_store
|
292
299
|
|
293
300
|
ArrayTypes.append(zarr.Array)
|
294
301
|
GroupTypes.append(zarr.Group)
|
@@ -299,7 +306,18 @@ if ZARR_INSTALLED:
|
|
299
306
|
assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!" # noqa: S101
|
300
307
|
|
301
308
|
store = get_zarr_store(filepath)
|
302
|
-
|
309
|
+
kwargs = {}
|
310
|
+
if IS_ZARR_V3 and mode != "r":
|
311
|
+
# otherwise unable to write
|
312
|
+
kwargs["use_consolidated"] = False
|
313
|
+
storage = zarr.open(store, mode=mode, **kwargs)
|
314
|
+
# zarr v2 re-initializes the mapper
|
315
|
+
# we need to put back the correct one
|
316
|
+
# S3FSMap is returned from get_zarr_store only for zarr v2
|
317
|
+
if isinstance(store, S3FSMap):
|
318
|
+
assert not IS_ZARR_V3 # noqa: S101
|
319
|
+
|
320
|
+
storage.store.map = store
|
303
321
|
conn = None
|
304
322
|
return conn, storage
|
305
323
|
|
@@ -351,10 +369,10 @@ if ZARR_INSTALLED:
|
|
351
369
|
# this is needed because accessing zarr.Group.keys() directly is very slow
|
352
370
|
@registry.register("zarr")
|
353
371
|
def keys(storage: zarr.Group):
|
354
|
-
if
|
372
|
+
if IS_ZARR_V3:
|
355
373
|
paths = storage._sync_iter(storage.store.list())
|
356
374
|
else:
|
357
|
-
paths = storage.store.keys()
|
375
|
+
paths = storage.store.keys()
|
358
376
|
|
359
377
|
attrs_keys: dict[str, list] = {}
|
360
378
|
obs_var_arrays = []
|
@@ -438,9 +456,15 @@ def _try_backed_full(elem):
|
|
438
456
|
return read_elem(elem)
|
439
457
|
|
440
458
|
|
459
|
+
def _to_index(elem: np.ndarray):
|
460
|
+
if elem.dtype in (np.float64, np.int64):
|
461
|
+
elem = elem.astype(str)
|
462
|
+
return pd.Index(elem)
|
463
|
+
|
464
|
+
|
441
465
|
def _safer_read_index(elem):
|
442
466
|
if isinstance(elem, GroupTypes):
|
443
|
-
return
|
467
|
+
return _to_index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
|
444
468
|
elif isinstance(elem, ArrayTypes):
|
445
469
|
indices = None
|
446
470
|
for index_name in ("index", "_index"):
|
@@ -450,7 +474,7 @@ def _safer_read_index(elem):
|
|
450
474
|
if indices is not None and len(indices) > 0:
|
451
475
|
if isinstance(indices[0], bytes):
|
452
476
|
indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
|
453
|
-
return
|
477
|
+
return _to_index(indices)
|
454
478
|
else:
|
455
479
|
raise ValueError("Indices not found.")
|
456
480
|
else:
|
@@ -479,33 +503,40 @@ class _MapAccessor:
|
|
479
503
|
return descr
|
480
504
|
|
481
505
|
|
506
|
+
def _safer_read_df(elem, indices=None):
|
507
|
+
if indices is not None:
|
508
|
+
obj = registry.safer_read_partial(elem, indices=indices)
|
509
|
+
df = _records_to_df(obj)
|
510
|
+
else:
|
511
|
+
df = registry.read_dataframe(elem)
|
512
|
+
if df.index.dtype in (np.float64, np.int64):
|
513
|
+
df.index = df.index.astype(str)
|
514
|
+
return df
|
515
|
+
|
516
|
+
|
482
517
|
class _AnnDataAttrsMixin:
|
483
518
|
storage: StorageType
|
484
519
|
_attrs_keys: Mapping[str, list]
|
485
520
|
|
486
521
|
@cached_property
|
487
|
-
def obs(self) -> pd.DataFrame:
|
522
|
+
def obs(self) -> pd.DataFrame | None:
|
488
523
|
if "obs" not in self._attrs_keys:
|
489
524
|
return None
|
490
525
|
indices = getattr(self, "indices", None)
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
else:
|
496
|
-
return registry.read_dataframe(self.storage["obs"]) # type: ignore
|
526
|
+
return _safer_read_df(
|
527
|
+
self.storage["obs"], # type: ignore
|
528
|
+
indices=(indices[0], slice(None)) if indices is not None else None,
|
529
|
+
)
|
497
530
|
|
498
531
|
@cached_property
|
499
|
-
def var(self) -> pd.DataFrame:
|
532
|
+
def var(self) -> pd.DataFrame | None:
|
500
533
|
if "var" not in self._attrs_keys:
|
501
534
|
return None
|
502
535
|
indices = getattr(self, "indices", None)
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
else:
|
508
|
-
return registry.read_dataframe(self.storage["var"]) # type: ignore
|
536
|
+
return _safer_read_df(
|
537
|
+
self.storage["var"], # type: ignore
|
538
|
+
indices=(indices[1], slice(None)) if indices is not None else None,
|
539
|
+
)
|
509
540
|
|
510
541
|
@cached_property
|
511
542
|
def uns(self):
|
@@ -702,6 +733,7 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
|
|
702
733
|
connection: OpenFile | None,
|
703
734
|
storage: StorageType,
|
704
735
|
filename: str,
|
736
|
+
artifact: Artifact | None = None,
|
705
737
|
):
|
706
738
|
self._conn = connection
|
707
739
|
self.storage = storage
|
@@ -713,14 +745,43 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
|
|
713
745
|
self._obs_names = _safer_read_index(self.storage["obs"]) # type: ignore
|
714
746
|
self._var_names = _safer_read_index(self.storage["var"]) # type: ignore
|
715
747
|
|
748
|
+
self._artifact = artifact # save artifact to update in write mode
|
749
|
+
|
750
|
+
self._updated = False # track updates in r+ mode for zarr
|
751
|
+
|
752
|
+
self._entered = False # check that the context manager is used
|
716
753
|
self._closed = False
|
717
754
|
|
718
755
|
def close(self):
|
719
756
|
"""Closes the connection."""
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
757
|
+
storage = self.storage
|
758
|
+
connection = self._conn
|
759
|
+
|
760
|
+
if self._updated and (artifact := self._artifact) is not None:
|
761
|
+
from lamindb.models.artifact import Artifact
|
762
|
+
from lamindb.models.sqlrecord import init_self_from_db
|
763
|
+
|
764
|
+
# now self._updated can only be True for zarr
|
765
|
+
assert ZARR_INSTALLED # noqa: S101
|
766
|
+
|
767
|
+
store = storage.store
|
768
|
+
keys = storage._sync_iter(store.list()) if IS_ZARR_V3 else store.keys()
|
769
|
+
# this checks that there consolidated metadata was written before
|
770
|
+
# need to update it
|
771
|
+
# zmetadata is in spatialdata sometimes for some reason
|
772
|
+
if ".zmetadata" in keys or "zmetadata" in keys:
|
773
|
+
zarr.consolidate_metadata(store)
|
774
|
+
|
775
|
+
new_version = Artifact(
|
776
|
+
artifact.path, revises=artifact, _is_internal_call=True
|
777
|
+
).save()
|
778
|
+
# note: sets _state.db = "default"
|
779
|
+
init_self_from_db(artifact, new_version)
|
780
|
+
|
781
|
+
if hasattr(storage, "close"):
|
782
|
+
storage.close()
|
783
|
+
if hasattr(connection, "close"):
|
784
|
+
connection.close()
|
724
785
|
self._closed = True
|
725
786
|
|
726
787
|
@property
|
@@ -728,6 +789,8 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
|
|
728
789
|
return self._closed
|
729
790
|
|
730
791
|
def __enter__(self):
|
792
|
+
self._entered = True
|
793
|
+
|
731
794
|
return self
|
732
795
|
|
733
796
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
@@ -763,6 +826,35 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
|
|
763
826
|
self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
|
764
827
|
)
|
765
828
|
|
829
|
+
def add_column(
|
830
|
+
self,
|
831
|
+
where: Literal["obs", "var"],
|
832
|
+
col_name: str,
|
833
|
+
col: np.ndarray | pd.Categorical,
|
834
|
+
):
|
835
|
+
"""Add a new column to .obs or .var of the underlying AnnData object."""
|
836
|
+
df_store = self.storage[where] # type: ignore
|
837
|
+
if getattr(df_store, "read_only", True):
|
838
|
+
raise ValueError(
|
839
|
+
"You can use .add_column(...) only with zarr in a writable mode."
|
840
|
+
)
|
841
|
+
write_elem(df_store, col_name, col)
|
842
|
+
df_store.attrs["column-order"] = df_store.attrs["column-order"] + [col_name]
|
843
|
+
# remind only once if this wasn't updated before and not in the context manager
|
844
|
+
if not self._updated and not self._entered and self._artifact is not None:
|
845
|
+
logger.important(
|
846
|
+
"Do not forget to call .close() after you finish "
|
847
|
+
f"working with this accessor for {self._name} "
|
848
|
+
"to automatically update the corresponding artifact."
|
849
|
+
)
|
850
|
+
|
851
|
+
self._updated = True
|
852
|
+
# reset the cached property
|
853
|
+
# todo: maybe just append the column if the df was already loaded
|
854
|
+
self.__dict__.pop(where, None)
|
855
|
+
# update the cached columns
|
856
|
+
self._attrs_keys[where].append(col_name)
|
857
|
+
|
766
858
|
|
767
859
|
# get the number of observations in an anndata object or file fast and safely
|
768
860
|
def _anndata_n_observations(object: UPathStr | AnnData) -> int | None:
|
@@ -4,6 +4,7 @@ from dataclasses import dataclass
|
|
4
4
|
from pathlib import Path
|
5
5
|
from typing import TYPE_CHECKING, Any, Callable, Literal
|
6
6
|
|
7
|
+
import h5py
|
7
8
|
from anndata._io.specs.registry import get_spec
|
8
9
|
|
9
10
|
from ._anndata_accessor import AnnDataAccessor, StorageType, registry
|
@@ -92,10 +93,10 @@ def backed_access(
|
|
92
93
|
from lamindb.models import Artifact
|
93
94
|
|
94
95
|
if isinstance(artifact_or_filepath, Artifact):
|
95
|
-
|
96
|
-
|
97
|
-
)
|
96
|
+
artifact = artifact_or_filepath
|
97
|
+
objectpath, _ = filepath_from_artifact(artifact, using_key=using_key)
|
98
98
|
else:
|
99
|
+
artifact = None
|
99
100
|
objectpath = artifact_or_filepath
|
100
101
|
name = objectpath.name
|
101
102
|
# ignore .gz, only check the real suffix
|
@@ -111,9 +112,11 @@ def backed_access(
|
|
111
112
|
elif suffix in {".h5", ".hdf5", ".h5ad"}:
|
112
113
|
conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
|
113
114
|
elif suffix == ".zarr":
|
115
|
+
if mode not in {"r", "r+"}:
|
116
|
+
raise ValueError("`mode` should be either 'r' or 'r+' for zarr.")
|
114
117
|
conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
|
115
118
|
if "spatialdata_attrs" in storage.attrs:
|
116
|
-
return SpatialDataAccessor(storage, name)
|
119
|
+
return SpatialDataAccessor(storage, name, artifact)
|
117
120
|
elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
|
118
121
|
df_suffix := df_suffixes.pop()
|
119
122
|
) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
|
@@ -127,9 +130,9 @@ def backed_access(
|
|
127
130
|
|
128
131
|
is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
|
129
132
|
if is_anndata:
|
130
|
-
if mode != "r":
|
131
|
-
raise ValueError("Can only access `AnnData` with mode='r'.")
|
132
|
-
return AnnDataAccessor(conn, storage, name)
|
133
|
+
if mode != "r" and isinstance(storage, h5py.Group):
|
134
|
+
raise ValueError("Can only access `hdf5` `AnnData` with mode='r'.")
|
135
|
+
return AnnDataAccessor(conn, storage, name, artifact)
|
133
136
|
else:
|
134
137
|
return BackedAccessor(conn, storage)
|
135
138
|
|
@@ -8,13 +8,22 @@ from ._anndata_accessor import AnnDataAccessor
|
|
8
8
|
if TYPE_CHECKING:
|
9
9
|
from zarr import Group
|
10
10
|
|
11
|
+
from lamindb import Artifact
|
12
|
+
|
11
13
|
|
12
14
|
class _TablesAccessor:
|
13
|
-
def __init__(self, tables: Group):
|
15
|
+
def __init__(self, tables: Group, artifact: Artifact | None = None):
|
14
16
|
self._tables = tables
|
15
17
|
|
18
|
+
self._artifact = artifact
|
19
|
+
|
16
20
|
def __getitem__(self, key: str) -> AnnDataAccessor:
|
17
|
-
return AnnDataAccessor(
|
21
|
+
return AnnDataAccessor(
|
22
|
+
connection=None,
|
23
|
+
storage=self._tables[key],
|
24
|
+
filename=key,
|
25
|
+
artifact=self._artifact,
|
26
|
+
)
|
18
27
|
|
19
28
|
def keys(self) -> list[str]:
|
20
29
|
return list(self._tables.keys())
|
@@ -33,14 +42,16 @@ class SpatialDataAccessor:
|
|
33
42
|
For now only allows to access `tables`.
|
34
43
|
"""
|
35
44
|
|
36
|
-
def __init__(self, storage: Group, name: str):
|
45
|
+
def __init__(self, storage: Group, name: str, artifact: Artifact | None = None):
|
37
46
|
self.storage = storage
|
38
47
|
self._name = name
|
39
48
|
|
49
|
+
self._artifact = artifact
|
50
|
+
|
40
51
|
@cached_property
|
41
52
|
def tables(self) -> _TablesAccessor:
|
42
53
|
"""tables of the underlying SpatialData object."""
|
43
|
-
return _TablesAccessor(self.storage["tables"])
|
54
|
+
return _TablesAccessor(self.storage["tables"], self._artifact)
|
44
55
|
|
45
56
|
def __repr__(self):
|
46
57
|
"""Description of the SpatialDataAccessor object."""
|
lamindb/core/storage/_zarr.py
CHANGED
@@ -37,6 +37,9 @@ def get_zarr_store(
|
|
37
37
|
if isinstance(storepath, LocalPathClasses):
|
38
38
|
store = storepath_str
|
39
39
|
elif IS_ZARR_V3:
|
40
|
+
# todo: also check how to treat non-asynchronous filesystems
|
41
|
+
# zarr has something for this, using fsspec async wrapper
|
42
|
+
# check FsspecStore code
|
40
43
|
store = zarr.storage.FsspecStore.from_upath(UPath(storepath, asynchronous=True))
|
41
44
|
else:
|
42
45
|
store = create_mapper(storepath.fs, storepath_str, check=check, create=create)
|
lamindb/curators/_legacy.py
CHANGED
@@ -133,7 +133,7 @@ class CatManager:
|
|
133
133
|
|
134
134
|
if self._artifact is None:
|
135
135
|
if isinstance(self._dataset, pd.DataFrame):
|
136
|
-
artifact = Artifact.
|
136
|
+
artifact = Artifact.from_dataframe(
|
137
137
|
self._dataset,
|
138
138
|
key=key,
|
139
139
|
description=description,
|
@@ -1275,7 +1275,7 @@ class TiledbsomaCatManager(CatManager):
|
|
1275
1275
|
empty_dict, schema=self._obs_pa_schema
|
1276
1276
|
).to_pandas()
|
1277
1277
|
# in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
|
1278
|
-
feature_sets["obs"] = Schema.
|
1278
|
+
feature_sets["obs"] = Schema.from_dataframe(
|
1279
1279
|
df=mock_df,
|
1280
1280
|
field=self._columns_field,
|
1281
1281
|
mute=True,
|
@@ -1367,7 +1367,7 @@ def legacy_annotate_artifact(
|
|
1367
1367
|
|
1368
1368
|
|
1369
1369
|
@classmethod # type: ignore
|
1370
|
-
def
|
1370
|
+
def from_dataframe(
|
1371
1371
|
cls,
|
1372
1372
|
df: pd.DataFrame,
|
1373
1373
|
categoricals: dict[str, FieldAttr] | None = None,
|
@@ -1383,6 +1383,18 @@ def from_df(
|
|
1383
1383
|
)
|
1384
1384
|
|
1385
1385
|
|
1386
|
+
@classmethod # type: ignore
|
1387
|
+
@deprecated("from_dataframe")
|
1388
|
+
def from_df(
|
1389
|
+
cls,
|
1390
|
+
df: pd.DataFrame,
|
1391
|
+
categoricals: dict[str, FieldAttr] | None = None,
|
1392
|
+
columns: FieldAttr = Feature.name,
|
1393
|
+
organism: str | None = None,
|
1394
|
+
) -> DataFrameCatManager:
|
1395
|
+
return cls.from_dataframe(df, categoricals, columns, organism)
|
1396
|
+
|
1397
|
+
|
1386
1398
|
@classmethod # type: ignore
|
1387
1399
|
def from_anndata(
|
1388
1400
|
cls,
|
@@ -1468,6 +1480,7 @@ def from_spatialdata(
|
|
1468
1480
|
)
|
1469
1481
|
|
1470
1482
|
|
1483
|
+
CatManager.from_dataframe = from_dataframe # type: ignore
|
1471
1484
|
CatManager.from_df = from_df # type: ignore
|
1472
1485
|
CatManager.from_anndata = from_anndata # type: ignore
|
1473
1486
|
CatManager.from_mudata = from_mudata # type: ignore
|