lamindb 1.10.1__py3-none-any.whl → 1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55):
  1. lamindb/__init__.py +89 -49
  2. lamindb/_finish.py +17 -15
  3. lamindb/_tracked.py +2 -4
  4. lamindb/_view.py +1 -1
  5. lamindb/base/__init__.py +2 -1
  6. lamindb/base/dtypes.py +76 -0
  7. lamindb/core/_settings.py +45 -2
  8. lamindb/core/storage/_anndata_accessor.py +118 -26
  9. lamindb/core/storage/_backed_access.py +10 -7
  10. lamindb/core/storage/_spatialdata_accessor.py +15 -4
  11. lamindb/core/storage/_zarr.py +3 -0
  12. lamindb/curators/_legacy.py +16 -3
  13. lamindb/curators/core.py +449 -193
  14. lamindb/errors.py +6 -0
  15. lamindb/examples/cellxgene/__init__.py +8 -3
  16. lamindb/examples/cellxgene/_cellxgene.py +127 -13
  17. lamindb/examples/cellxgene/{cxg_schema_versions.csv → cellxgene_schema_versions.csv} +11 -0
  18. lamindb/examples/croissant/__init__.py +32 -6
  19. lamindb/examples/datasets/__init__.py +2 -2
  20. lamindb/examples/datasets/_core.py +9 -2
  21. lamindb/examples/datasets/_small.py +66 -22
  22. lamindb/examples/fixtures/sheets.py +8 -2
  23. lamindb/integrations/_croissant.py +34 -11
  24. lamindb/migrations/0118_alter_recordproject_value_projectrecord.py +99 -0
  25. lamindb/migrations/0119_rename_records_project_linked_in_records.py +26 -0
  26. lamindb/migrations/{0117_squashed.py → 0119_squashed.py} +92 -5
  27. lamindb/migrations/0120_add_record_fk_constraint.py +64 -0
  28. lamindb/migrations/0121_recorduser.py +60 -0
  29. lamindb/models/__init__.py +4 -1
  30. lamindb/models/_describe.py +2 -2
  31. lamindb/models/_feature_manager.py +131 -71
  32. lamindb/models/_from_values.py +2 -2
  33. lamindb/models/_is_versioned.py +4 -4
  34. lamindb/models/_label_manager.py +4 -4
  35. lamindb/models/artifact.py +357 -192
  36. lamindb/models/artifact_set.py +45 -1
  37. lamindb/models/can_curate.py +1 -2
  38. lamindb/models/collection.py +3 -34
  39. lamindb/models/feature.py +111 -7
  40. lamindb/models/has_parents.py +11 -11
  41. lamindb/models/project.py +42 -2
  42. lamindb/models/query_manager.py +16 -7
  43. lamindb/models/query_set.py +191 -78
  44. lamindb/models/record.py +30 -5
  45. lamindb/models/run.py +10 -33
  46. lamindb/models/save.py +6 -8
  47. lamindb/models/schema.py +54 -26
  48. lamindb/models/sqlrecord.py +152 -40
  49. lamindb/models/storage.py +59 -14
  50. lamindb/models/transform.py +17 -17
  51. lamindb/models/ulabel.py +6 -1
  52. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/METADATA +11 -16
  53. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/RECORD +55 -50
  54. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/LICENSE +0 -0
  55. {lamindb-1.10.1.dist-info → lamindb-1.11.0.dist-info}/WHEEL +0 -0
@@ -13,12 +13,17 @@ from anndata import __version__ as anndata_version
13
13
  from anndata._core.index import _normalize_indices
14
14
  from anndata._core.views import _resolve_idx
15
15
  from anndata._io.h5ad import read_dataframe_legacy as read_dataframe_legacy_h5
16
- from anndata._io.specs.registry import get_spec, read_elem, read_elem_partial
16
+ from anndata._io.specs.registry import (
17
+ get_spec,
18
+ read_elem,
19
+ read_elem_partial,
20
+ write_elem,
21
+ )
17
22
  from anndata.compat import _read_attr
18
23
  from fsspec.implementations.local import LocalFileSystem
19
24
  from fsspec.utils import infer_compression
20
25
  from lamin_utils import logger
21
- from lamindb_setup.core.upath import infer_filesystem
26
+ from lamindb_setup.core.upath import S3FSMap, infer_filesystem
22
27
  from packaging import version
23
28
  from upath import UPath
24
29
 
@@ -28,6 +33,8 @@ if TYPE_CHECKING:
28
33
  from fsspec.core import OpenFile
29
34
  from lamindb_setup.types import UPathStr
30
35
 
36
+ from lamindb import Artifact
37
+
31
38
 
32
39
  anndata_version_parse = version.parse(anndata_version)
33
40
 
@@ -288,7 +295,7 @@ except ImportError:
288
295
  if ZARR_INSTALLED:
289
296
  from anndata._io.zarr import read_dataframe_legacy as read_dataframe_legacy_zarr
290
297
 
291
- from ._zarr import get_zarr_store
298
+ from ._zarr import IS_ZARR_V3, get_zarr_store
292
299
 
293
300
  ArrayTypes.append(zarr.Array)
294
301
  GroupTypes.append(zarr.Group)
@@ -299,7 +306,18 @@ if ZARR_INSTALLED:
299
306
  assert mode in {"r", "r+", "a", "w", "w-"}, f"Unknown mode {mode}!" # noqa: S101
300
307
 
301
308
  store = get_zarr_store(filepath)
302
- storage = zarr.open(store, mode=mode)
309
+ kwargs = {}
310
+ if IS_ZARR_V3 and mode != "r":
311
+ # otherwise unable to write
312
+ kwargs["use_consolidated"] = False
313
+ storage = zarr.open(store, mode=mode, **kwargs)
314
+ # zarr v2 re-initializes the mapper
315
+ # we need to put back the correct one
316
+ # S3FSMap is returned from get_zarr_store only for zarr v2
317
+ if isinstance(store, S3FSMap):
318
+ assert not IS_ZARR_V3 # noqa: S101
319
+
320
+ storage.store.map = store
303
321
  conn = None
304
322
  return conn, storage
305
323
 
@@ -351,10 +369,10 @@ if ZARR_INSTALLED:
351
369
  # this is needed because accessing zarr.Group.keys() directly is very slow
352
370
  @registry.register("zarr")
353
371
  def keys(storage: zarr.Group):
354
- if hasattr(storage, "_sync_iter"): # zarr v3
372
+ if IS_ZARR_V3:
355
373
  paths = storage._sync_iter(storage.store.list())
356
374
  else:
357
- paths = storage.store.keys() # zarr v2
375
+ paths = storage.store.keys()
358
376
 
359
377
  attrs_keys: dict[str, list] = {}
360
378
  obs_var_arrays = []
@@ -438,9 +456,15 @@ def _try_backed_full(elem):
438
456
  return read_elem(elem)
439
457
 
440
458
 
459
+ def _to_index(elem: np.ndarray):
460
+ if elem.dtype in (np.float64, np.int64):
461
+ elem = elem.astype(str)
462
+ return pd.Index(elem)
463
+
464
+
441
465
  def _safer_read_index(elem):
442
466
  if isinstance(elem, GroupTypes):
443
- return pd.Index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
467
+ return _to_index(read_elem(elem[_read_attr(elem.attrs, "_index")]))
444
468
  elif isinstance(elem, ArrayTypes):
445
469
  indices = None
446
470
  for index_name in ("index", "_index"):
@@ -450,7 +474,7 @@ def _safer_read_index(elem):
450
474
  if indices is not None and len(indices) > 0:
451
475
  if isinstance(indices[0], bytes):
452
476
  indices = np.frompyfunc(lambda x: x.decode("utf-8"), 1, 1)(indices)
453
- return pd.Index(indices)
477
+ return _to_index(indices)
454
478
  else:
455
479
  raise ValueError("Indices not found.")
456
480
  else:
@@ -479,33 +503,40 @@ class _MapAccessor:
479
503
  return descr
480
504
 
481
505
 
506
+ def _safer_read_df(elem, indices=None):
507
+ if indices is not None:
508
+ obj = registry.safer_read_partial(elem, indices=indices)
509
+ df = _records_to_df(obj)
510
+ else:
511
+ df = registry.read_dataframe(elem)
512
+ if df.index.dtype in (np.float64, np.int64):
513
+ df.index = df.index.astype(str)
514
+ return df
515
+
516
+
482
517
  class _AnnDataAttrsMixin:
483
518
  storage: StorageType
484
519
  _attrs_keys: Mapping[str, list]
485
520
 
486
521
  @cached_property
487
- def obs(self) -> pd.DataFrame:
522
+ def obs(self) -> pd.DataFrame | None:
488
523
  if "obs" not in self._attrs_keys:
489
524
  return None
490
525
  indices = getattr(self, "indices", None)
491
- if indices is not None:
492
- indices = (indices[0], slice(None))
493
- obj = registry.safer_read_partial(self.storage["obs"], indices=indices) # type: ignore
494
- return _records_to_df(obj)
495
- else:
496
- return registry.read_dataframe(self.storage["obs"]) # type: ignore
526
+ return _safer_read_df(
527
+ self.storage["obs"], # type: ignore
528
+ indices=(indices[0], slice(None)) if indices is not None else None,
529
+ )
497
530
 
498
531
  @cached_property
499
- def var(self) -> pd.DataFrame:
532
+ def var(self) -> pd.DataFrame | None:
500
533
  if "var" not in self._attrs_keys:
501
534
  return None
502
535
  indices = getattr(self, "indices", None)
503
- if indices is not None:
504
- indices = (indices[1], slice(None))
505
- obj = registry.safer_read_partial(self.storage["var"], indices=indices) # type: ignore
506
- return _records_to_df(obj)
507
- else:
508
- return registry.read_dataframe(self.storage["var"]) # type: ignore
536
+ return _safer_read_df(
537
+ self.storage["var"], # type: ignore
538
+ indices=(indices[1], slice(None)) if indices is not None else None,
539
+ )
509
540
 
510
541
  @cached_property
511
542
  def uns(self):
@@ -702,6 +733,7 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
702
733
  connection: OpenFile | None,
703
734
  storage: StorageType,
704
735
  filename: str,
736
+ artifact: Artifact | None = None,
705
737
  ):
706
738
  self._conn = connection
707
739
  self.storage = storage
@@ -713,14 +745,43 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
713
745
  self._obs_names = _safer_read_index(self.storage["obs"]) # type: ignore
714
746
  self._var_names = _safer_read_index(self.storage["var"]) # type: ignore
715
747
 
748
+ self._artifact = artifact # save artifact to update in write mode
749
+
750
+ self._updated = False # track updates in r+ mode for zarr
751
+
752
+ self._entered = False # check that the context manager is used
716
753
  self._closed = False
717
754
 
718
755
  def close(self):
719
756
  """Closes the connection."""
720
- if hasattr(self, "storage") and hasattr(self.storage, "close"):
721
- self.storage.close()
722
- if hasattr(self, "_conn") and hasattr(self._conn, "close"):
723
- self._conn.close()
757
+ storage = self.storage
758
+ connection = self._conn
759
+
760
+ if self._updated and (artifact := self._artifact) is not None:
761
+ from lamindb.models.artifact import Artifact
762
+ from lamindb.models.sqlrecord import init_self_from_db
763
+
764
+ # now self._updated can only be True for zarr
765
+ assert ZARR_INSTALLED # noqa: S101
766
+
767
+ store = storage.store
768
+ keys = storage._sync_iter(store.list()) if IS_ZARR_V3 else store.keys()
769
+ # this checks that there consolidated metadata was written before
770
+ # need to update it
771
+ # zmetadata is in spatialdata sometimes for some reason
772
+ if ".zmetadata" in keys or "zmetadata" in keys:
773
+ zarr.consolidate_metadata(store)
774
+
775
+ new_version = Artifact(
776
+ artifact.path, revises=artifact, _is_internal_call=True
777
+ ).save()
778
+ # note: sets _state.db = "default"
779
+ init_self_from_db(artifact, new_version)
780
+
781
+ if hasattr(storage, "close"):
782
+ storage.close()
783
+ if hasattr(connection, "close"):
784
+ connection.close()
724
785
  self._closed = True
725
786
 
726
787
  @property
@@ -728,6 +789,8 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
728
789
  return self._closed
729
790
 
730
791
  def __enter__(self):
792
+ self._entered = True
793
+
731
794
  return self
732
795
 
733
796
  def __exit__(self, exc_type, exc_val, exc_tb):
@@ -763,6 +826,35 @@ class AnnDataAccessor(_AnnDataAttrsMixin):
763
826
  self.storage["raw"], None, None, self._obs_names, None, self.shape[0]
764
827
  )
765
828
 
829
+ def add_column(
830
+ self,
831
+ where: Literal["obs", "var"],
832
+ col_name: str,
833
+ col: np.ndarray | pd.Categorical,
834
+ ):
835
+ """Add a new column to .obs or .var of the underlying AnnData object."""
836
+ df_store = self.storage[where] # type: ignore
837
+ if getattr(df_store, "read_only", True):
838
+ raise ValueError(
839
+ "You can use .add_column(...) only with zarr in a writable mode."
840
+ )
841
+ write_elem(df_store, col_name, col)
842
+ df_store.attrs["column-order"] = df_store.attrs["column-order"] + [col_name]
843
+ # remind only once if this wasn't updated before and not in the context manager
844
+ if not self._updated and not self._entered and self._artifact is not None:
845
+ logger.important(
846
+ "Do not forget to call .close() after you finish "
847
+ f"working with this accessor for {self._name} "
848
+ "to automatically update the corresponding artifact."
849
+ )
850
+
851
+ self._updated = True
852
+ # reset the cached property
853
+ # todo: maybe just append the column if the df was already loaded
854
+ self.__dict__.pop(where, None)
855
+ # update the cached columns
856
+ self._attrs_keys[where].append(col_name)
857
+
766
858
 
767
859
  # get the number of observations in an anndata object or file fast and safely
768
860
  def _anndata_n_observations(object: UPathStr | AnnData) -> int | None:
@@ -4,6 +4,7 @@ from dataclasses import dataclass
4
4
  from pathlib import Path
5
5
  from typing import TYPE_CHECKING, Any, Callable, Literal
6
6
 
7
+ import h5py
7
8
  from anndata._io.specs.registry import get_spec
8
9
 
9
10
  from ._anndata_accessor import AnnDataAccessor, StorageType, registry
@@ -92,10 +93,10 @@ def backed_access(
92
93
  from lamindb.models import Artifact
93
94
 
94
95
  if isinstance(artifact_or_filepath, Artifact):
95
- objectpath, _ = filepath_from_artifact(
96
- artifact_or_filepath, using_key=using_key
97
- )
96
+ artifact = artifact_or_filepath
97
+ objectpath, _ = filepath_from_artifact(artifact, using_key=using_key)
98
98
  else:
99
+ artifact = None
99
100
  objectpath = artifact_or_filepath
100
101
  name = objectpath.name
101
102
  # ignore .gz, only check the real suffix
@@ -111,9 +112,11 @@ def backed_access(
111
112
  elif suffix in {".h5", ".hdf5", ".h5ad"}:
112
113
  conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
113
114
  elif suffix == ".zarr":
115
+ if mode not in {"r", "r+"}:
116
+ raise ValueError("`mode` should be either 'r' or 'r+' for zarr.")
114
117
  conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
115
118
  if "spatialdata_attrs" in storage.attrs:
116
- return SpatialDataAccessor(storage, name)
119
+ return SpatialDataAccessor(storage, name, artifact)
117
120
  elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
118
121
  df_suffix := df_suffixes.pop()
119
122
  ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
@@ -127,9 +130,9 @@ def backed_access(
127
130
 
128
131
  is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
129
132
  if is_anndata:
130
- if mode != "r":
131
- raise ValueError("Can only access `AnnData` with mode='r'.")
132
- return AnnDataAccessor(conn, storage, name)
133
+ if mode != "r" and isinstance(storage, h5py.Group):
134
+ raise ValueError("Can only access `hdf5` `AnnData` with mode='r'.")
135
+ return AnnDataAccessor(conn, storage, name, artifact)
133
136
  else:
134
137
  return BackedAccessor(conn, storage)
135
138
 
@@ -8,13 +8,22 @@ from ._anndata_accessor import AnnDataAccessor
8
8
  if TYPE_CHECKING:
9
9
  from zarr import Group
10
10
 
11
+ from lamindb import Artifact
12
+
11
13
 
12
14
  class _TablesAccessor:
13
- def __init__(self, tables: Group):
15
+ def __init__(self, tables: Group, artifact: Artifact | None = None):
14
16
  self._tables = tables
15
17
 
18
+ self._artifact = artifact
19
+
16
20
  def __getitem__(self, key: str) -> AnnDataAccessor:
17
- return AnnDataAccessor(connection=None, storage=self._tables[key], filename=key)
21
+ return AnnDataAccessor(
22
+ connection=None,
23
+ storage=self._tables[key],
24
+ filename=key,
25
+ artifact=self._artifact,
26
+ )
18
27
 
19
28
  def keys(self) -> list[str]:
20
29
  return list(self._tables.keys())
@@ -33,14 +42,16 @@ class SpatialDataAccessor:
33
42
  For now only allows to access `tables`.
34
43
  """
35
44
 
36
- def __init__(self, storage: Group, name: str):
45
+ def __init__(self, storage: Group, name: str, artifact: Artifact | None = None):
37
46
  self.storage = storage
38
47
  self._name = name
39
48
 
49
+ self._artifact = artifact
50
+
40
51
  @cached_property
41
52
  def tables(self) -> _TablesAccessor:
42
53
  """tables of the underlying SpatialData object."""
43
- return _TablesAccessor(self.storage["tables"])
54
+ return _TablesAccessor(self.storage["tables"], self._artifact)
44
55
 
45
56
  def __repr__(self):
46
57
  """Description of the SpatialDataAccessor object."""
@@ -37,6 +37,9 @@ def get_zarr_store(
37
37
  if isinstance(storepath, LocalPathClasses):
38
38
  store = storepath_str
39
39
  elif IS_ZARR_V3:
40
+ # todo: also check how to treat non-asynchronous filesystems
41
+ # zarr has something for this, using fsspec async wrapper
42
+ # check FsspecStore code
40
43
  store = zarr.storage.FsspecStore.from_upath(UPath(storepath, asynchronous=True))
41
44
  else:
42
45
  store = create_mapper(storepath.fs, storepath_str, check=check, create=create)
@@ -133,7 +133,7 @@ class CatManager:
133
133
 
134
134
  if self._artifact is None:
135
135
  if isinstance(self._dataset, pd.DataFrame):
136
- artifact = Artifact.from_df(
136
+ artifact = Artifact.from_dataframe(
137
137
  self._dataset,
138
138
  key=key,
139
139
  description=description,
@@ -1275,7 +1275,7 @@ class TiledbsomaCatManager(CatManager):
1275
1275
  empty_dict, schema=self._obs_pa_schema
1276
1276
  ).to_pandas()
1277
1277
  # in parallel to https://github.com/laminlabs/lamindb/blob/2a1709990b5736b480c6de49c0ada47fafc8b18d/lamindb/core/_feature_manager.py#L549-L554
1278
- feature_sets["obs"] = Schema.from_df(
1278
+ feature_sets["obs"] = Schema.from_dataframe(
1279
1279
  df=mock_df,
1280
1280
  field=self._columns_field,
1281
1281
  mute=True,
@@ -1367,7 +1367,7 @@ def legacy_annotate_artifact(
1367
1367
 
1368
1368
 
1369
1369
  @classmethod # type: ignore
1370
- def from_df(
1370
+ def from_dataframe(
1371
1371
  cls,
1372
1372
  df: pd.DataFrame,
1373
1373
  categoricals: dict[str, FieldAttr] | None = None,
@@ -1383,6 +1383,18 @@ def from_df(
1383
1383
  )
1384
1384
 
1385
1385
 
1386
+ @classmethod # type: ignore
1387
+ @deprecated("from_dataframe")
1388
+ def from_df(
1389
+ cls,
1390
+ df: pd.DataFrame,
1391
+ categoricals: dict[str, FieldAttr] | None = None,
1392
+ columns: FieldAttr = Feature.name,
1393
+ organism: str | None = None,
1394
+ ) -> DataFrameCatManager:
1395
+ return cls.from_dataframe(df, categoricals, columns, organism)
1396
+
1397
+
1386
1398
  @classmethod # type: ignore
1387
1399
  def from_anndata(
1388
1400
  cls,
@@ -1468,6 +1480,7 @@ def from_spatialdata(
1468
1480
  )
1469
1481
 
1470
1482
 
1483
+ CatManager.from_dataframe = from_dataframe # type: ignore
1471
1484
  CatManager.from_df = from_df # type: ignore
1472
1485
  CatManager.from_anndata = from_anndata # type: ignore
1473
1486
  CatManager.from_mudata = from_mudata # type: ignore