lamindb 1.1.0__py3-none-any.whl → 1.2a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. lamindb/__init__.py +31 -26
  2. lamindb/_finish.py +9 -1
  3. lamindb/_tracked.py +26 -3
  4. lamindb/_view.py +2 -3
  5. lamindb/base/__init__.py +1 -1
  6. lamindb/base/ids.py +1 -10
  7. lamindb/base/users.py +1 -4
  8. lamindb/core/__init__.py +7 -65
  9. lamindb/core/_context.py +41 -10
  10. lamindb/core/_mapped_collection.py +4 -2
  11. lamindb/core/_settings.py +6 -6
  12. lamindb/core/_sync_git.py +1 -1
  13. lamindb/core/_track_environment.py +2 -1
  14. lamindb/core/datasets/_small.py +3 -3
  15. lamindb/core/loaders.py +22 -9
  16. lamindb/core/storage/_anndata_accessor.py +8 -3
  17. lamindb/core/storage/_backed_access.py +14 -7
  18. lamindb/core/storage/_pyarrow_dataset.py +24 -9
  19. lamindb/core/storage/_tiledbsoma.py +6 -4
  20. lamindb/core/storage/_zarr.py +32 -11
  21. lamindb/core/storage/objects.py +59 -26
  22. lamindb/core/storage/paths.py +16 -13
  23. lamindb/curators/__init__.py +173 -145
  24. lamindb/errors.py +1 -1
  25. lamindb/integrations/_vitessce.py +4 -4
  26. lamindb/migrations/0089_subsequent_runs.py +159 -0
  27. lamindb/migrations/0090_runproject_project_runs.py +73 -0
  28. lamindb/migrations/{0088_squashed.py → 0090_squashed.py} +245 -177
  29. lamindb/models/__init__.py +79 -0
  30. lamindb/{core → models}/_describe.py +3 -3
  31. lamindb/{core → models}/_django.py +8 -5
  32. lamindb/{core → models}/_feature_manager.py +103 -87
  33. lamindb/{_from_values.py → models/_from_values.py} +5 -2
  34. lamindb/{core/versioning.py → models/_is_versioned.py} +94 -6
  35. lamindb/{core → models}/_label_manager.py +10 -17
  36. lamindb/{core/relations.py → models/_relations.py} +8 -1
  37. lamindb/models/artifact.py +2601 -0
  38. lamindb/{_can_curate.py → models/can_curate.py} +349 -180
  39. lamindb/models/collection.py +683 -0
  40. lamindb/models/core.py +135 -0
  41. lamindb/models/feature.py +643 -0
  42. lamindb/models/flextable.py +163 -0
  43. lamindb/{_parents.py → models/has_parents.py} +55 -49
  44. lamindb/models/project.py +384 -0
  45. lamindb/{_query_manager.py → models/query_manager.py} +10 -8
  46. lamindb/{_query_set.py → models/query_set.py} +52 -30
  47. lamindb/models/record.py +1757 -0
  48. lamindb/models/run.py +563 -0
  49. lamindb/{_save.py → models/save.py} +18 -8
  50. lamindb/models/schema.py +732 -0
  51. lamindb/models/transform.py +360 -0
  52. lamindb/models/ulabel.py +249 -0
  53. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/METADATA +5 -5
  54. lamindb-1.2a2.dist-info/RECORD +94 -0
  55. lamindb/_artifact.py +0 -1361
  56. lamindb/_collection.py +0 -440
  57. lamindb/_feature.py +0 -316
  58. lamindb/_is_versioned.py +0 -40
  59. lamindb/_record.py +0 -1065
  60. lamindb/_run.py +0 -60
  61. lamindb/_schema.py +0 -347
  62. lamindb/_storage.py +0 -15
  63. lamindb/_transform.py +0 -170
  64. lamindb/_ulabel.py +0 -56
  65. lamindb/_utils.py +0 -9
  66. lamindb/base/validation.py +0 -63
  67. lamindb/core/_data.py +0 -491
  68. lamindb/core/fields.py +0 -12
  69. lamindb/models.py +0 -4435
  70. lamindb-1.1.0.dist-info/RECORD +0 -95
  71. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/LICENSE +0 -0
  72. {lamindb-1.1.0.dist-info → lamindb-1.2a2.dist-info}/WHEEL +0 -0
@@ -5,8 +5,6 @@ from typing import TYPE_CHECKING, Any, Callable
5
5
 
6
6
  from anndata._io.specs.registry import get_spec
7
7
 
8
- from lamindb.models import Artifact
9
-
10
8
  from ._anndata_accessor import AnnDataAccessor, StorageType, registry
11
9
  from ._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
12
10
  from ._tiledbsoma import _open_tiledbsoma
@@ -19,6 +17,8 @@ if TYPE_CHECKING:
19
17
  from tiledbsoma import Experiment as SOMAExperiment
20
18
  from upath import UPath
21
19
 
20
+ from lamindb.models.artifact import Artifact
21
+
22
22
 
23
23
  # this dynamically creates a subclass of a context manager class
24
24
  # and reassigns it to an instance of the superclass
@@ -70,9 +70,12 @@ def backed_access(
70
70
  artifact_or_filepath: Artifact | UPath,
71
71
  mode: str = "r",
72
72
  using_key: str | None = None,
73
+ **kwargs,
73
74
  ) -> (
74
75
  AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
75
76
  ):
77
+ from lamindb.models import Artifact
78
+
76
79
  if isinstance(artifact_or_filepath, Artifact):
77
80
  objectpath, _ = filepath_from_artifact(
78
81
  artifact_or_filepath, using_key=using_key
@@ -80,18 +83,22 @@ def backed_access(
80
83
  else:
81
84
  objectpath = artifact_or_filepath
82
85
  name = objectpath.name
83
- suffix = objectpath.suffix
86
+ # ignore .gz, only check the real suffix
87
+ suffixes = objectpath.suffixes
88
+ suffix = (
89
+ suffixes[-2] if len(suffixes) > 1 and ".gz" in suffixes else objectpath.suffix
90
+ )
84
91
 
85
92
  if name == "soma" or suffix == ".tiledbsoma":
86
93
  if mode not in {"r", "w"}:
87
94
  raise ValueError("`mode` should be either 'r' or 'w' for tiledbsoma.")
88
- return _open_tiledbsoma(objectpath, mode=mode) # type: ignore
95
+ return _open_tiledbsoma(objectpath, mode=mode, **kwargs) # type: ignore
89
96
  elif suffix in {".h5", ".hdf5", ".h5ad"}:
90
- conn, storage = registry.open("h5py", objectpath, mode=mode)
97
+ conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
91
98
  elif suffix == ".zarr":
92
- conn, storage = registry.open("zarr", objectpath, mode=mode)
99
+ conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
93
100
  elif _is_pyarrow_dataset(objectpath):
94
- return _open_pyarrow_dataset(objectpath)
101
+ return _open_pyarrow_dataset(objectpath, **kwargs)
95
102
  else:
96
103
  raise ValueError(
97
104
  "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
@@ -18,15 +18,30 @@ def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
18
18
  # we don't check here that the filesystem is the same
19
19
  # but this is a requirement for pyarrow.dataset.dataset
20
20
  if isinstance(paths, list):
21
- suffixes = {path.suffix for path in paths}
22
- elif paths.is_file():
23
- suffixes = {paths.suffix}
21
+ path_list = paths
22
+ elif paths.is_dir():
23
+ path_list = [path for path in paths.rglob("*") if path.suffix != ""]
24
24
  else:
25
- suffixes = {path.suffix for path in paths.rglob("*") if path.suffix != ""}
26
- return len(suffixes) == 1 and suffixes.pop() in PYARROW_SUFFIXES
27
-
28
-
29
- def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
25
+ path_list = [paths]
26
+ suffix = None
27
+ for path in path_list:
28
+ path_suffixes = path.suffixes
29
+ # this doesn't work for externally gzipped files, REMOVE LATER
30
+ path_suffix = (
31
+ path_suffixes[-2]
32
+ if len(path_suffixes) > 1 and ".gz" in path_suffixes
33
+ else path.suffix
34
+ )
35
+ if path_suffix not in PYARROW_SUFFIXES:
36
+ return False
37
+ elif suffix is None:
38
+ suffix = path_suffix
39
+ elif path_suffix != suffix:
40
+ return False
41
+ return True
42
+
43
+
44
+ def _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset:
30
45
  if isinstance(paths, list):
31
46
  path0 = paths[0]
32
47
  if isinstance(path0, LocalPathClasses):
@@ -38,4 +53,4 @@ def _open_pyarrow_dataset(paths: UPath | list[UPath]) -> PyArrowDataset:
38
53
  else:
39
54
  paths_str, filesystem = paths.path, paths.fs
40
55
 
41
- return pyarrow.dataset.dataset(paths_str, filesystem=filesystem)
56
+ return pyarrow.dataset.dataset(paths_str, filesystem=filesystem, **kwargs)
@@ -12,8 +12,6 @@ from lamindb_setup.core._settings_storage import get_storage_region
12
12
  from lamindb_setup.core.upath import LocalPathClasses, create_path
13
13
  from packaging import version
14
14
 
15
- from lamindb.models import Artifact, Run
16
-
17
15
  if TYPE_CHECKING:
18
16
  from lamindb_setup.core.types import UPathStr
19
17
  from tiledbsoma import Collection as SOMACollection
@@ -21,6 +19,9 @@ if TYPE_CHECKING:
21
19
  from tiledbsoma import Measurement as SOMAMeasurement
22
20
  from upath import UPath
23
21
 
22
+ from lamindb.models.artifact import Artifact
23
+ from lamindb.models.run import Run
24
+
24
25
 
25
26
  def _load_h5ad_zarr(objpath: UPath):
26
27
  from lamindb.core.loaders import load_anndata_zarr, load_h5ad
@@ -134,9 +135,10 @@ def save_tiledbsoma_experiment(
134
135
  except ImportError as e:
135
136
  raise ImportError("Please install tiledbsoma: pip install tiledbsoma") from e
136
137
 
137
- from lamindb.core._data import get_run
138
138
  from lamindb.core.storage.paths import auto_storage_key_from_artifact_uid
139
- from lamindb.core.versioning import create_uid
139
+ from lamindb.models import Artifact
140
+ from lamindb.models._is_versioned import create_uid
141
+ from lamindb.models.artifact import get_run
140
142
 
141
143
  run = get_run(run)
142
144
 
@@ -1,14 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import warnings
4
- from typing import TYPE_CHECKING
4
+ from typing import TYPE_CHECKING, Literal
5
5
 
6
6
  import scipy.sparse as sparse
7
7
  import zarr
8
8
  from anndata import __version__ as anndata_version
9
9
  from anndata._io.specs import write_elem
10
- from anndata._io.specs.registry import get_spec
11
10
  from fsspec.implementations.local import LocalFileSystem
11
+ from lamin_utils import logger
12
12
  from lamindb_setup.core.upath import create_mapper, infer_filesystem
13
13
  from packaging import version
14
14
 
@@ -25,15 +25,36 @@ if TYPE_CHECKING:
25
25
  from lamindb_setup.core.types import UPathStr
26
26
 
27
27
 
28
- def zarr_is_adata(storepath: UPathStr) -> bool:
28
+ def identify_zarr_type(
29
+ storepath: UPathStr, *, check: bool = True
30
+ ) -> Literal["anndata", "spatialdata", "unknown"]:
31
+ """Identify whether a zarr store is AnnData, SpatialData, or unknown type."""
32
+ # we can add these cheap suffix-based-checks later
33
+ # also need to check whether the .spatialdata.zarr suffix
34
+ # actually becomes a "standard"; currently we don't recognize it
35
+ # unlike ".anndata.zarr" in VALID_SUFFIXES
36
+ # suffixes = UPath(storepath).suffixes
37
+ # if ".spatialdata" in suffixes:
38
+ # return "spatialdata"
39
+ # elif ".anndata" in suffixes:
40
+ # return "anndata"
41
+
29
42
  fs, storepath_str = infer_filesystem(storepath)
43
+
30
44
  if isinstance(fs, LocalFileSystem):
31
- # this is faster than through an fsspec mapper for local
32
45
  open_obj = storepath_str
33
46
  else:
34
- open_obj = create_mapper(fs, storepath_str, check=True)
35
- storage = zarr.open(open_obj, mode="r")
36
- return get_spec(storage).encoding_type == "anndata"
47
+ open_obj = create_mapper(fs, storepath_str, check=check)
48
+
49
+ try:
50
+ storage = zarr.open(open_obj, mode="r")
51
+ if "spatialdata_attrs" in storage.attrs:
52
+ return "spatialdata"
53
+ if storage.attrs.get("encoding-type", "") == "anndata":
54
+ return "anndata"
55
+ except Exception as error:
56
+ logger.warning(f"an exception occured {error}")
57
+ return "unknown"
37
58
 
38
59
 
39
60
  def load_anndata_zarr(storepath: UPathStr) -> AnnData:
@@ -65,7 +86,7 @@ def write_adata_zarr(
65
86
  adata_size = None
66
87
  cumulative_val = 0
67
88
 
68
- def _cb(key_write: str | None = None):
89
+ def _report_progress(key_write: str | None = None):
69
90
  nonlocal adata_size
70
91
  nonlocal cumulative_val
71
92
 
@@ -91,9 +112,9 @@ def write_adata_zarr(
91
112
 
92
113
  def _write_elem_cb(f, k, elem, dataset_kwargs):
93
114
  write_elem(f, k, elem, dataset_kwargs=dataset_kwargs)
94
- _cb(k)
115
+ _report_progress(k)
95
116
 
96
- _cb(None)
117
+ _report_progress(None)
97
118
  with warnings.catch_warnings():
98
119
  warnings.filterwarnings("ignore", category=UserWarning, module="zarr")
99
120
 
@@ -114,4 +135,4 @@ def write_adata_zarr(
114
135
  )
115
136
  _write_elem_cb(f, "raw", adata.raw, dataset_kwargs=dataset_kwargs)
116
137
  # todo: fix size less than total at the end
117
- _cb(None)
138
+ _report_progress(None)
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from pathlib import PurePosixPath
4
- from typing import TYPE_CHECKING
4
+ from typing import TYPE_CHECKING, TypeAlias, TypeVar
5
5
 
6
6
  from anndata import AnnData
7
7
  from pandas import DataFrame
@@ -9,54 +9,87 @@ from pandas import DataFrame
9
9
  if TYPE_CHECKING:
10
10
  from lamindb_setup.core.types import UPathStr
11
11
 
12
+ SpatialData = TypeVar("SpatialData")
13
+ MuData = TypeVar("MuData")
12
14
 
13
- def _mudata_is_installed():
14
- try:
15
- import mudata # noqa: F401c
16
- except ImportError:
17
- return False
18
- return True
15
+ SupportedDataTypes: TypeAlias = AnnData | DataFrame | MuData | SpatialData
19
16
 
20
17
 
21
- def infer_suffix(dmem, adata_format: str | None = None):
18
+ def is_package_installed(package_name):
19
+ import importlib.util
20
+
21
+ spec = importlib.util.find_spec(package_name)
22
+ return spec is not None
23
+
24
+
25
+ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
22
26
  """Infer LaminDB storage file suffix from a data object."""
23
27
  if isinstance(dmem, AnnData):
24
- if adata_format is not None:
25
- if adata_format not in {"h5ad", "zarr", "anndata.zarr"}:
28
+ if format is not None:
29
+ if format not in {"h5ad", "zarr", "anndata.zarr"}:
26
30
  raise ValueError(
27
31
  "Error when specifying AnnData storage format, it should be"
28
- f" 'h5ad', 'zarr', not '{adata_format}'. Check 'format'"
32
+ f" 'h5ad', 'zarr', not '{format}'. Check 'format'"
29
33
  " or the suffix of 'key'."
30
34
  )
31
- return "." + adata_format
35
+ return "." + format
32
36
  return ".h5ad"
33
- elif isinstance(dmem, DataFrame):
37
+
38
+ if isinstance(dmem, DataFrame):
34
39
  return ".parquet"
35
- else:
36
- if _mudata_is_installed():
37
- from mudata import MuData
38
40
 
39
- if isinstance(dmem, MuData):
40
- return ".h5mu"
41
+ if is_package_installed("mudata"):
42
+ from mudata import MuData
43
+
44
+ if isinstance(dmem, MuData):
45
+ return ".h5mu"
46
+
47
+ if is_package_installed("spatialdata"):
48
+ from spatialdata import SpatialData
49
+
50
+ if isinstance(dmem, SpatialData):
51
+ if format is not None:
52
+ if format not in {"spatialdata.zarr"}:
53
+ raise ValueError(
54
+ "Error when specifying SpatialData storage format, it should be"
55
+ f" 'zarr', 'spatialdata.zarr', not '{format}'. Check 'format'"
56
+ " or the suffix of 'key'."
57
+ )
58
+ return "." + format
59
+ return ".zarr"
60
+ else:
41
61
  raise NotImplementedError
42
62
 
43
63
 
44
- def write_to_disk(dmem, filepath: UPathStr):
64
+ def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
65
+ """Writes the passed in memory data to disk to a specified path."""
45
66
  if isinstance(dmem, AnnData):
46
67
  suffix = PurePosixPath(filepath).suffix
47
68
  if suffix == ".h5ad":
48
69
  dmem.write_h5ad(filepath)
70
+ return
49
71
  elif suffix == ".zarr":
50
72
  dmem.write_zarr(filepath)
73
+ return
51
74
  else:
52
75
  raise NotImplementedError
53
- elif isinstance(dmem, DataFrame):
76
+
77
+ if isinstance(dmem, DataFrame):
54
78
  dmem.to_parquet(filepath)
55
- else:
56
- if _mudata_is_installed():
57
- from mudata import MuData
79
+ return
80
+
81
+ if is_package_installed("mudata"):
82
+ from mudata import MuData
58
83
 
59
- if isinstance(dmem, MuData):
60
- dmem.write(filepath)
61
- return
84
+ if isinstance(dmem, MuData):
85
+ dmem.write(filepath)
86
+ return
87
+
88
+ if is_package_installed("spatialdata"):
89
+ from spatialdata import SpatialData
90
+
91
+ if isinstance(dmem, SpatialData):
92
+ dmem.write(filepath, overwrite=True)
93
+ return
94
+ else:
62
95
  raise NotImplementedError
@@ -11,13 +11,14 @@ from lamindb_setup.core.upath import (
11
11
  )
12
12
 
13
13
  from lamindb.core._settings import settings
14
- from lamindb.models import Artifact, Storage
15
14
 
16
15
  if TYPE_CHECKING:
17
16
  from pathlib import Path
18
17
 
19
18
  from lamindb_setup.core.types import UPathStr
20
19
 
20
+ from lamindb.models.artifact import Artifact
21
+
21
22
 
22
23
  AUTO_KEY_PREFIX = ".lamindb/"
23
24
 
@@ -41,24 +42,21 @@ def auto_storage_key_from_artifact_uid(uid: str, suffix: str, is_dir: bool) -> s
41
42
  return storage_key
42
43
 
43
44
 
44
- def _safely_resolve(upath: UPath) -> UPath:
45
- if upath.protocol in {"http", "https"}:
46
- resolve_kwargs = {"follow_redirects": False}
47
- else:
48
- resolve_kwargs = {}
49
- return upath.resolve(**resolve_kwargs)
50
-
51
-
52
45
  def check_path_is_child_of_root(path: UPathStr, root: UPathStr) -> bool:
53
46
  if fsspec.utils.get_protocol(str(path)) != fsspec.utils.get_protocol(str(root)):
54
47
  return False
55
- path_upath = _safely_resolve(UPath(path))
56
- root_upath = _safely_resolve(UPath(root))
48
+ path_upath = UPath(path)
49
+ root_upath = UPath(root)
57
50
  if path_upath.protocol == "s3":
58
51
  endpoint_path = path_upath.storage_options.get("endpoint_url", "")
59
52
  endpoint_root = root_upath.storage_options.get("endpoint_url", "")
60
53
  if endpoint_path != endpoint_root:
61
54
  return False
55
+ # we don't resolve http links because they can resolve into a different domain
56
+ # for example into a temporary url
57
+ if path_upath.protocol not in {"http", "https"}:
58
+ path_upath = path_upath.resolve()
59
+ root_upath = root_upath.resolve()
62
60
  # str is needed to eliminate UPath storage_options
63
61
  # which affect equality checks
64
62
  return UPath(str(root_upath)) in UPath(str(path_upath)).parents
@@ -73,6 +71,8 @@ def attempt_accessing_path(
73
71
  ) -> tuple[UPath, StorageSettings]:
74
72
  # check whether the file is in the default db and whether storage
75
73
  # matches default storage
74
+ from lamindb.models import Storage
75
+
76
76
  if (
77
77
  artifact._state.db in ("default", None)
78
78
  and artifact.storage_id == settings._storage_settings.id
@@ -134,7 +134,7 @@ def filepath_cache_key_from_artifact(
134
134
 
135
135
 
136
136
  def store_file_or_folder(
137
- local_path: UPathStr, storage_path: UPath, print_progress: bool = True
137
+ local_path: UPathStr, storage_path: UPath, print_progress: bool = True, **kwargs
138
138
  ) -> None:
139
139
  """Store file or folder (localpath) at storagepath."""
140
140
  local_path = UPath(local_path)
@@ -155,7 +155,10 @@ def store_file_or_folder(
155
155
  else:
156
156
  create_folder = None
157
157
  storage_path.upload_from(
158
- local_path, create_folder=create_folder, print_progress=print_progress
158
+ local_path,
159
+ create_folder=create_folder,
160
+ print_progress=print_progress,
161
+ **kwargs,
159
162
  )
160
163
  else: # storage path is local
161
164
  if local_path.resolve().as_posix() == storage_path.resolve().as_posix():