lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (57)
  1. lamindb/__init__.py +52 -36
  2. lamindb/_finish.py +17 -10
  3. lamindb/_tracked.py +1 -1
  4. lamindb/base/__init__.py +3 -1
  5. lamindb/base/fields.py +40 -22
  6. lamindb/base/ids.py +1 -94
  7. lamindb/base/types.py +2 -0
  8. lamindb/base/uids.py +117 -0
  9. lamindb/core/_context.py +203 -102
  10. lamindb/core/_settings.py +38 -25
  11. lamindb/core/datasets/__init__.py +11 -4
  12. lamindb/core/datasets/_core.py +5 -5
  13. lamindb/core/datasets/_small.py +0 -93
  14. lamindb/core/datasets/mini_immuno.py +172 -0
  15. lamindb/core/loaders.py +1 -1
  16. lamindb/core/storage/_backed_access.py +100 -6
  17. lamindb/core/storage/_polars_lazy_df.py +51 -0
  18. lamindb/core/storage/_pyarrow_dataset.py +15 -30
  19. lamindb/core/storage/_tiledbsoma.py +29 -13
  20. lamindb/core/storage/objects.py +6 -0
  21. lamindb/core/subsettings/__init__.py +2 -0
  22. lamindb/core/subsettings/_annotation_settings.py +11 -0
  23. lamindb/curators/__init__.py +7 -3349
  24. lamindb/curators/_legacy.py +2056 -0
  25. lamindb/curators/core.py +1534 -0
  26. lamindb/errors.py +11 -0
  27. lamindb/examples/__init__.py +27 -0
  28. lamindb/examples/schemas/__init__.py +12 -0
  29. lamindb/examples/schemas/_anndata.py +25 -0
  30. lamindb/examples/schemas/_simple.py +19 -0
  31. lamindb/integrations/_vitessce.py +8 -5
  32. lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
  33. lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
  34. lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
  35. lamindb/models/__init__.py +4 -1
  36. lamindb/models/_describe.py +21 -4
  37. lamindb/models/_feature_manager.py +382 -287
  38. lamindb/models/_label_manager.py +8 -2
  39. lamindb/models/artifact.py +177 -106
  40. lamindb/models/artifact_set.py +122 -0
  41. lamindb/models/collection.py +73 -52
  42. lamindb/models/core.py +1 -1
  43. lamindb/models/feature.py +51 -17
  44. lamindb/models/has_parents.py +69 -14
  45. lamindb/models/project.py +1 -1
  46. lamindb/models/query_manager.py +221 -22
  47. lamindb/models/query_set.py +247 -172
  48. lamindb/models/record.py +65 -247
  49. lamindb/models/run.py +4 -4
  50. lamindb/models/save.py +8 -2
  51. lamindb/models/schema.py +456 -184
  52. lamindb/models/transform.py +2 -2
  53. lamindb/models/ulabel.py +8 -5
  54. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
  55. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
  56. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
  57. {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
lamindb/core/datasets/_core.py CHANGED
@@ -322,8 +322,6 @@ def anndata_human_immune_cells(
  
      import lamindb as ln
  
-     verbosity = ln.settings.verbosity
-     ln.settings.verbosity = "error"
      ln.save(
          bt.Gene.from_values(
              adata.var.index, field="ensembl_gene_id", organism="human"
@@ -339,7 +337,6 @@ def anndata_human_immune_cells(
      ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
      bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
      ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
-     ln.settings.verbosity = verbosity
      return adata
  
  
@@ -560,11 +557,14 @@ def spatialdata_blobs() -> SpatialData:
      from spatialdata.datasets import blobs
  
      sdata = blobs()
-     sdata.attrs["sample"] = {
-         "assay": "Visium Spatial Gene Expression",
+     sdata.attrs["bio"] = {
          "disease": "Alzheimer disease",
          "developmental_stage": "adult stage",
      }
+     sdata.attrs["tech"] = {
+         "assay": "Visium Spatial Gene Expression",
+     }
+     sdata.attrs["random_int"] = 20
      sdata.tables["table"].var.index = [
          "ENSG00000139618",  # BRCA2
          "ENSG00000157764",  # BRAF
lamindb/core/datasets/_small.py CHANGED
@@ -7,99 +7,6 @@ import numpy as np
  import pandas as pd
  
  
- def small_dataset1(
-     otype: Literal["DataFrame", "AnnData"] = "DataFrame",
-     gene_symbols_in_index: bool = False,
-     with_typo: bool = False,
-     with_cell_type_synonym: bool = False,
-     with_cell_type_typo: bool = False,
- ) -> pd.DataFrame | ad.AnnData:
-     # define the data in the dataset
-     # it's a mix of numerical measurements and observation-level metadata
-     ifng = "IFNJ" if with_typo else "IFNG"
-     if gene_symbols_in_index:
-         var_ids = ["CD8A", "CD4", "CD14"]
-     else:
-         var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
-     abt_cell = (
-         "CD8-pos alpha-beta T cell"
-         if with_cell_type_typo
-         else "CD8-positive, alpha-beta T cell"
-     )
-     dataset_dict = {
-         var_ids[0]: [1, 2, 3],
-         var_ids[1]: [3, 4, 5],
-         var_ids[2]: [5, 6, 7],
-         "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
-         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-         "cell_type_by_expert": pd.Categorical(
-             ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
-         ),
-         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-         "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
-         "concentration": ["0.1%", "200 nM", "0.1%"],
-         "treatment_time_h": [24, 24, 6],
-         "donor": ["D0001", "D0002", None],
-     }
-     # define the dataset-level metadata
-     metadata = {
-         "temperature": 21.6,
-         "experiment": "Experiment 1",
-         "date_of_study": "2024-12-01",
-         "study_note": "We had a great time performing this study and the results look compelling.",
-     }
-     # the dataset as DataFrame
-     dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-     if otype == "DataFrame":
-         for key, value in metadata.items():
-             dataset_df.attrs[key] = value
-         return dataset_df
-     else:
-         dataset_ad = ad.AnnData(
-             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-         )
-         return dataset_ad
-
-
- def small_dataset2(
-     otype: Literal["DataFrame", "AnnData"],
-     gene_symbols_in_index: bool = False,
- ) -> pd.DataFrame | ad.AnnData:
-     if gene_symbols_in_index:
-         var_ids = ["CD8A", "CD4", "CD38"]
-     else:
-         var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
-     dataset_dict = {
-         var_ids[0]: [2, 3, 3],
-         var_ids[1]: [3, 4, 5],
-         var_ids[2]: [4, 2, 3],
-         "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
-         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-     }
-     metadata = {
-         "temperature": 22.6,
-         "experiment": "Experiment 2",
-         "date_of_study": "2025-02-13",
-     }
-     dataset_df = pd.DataFrame(
-         dataset_dict,
-         index=["sample4", "sample5", "sample6"],
-     )
-     ad.AnnData(
-         dataset_df[var_ids],
-         obs=dataset_df[["perturbation", "cell_type_by_model"]],
-     )
-     if otype == "DataFrame":
-         for key, value in metadata.items():
-             dataset_df.attrs[key] = value
-         return dataset_df
-     else:
-         dataset_ad = ad.AnnData(
-             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-         )
-         return dataset_ad
-
-
  def small_dataset3_cellxgene(
      otype: Literal["DataFrame", "AnnData"] = "AnnData",
  ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
lamindb/core/datasets/mini_immuno.py ADDED
@@ -0,0 +1,172 @@
+ """The mini immuno dataset.
+
+ .. autosummary::
+    :toctree: .
+
+    define_features_labels
+    get_dataset1
+    get_dataset2
+
+ """
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Literal
+
+ import anndata as ad
+ import pandas as pd
+
+ if TYPE_CHECKING:
+     from lamindb.models import Schema
+
+
+ def define_features_labels() -> None:
+     """Features & labels to validate the mini immuno datasets.
+
+     .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+        :language: python
+     """
+     import sys
+     from pathlib import Path
+
+     docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+     if str(docs_path) not in sys.path:
+         sys.path.append(str(docs_path))
+
+     import define_mini_immuno_features_labels  # noqa
+
+
+ def define_mini_immuno_schema_flexible() -> Schema:
+     """Features & labels to validate the mini immuno datasets.
+
+     .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
+        :language: python
+     """
+     import sys
+     from pathlib import Path
+
+     from lamindb.models import Schema
+
+     docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+     if str(docs_path) not in sys.path:
+         sys.path.append(str(docs_path))
+
+     define_features_labels()
+     import define_mini_immuno_schema_flexible  # noqa
+
+     return Schema.get(name="Mini immuno schema")
+
+
+ def get_dataset1(
+     otype: Literal["DataFrame", "AnnData"] = "DataFrame",
+     gene_symbols_in_index: bool = False,
+     with_typo: bool = False,
+     with_cell_type_synonym: bool = False,
+     with_cell_type_typo: bool = False,
+     with_gene_typo: bool = False,
+     with_outdated_gene: bool = False,
+     with_wrong_subtype: bool = False,
+     with_index_type_mismatch: bool = False,
+ ) -> pd.DataFrame | ad.AnnData:
+     """A small tabular dataset measuring expression & metadata."""
+     # define the data in the dataset
+     # it's a mix of numerical measurements and observation-level metadata
+     ifng = "IFNJ" if with_typo else "IFNG"
+     thing = "ulabel_but_not_perturbation" if with_wrong_subtype else "DMSO"
+     if gene_symbols_in_index:
+         var_ids = ["CD8A", "CD4", "CD14" if not with_gene_typo else "GeneTypo"]
+     else:
+         var_ids = [
+             "ENSG00000153563",
+             "ENSG00000010610",
+             "ENSG00000170458"
+             if not with_gene_typo
+             else "GeneTypo"
+             if not with_outdated_gene
+             else "ENSG00000278198",
+         ]
+     abt_cell = (
+         "CD8-pos alpha-beta T cell"
+         if with_cell_type_typo
+         else "CD8-positive, alpha-beta T cell"
+     )
+     dataset_dict = {
+         var_ids[0]: [1, 2, 3],
+         var_ids[1]: [3, 4, 5],
+         var_ids[2]: [5, 6, 7],
+         "perturbation": pd.Categorical(["DMSO", ifng, thing]),
+         "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
+         "cell_type_by_expert": pd.Categorical(
+             ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
+         ),
+         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+         "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
+         "concentration": ["0.1%", "200 nM", "0.1%"],
+         "treatment_time_h": [24, 24, 6],
+         "donor": ["D0001", "D0002", None],
+     }
+     # define the dataset-level metadata
+     metadata = {
+         "temperature": 21.6,
+         "experiment": "Experiment 1",
+         "date_of_study": "2024-12-01",
+         "study_note": "We had a great time performing this study and the results look compelling.",
+     }
+     # the dataset as DataFrame
+     dataset_df = pd.DataFrame(
+         dataset_dict,
+         index=["sample1", "sample2", 0]  # type: ignore
+         if with_index_type_mismatch
+         else ["sample1", "sample2", "sample3"],
+     )
+     if otype == "DataFrame":
+         for key, value in metadata.items():
+             dataset_df.attrs[key] = value
+         return dataset_df
+     else:
+         dataset_ad = ad.AnnData(
+             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+         )
+         return dataset_ad
+
+
+ def get_dataset2(
+     otype: Literal["DataFrame", "AnnData"],
+     gene_symbols_in_index: bool = False,
+ ) -> pd.DataFrame | ad.AnnData:
+     if gene_symbols_in_index:
+         var_ids = ["CD8A", "CD4", "CD38"]
+     else:
+         var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
+     dataset_dict = {
+         var_ids[0]: [2, 3, 3],
+         var_ids[1]: [3, 4, 5],
+         var_ids[2]: [4, 2, 3],
+         "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+         "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+         "concentration": ["0.1%", "200 nM", "0.1%"],
+         "treatment_time_h": [24, 24, 6],
+         "donor": ["D0003", "D0003", "D0004"],
+     }
+     metadata = {
+         "temperature": 22.6,
+         "experiment": "Experiment 2",
+         "date_of_study": "2025-02-13",
+     }
+     dataset_df = pd.DataFrame(
+         dataset_dict,
+         index=["sample4", "sample5", "sample6"],
+     )
+     ad.AnnData(
+         dataset_df[var_ids],
+         obs=dataset_df[["perturbation", "cell_type_by_model"]],
+     )
+     if otype == "DataFrame":
+         for key, value in metadata.items():
+             dataset_df.attrs[key] = value
+         return dataset_df
+     else:
+         dataset_ad = ad.AnnData(
+             dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+         )
+         return dataset_ad
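
The new module supersedes `small_dataset1`/`small_dataset2` from `_small.py` (removed above). A minimal usage sketch; the import path follows the new file location, and the flags are the ones defined in the signatures above:

    from lamindb.core.datasets import mini_immuno

    df = mini_immuno.get_dataset1(otype="DataFrame")   # metadata lands in df.attrs
    adata = mini_immuno.get_dataset1(otype="AnnData")  # expression in X, metadata in obs/uns
    df2 = mini_immuno.get_dataset2(otype="DataFrame", gene_symbols_in_index=True)
    df_typo = mini_immuno.get_dataset1(with_gene_typo=True)  # injects "GeneTypo" for curation demos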
lamindb/core/loaders.py CHANGED
@@ -44,7 +44,7 @@ try:
  except ImportError:
  
      def load_zarr(storepath):  # type: ignore
-         raise ImportError("Please install zarr: pip install zarr<=2.18.4")
+         raise ImportError("Please install zarr: pip install 'zarr<=2.18.4'")
  
  
  is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
lamindb/core/storage/_backed_access.py CHANGED
@@ -1,20 +1,26 @@
  from __future__ import annotations
  
  from dataclasses import dataclass
- from typing import TYPE_CHECKING, Any, Callable
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Callable, Literal
  
  from anndata._io.specs.registry import get_spec
  
  from ._anndata_accessor import AnnDataAccessor, StorageType, registry
- from ._pyarrow_dataset import _is_pyarrow_dataset, _open_pyarrow_dataset
+ from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df
+ from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset
  from ._tiledbsoma import _open_tiledbsoma
  from .paths import filepath_from_artifact
  
  if TYPE_CHECKING:
+     from collections.abc import Iterator
+
      from fsspec.core import OpenFile
+     from polars import LazyFrame as PolarsLazyFrame
      from pyarrow.dataset import Dataset as PyArrowDataset
      from tiledbsoma import Collection as SOMACollection
      from tiledbsoma import Experiment as SOMAExperiment
+     from tiledbsoma import Measurement as SOMAMeasurement
      from upath import UPath
  
      from lamindb.models.artifact import Artifact
@@ -69,10 +75,17 @@ class BackedAccessor:
  def backed_access(
      artifact_or_filepath: Artifact | UPath,
      mode: str = "r",
+     engine: Literal["pyarrow", "polars"] = "pyarrow",
      using_key: str | None = None,
      **kwargs,
  ) -> (
-     AnnDataAccessor | BackedAccessor | SOMACollection | SOMAExperiment | PyArrowDataset
+     AnnDataAccessor
+     | BackedAccessor
+     | SOMACollection
+     | SOMAExperiment
+     | SOMAMeasurement
+     | PyArrowDataset
+     | Iterator[PolarsLazyFrame]
  ):
      from lamindb.models import Artifact
  
@@ -97,12 +110,15 @@
          conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
      elif suffix == ".zarr":
          conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
-     elif _is_pyarrow_dataset(objectpath):
-         return _open_pyarrow_dataset(objectpath, **kwargs)
+     elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
+         df_suffix := df_suffixes.pop()
+     ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
+         return _open_dataframe(objectpath, df_suffix, engine, **kwargs)
      else:
          raise ValueError(
              "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
-             f"or be compatible with pyarrow.dataset.dataset, instead of being {suffix} object."
+             "or be compatible with pyarrow.dataset.dataset or polars.scan_* functions, "
+             f"instead of being {suffix} object."
          )
  
      is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
@@ -112,3 +128,81 @@
          return AnnDataAccessor(conn, storage, name)
      else:
          return BackedAccessor(conn, storage)
+
+
+ def _flat_suffixes(paths: UPath | list[UPath]) -> set[str]:
+     # it is assumed here that the paths exist
+     # we don't check here that the filesystem is the same
+     # but this is a requirement for pyarrow.dataset.dataset
+     path_list = []
+     if isinstance(paths, Path):
+         paths = [paths]
+     for path in paths:
+         # assume http is always a file
+         if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+             path_list += [p for p in path.rglob("*") if p.suffix != ""]
+         else:
+             path_list.append(path)
+
+     suffixes = set()
+     for path in path_list:
+         path_suffixes = path.suffixes
+         # this doesn't work for externally gzipped files, REMOVE LATER
+         path_suffix = (
+             path_suffixes[-2]
+             if len(path_suffixes) > 1 and ".gz" in path_suffixes
+             else path.suffix
+         )
+         suffixes.add(path_suffix)
+     return suffixes
+
+
+ def _open_dataframe(
+     paths: UPath | list[UPath],
+     suffix: str | None = None,
+     engine: Literal["pyarrow", "polars"] = "pyarrow",
+     **kwargs,
+ ) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+     df_suffix: str
+     if suffix is None:
+         df_suffixes = _flat_suffixes(paths)
+         if len(df_suffixes) > 1:
+             raise ValueError(
+                 f"The artifacts in the collection have different file formats: {', '.join(df_suffixes)}.\n"
+                 "It is not possible to open such stores with pyarrow or polars."
+             )
+         df_suffix = df_suffixes.pop()
+     else:
+         df_suffix = suffix
+
+     if engine == "pyarrow":
+         if df_suffix not in PYARROW_SUFFIXES:
+             raise ValueError(
+                 f"{df_suffix} files are not supported by pyarrow, "
+                 f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
+             )
+         # this checks that the filesystem is the same for all paths
+         # this is a requirement of pyarrow.dataset.dataset
+         if not isinstance(paths, Path):  # is a list then
+             fs = getattr(paths[0], "fs", None)
+             for path in paths[1:]:
+                 # this assumes that the filesystems are cached by fsspec
+                 if getattr(path, "fs", None) is not fs:
+                     raise ValueError(
+                         "The collection has artifacts with different filesystems, "
+                         "this is not supported by pyarrow."
+                     )
+         dataframe = _open_pyarrow_dataset(paths, **kwargs)
+     elif engine == "polars":
+         if df_suffix not in POLARS_SUFFIXES:
+             raise ValueError(
+                 f"{df_suffix} files are not supported by polars, "
+                 f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
+             )
+         dataframe = _open_polars_lazy_df(paths, **kwargs)
+     else:
+         raise ValueError(
+             f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+         )
+
+     return dataframe
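
Together with `_flat_suffixes`, this routes tabular artifacts either to `pyarrow.dataset.dataset` or to the polars scan functions. A hedged sketch of how the new `engine` parameter surfaces through `Artifact.open` (that `open` forwards `engine=` to `backed_access` is an assumption this diff does not show):

    import lamindb as ln

    artifact = ln.Artifact.get(key="tables/measurements.parquet")  # hypothetical key

    dataset = artifact.open()                   # default engine="pyarrow" -> pyarrow Dataset
    with artifact.open(engine="polars") as lf:  # polars engine yields a context-managed LazyFrame
        df = lf.collect()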
lamindb/core/storage/_polars_lazy_df.py ADDED
@@ -0,0 +1,51 @@
+ from __future__ import annotations
+
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterator
+
+     from polars import LazyFrame as PolarsLazyFrame
+     from upath import UPath
+
+ POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc")
+
+
+ @contextmanager
+ def _open_polars_lazy_df(
+     paths: UPath | list[UPath], **kwargs
+ ) -> Iterator[PolarsLazyFrame]:
+     try:
+         import polars as pl
+     except ImportError as ie:
+         raise ImportError("Please install polars: pip install polars") from ie
+
+     scans = {
+         ".parquet": pl.scan_parquet,
+         ".csv": pl.scan_csv,
+         ".ndjson": pl.scan_ndjson,
+         ".ipc": pl.scan_ipc,
+     }
+
+     path_list = []
+     if isinstance(paths, Path):
+         paths = [paths]
+     for path in paths:
+         # assume http is always a file
+         if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+             path_list += [p for p in path.rglob("*") if p.suffix != ""]
+         else:
+             path_list.append(path)
+
+     open_files = []
+
+     try:
+         for path in path_list:
+             open_files.append(path.open(mode="rb"))
+
+         yield scans[path_list[0].suffix](open_files, **kwargs)
+     finally:
+         for open_file in open_files:
+             open_file.close()
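
`_open_polars_lazy_df` is a context manager because polars scans lazily: the file handles it opens must stay open until the `LazyFrame` is collected. A standalone illustration with plain polars (no lamindb; `data.parquet` is a placeholder, and passing a file-like source assumes a polars version that supports it, as the helper above does):

    import polars as pl

    with open("data.parquet", "rb") as f:
        lf = pl.scan_parquet(f)  # lazy: nothing is read yet
        df = lf.collect()        # must run while the handle is still open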
lamindb/core/storage/_pyarrow_dataset.py CHANGED
@@ -13,41 +13,26 @@ if TYPE_CHECKING:
  PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc")
  
  
- def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
-     # it is assumed here that the paths exist
-     # we don't check here that the filesystem is the same
-     # but this is a requirement for pyarrow.dataset.dataset
-     if isinstance(paths, list):
-         path_list = paths
-     elif paths.is_dir():
-         path_list = [path for path in paths.rglob("*") if path.suffix != ""]
-     else:
-         path_list = [paths]
-     suffix = None
-     for path in path_list:
-         path_suffixes = path.suffixes
-         # this doesn't work for externally gzipped files, REMOVE LATER
-         path_suffix = (
-             path_suffixes[-2]
-             if len(path_suffixes) > 1 and ".gz" in path_suffixes
-             else path.suffix
-         )
-         if path_suffix not in PYARROW_SUFFIXES:
-             return False
-         elif suffix is None:
-             suffix = path_suffix
-         elif path_suffix != suffix:
-             return False
-     return True
-
-
  def _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset:
      if isinstance(paths, list):
+         # a single path can be a directory, but a list of paths
+         # has to be a flat list of files
+         paths_str = []
          path0 = paths[0]
          if isinstance(path0, LocalPathClasses):
-             paths_str, filesystem = [path.as_posix() for path in paths], None
+             path_to_str = lambda p: p.as_posix()
+             filesystem = None
          else:
-             paths_str, filesystem = [path.path for path in paths], path0.fs
+             path_to_str = lambda p: p.path
+             filesystem = path0.fs
+         for path in paths:
+             if (
+                 getattr(path, "protocol", None) not in {"http", "https"}
+                 and path.is_dir()
+             ):
+                 paths_str += [path_to_str(p) for p in path.rglob("*") if p.suffix != ""]
+             else:
+                 paths_str.append(path_to_str(path))
      elif isinstance(paths, LocalPathClasses):
          paths_str, filesystem = paths.as_posix(), None
      else:
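
The rewrite encodes a pyarrow constraint spelled out in the new comment: a single path may be a directory, but a list must contain only files, so directories inside a list are expanded via `rglob`. A short sketch of the underlying call (paths are placeholders):

    import pyarrow.dataset as ds

    ds.dataset("exports/", format="parquet")                  # a directory is fine
    ds.dataset(["a.parquet", "b.parquet"], format="parquet")  # a list must be flat files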
lamindb/core/storage/_tiledbsoma.py CHANGED
@@ -110,7 +110,7 @@ def save_tiledbsoma_experiment(
  ) -> Artifact:
      """Write `AnnData` to `tiledbsoma.Experiment`.
  
-     Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an {class}`~lamindb.Artifact`.
+     Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an :class:`~lamindb.Artifact`.
  
      Populates a `lamin_run_uid` column in `obs` with the current `run.uid`.
  
@@ -202,28 +202,44 @@ def save_tiledbsoma_experiment(
          context=ctx,
      )
  
+     prepare_experiment = False
      resize_experiment = False
      if registration_mapping is not None:
-         if version.parse(soma.__version__) < version.parse("1.15.0rc4"):
+         soma_version_parsed = version.parse(soma.__version__)
+         if soma_version_parsed < version.parse("1.15.0rc4"):
              n_observations = len(registration_mapping.obs_axis.data)
          else:
              n_observations = registration_mapping.get_obs_shape()
-             resize_experiment = True
+             prepare_experiment = soma_version_parsed >= version.parse("1.16.2")
+             resize_experiment = not prepare_experiment
      else:  # happens only if not appending and only one adata passed
          assert len(adata_objects) == 1  # noqa: S101
          n_observations = adata_objects[0].n_obs
  
      logger.important(f"Writing the tiledbsoma store to {storepath_str}")
+     experiment_exists: bool | None = None
      for adata_obj in adata_objects:
-         if resize_experiment and soma.Experiment.exists(storepath_str, context=ctx):
-             # can only happen if registration_mapping is not None
-             soma_io.resize_experiment(
-                 storepath_str,
-                 nobs=n_observations,
-                 nvars=registration_mapping.get_var_shapes(),
-                 context=ctx,
-             )
-             resize_experiment = False
+         # do not recheck if True
+         if not experiment_exists and (resize_experiment or prepare_experiment):
+             experiment_exists = soma.Experiment.exists(storepath_str, context=ctx)
+         if experiment_exists:
+             # both can only happen if registration_mapping is not None
+             if resize_experiment:
+                 soma_io.resize_experiment(
+                     storepath_str,
+                     nobs=n_observations,
+                     nvars=registration_mapping.get_var_shapes(),
+                     context=ctx,
+                 )
+                 resize_experiment = False
+             elif prepare_experiment:
+                 registration_mapping.prepare_experiment(storepath_str, context=ctx)
+                 prepare_experiment = False
+         registration_mapping_write = (
+             registration_mapping.subset_for_anndata(adata_obj)
+             if hasattr(registration_mapping, "subset_for_anndata")
+             else registration_mapping
+         )
          soma_io.from_anndata(
              storepath_str,
              adata_obj,
@@ -231,7 +247,7 @@ def save_tiledbsoma_experiment(
              context=ctx,
              obs_id_name=obs_id_name,
              var_id_name=var_id_name,
-             registration_mapping=registration_mapping,
+             registration_mapping=registration_mapping_write,
              **kwargs,
          )
  
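
A hedged usage sketch of `save_tiledbsoma_experiment` as reworked above: with `tiledbsoma >= 1.16.2` the store is prepared via `registration_mapping.prepare_experiment` instead of being resized, and the mapping is subset per `AnnData` when the installed version supports it. Import path, variable names, and `key` are assumptions:

    from lamindb.core.storage import save_tiledbsoma_experiment  # assumed import path

    artifact = save_tiledbsoma_experiment(
        [adata1, adata2],                   # AnnData objects to append (placeholders)
        key="scrna/experiment.tiledbsoma",  # hypothetical key
        measurement_name="RNA",
    )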
lamindb/core/storage/objects.py CHANGED
@@ -21,6 +21,7 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
      """Infer LaminDB storage file suffix from a data object."""
      if isinstance(dmem, AnnData):
          if format is not None:
+             # should be `.h5ad`, `.zarr`, or `.anndata.zarr`
              if format not in {"h5ad", "zarr", "anndata.zarr"}:
                  raise ValueError(
                      "Error when specifying AnnData storage format, it should be"
@@ -31,6 +32,8 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
          return ".h5ad"
  
      if isinstance(dmem, DataFrame):
+         if format == ".csv":
+             return ".csv"
          return ".parquet"
  
      if with_package_obj(
@@ -79,6 +82,9 @@ def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
          raise NotImplementedError
  
      if isinstance(dmem, DataFrame):
+         if filepath.suffix == ".csv":
+             dmem.to_csv(filepath)
+             return
          dmem.to_parquet(filepath)
          return
  
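
With these two branches, DataFrames can be stored as `.csv` in addition to the default `.parquet`. A hedged sketch (that `Artifact.from_df` forwards a `format=` argument into `infer_suffix` is an assumption this diff does not show):

    import lamindb as ln
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    ln.Artifact.from_df(df, key="tables/example", format=".csv").save()  # written via to_csv
    ln.Artifact.from_df(df, key="tables/example2").save()                # default: parquet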
lamindb/core/subsettings/__init__.py CHANGED
@@ -4,7 +4,9 @@
     :toctree: .
  
     CreationSettings
+    AnnotationSettings
  
  """
  
+ from ._annotation_settings import AnnotationSettings
  from ._creation_settings import CreationSettings
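
A minimal sketch of where the new sub-settings surface; the `ln.settings.annotation` attribute name is an assumption patterned on the existing `ln.settings.creation`:

    import lamindb as ln

    ln.settings.creation    # CreationSettings, as before
    ln.settings.annotation  # AnnotationSettings, new in this release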