lamindb 1.4.0__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- lamindb/__init__.py +52 -36
- lamindb/_finish.py +17 -10
- lamindb/_tracked.py +1 -1
- lamindb/base/__init__.py +3 -1
- lamindb/base/fields.py +40 -22
- lamindb/base/ids.py +1 -94
- lamindb/base/types.py +2 -0
- lamindb/base/uids.py +117 -0
- lamindb/core/_context.py +203 -102
- lamindb/core/_settings.py +38 -25
- lamindb/core/datasets/__init__.py +11 -4
- lamindb/core/datasets/_core.py +5 -5
- lamindb/core/datasets/_small.py +0 -93
- lamindb/core/datasets/mini_immuno.py +172 -0
- lamindb/core/loaders.py +1 -1
- lamindb/core/storage/_backed_access.py +100 -6
- lamindb/core/storage/_polars_lazy_df.py +51 -0
- lamindb/core/storage/_pyarrow_dataset.py +15 -30
- lamindb/core/storage/_tiledbsoma.py +29 -13
- lamindb/core/storage/objects.py +6 -0
- lamindb/core/subsettings/__init__.py +2 -0
- lamindb/core/subsettings/_annotation_settings.py +11 -0
- lamindb/curators/__init__.py +7 -3349
- lamindb/curators/_legacy.py +2056 -0
- lamindb/curators/core.py +1534 -0
- lamindb/errors.py +11 -0
- lamindb/examples/__init__.py +27 -0
- lamindb/examples/schemas/__init__.py +12 -0
- lamindb/examples/schemas/_anndata.py +25 -0
- lamindb/examples/schemas/_simple.py +19 -0
- lamindb/integrations/_vitessce.py +8 -5
- lamindb/migrations/0091_alter_featurevalue_options_alter_space_options_and_more.py +24 -0
- lamindb/migrations/0092_alter_artifactfeaturevalue_artifact_and_more.py +75 -0
- lamindb/migrations/0093_alter_schemacomponent_unique_together.py +16 -0
- lamindb/models/__init__.py +4 -1
- lamindb/models/_describe.py +21 -4
- lamindb/models/_feature_manager.py +382 -287
- lamindb/models/_label_manager.py +8 -2
- lamindb/models/artifact.py +177 -106
- lamindb/models/artifact_set.py +122 -0
- lamindb/models/collection.py +73 -52
- lamindb/models/core.py +1 -1
- lamindb/models/feature.py +51 -17
- lamindb/models/has_parents.py +69 -14
- lamindb/models/project.py +1 -1
- lamindb/models/query_manager.py +221 -22
- lamindb/models/query_set.py +247 -172
- lamindb/models/record.py +65 -247
- lamindb/models/run.py +4 -4
- lamindb/models/save.py +8 -2
- lamindb/models/schema.py +456 -184
- lamindb/models/transform.py +2 -2
- lamindb/models/ulabel.py +8 -5
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/METADATA +6 -6
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/RECORD +57 -43
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/LICENSE +0 -0
- {lamindb-1.4.0.dist-info → lamindb-1.5.1.dist-info}/WHEEL +0 -0
lamindb/core/datasets/_core.py
CHANGED
@@ -322,8 +322,6 @@ def anndata_human_immune_cells(
 
     import lamindb as ln
 
-    verbosity = ln.settings.verbosity
-    ln.settings.verbosity = "error"
     ln.save(
         bt.Gene.from_values(
             adata.var.index, field="ensembl_gene_id", organism="human"
@@ -339,7 +337,6 @@ def anndata_human_immune_cells(
     ln.Feature(name="donor", dtype=[ln.ULabel]).save()  # type: ignore
     bt.ExperimentalFactor.from_source(ontology_id="EFO:0008913").save()
     ln.save([ln.ULabel(name=name) for name in adata.obs.donor.unique()])
-    ln.settings.verbosity = verbosity
     return adata
 
 
@@ -560,11 +557,14 @@ def spatialdata_blobs() -> SpatialData:
     from spatialdata.datasets import blobs
 
     sdata = blobs()
-    sdata.attrs["
-        "assay": "Visium Spatial Gene Expression",
+    sdata.attrs["bio"] = {
         "disease": "Alzheimer disease",
         "developmental_stage": "adult stage",
     }
+    sdata.attrs["tech"] = {
+        "assay": "Visium Spatial Gene Expression",
+    }
+    sdata.attrs["random_int"] = 20
     sdata.tables["table"].var.index = [
         "ENSG00000139618",  # BRCA2
         "ENSG00000157764",  # BRAF
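
The net effect for `spatialdata_blobs` is that dataset-level metadata now lives under separate `bio` and `tech` keys, plus a loose `random_int`. A minimal sketch of what downstream code sees, assuming `spatialdata_blobs` is still re-exported from `lamindb.core.datasets`:

from lamindb.core.datasets import spatialdata_blobs

sdata = spatialdata_blobs()
# biological and technical metadata are now separate dicts in attrs
assert sdata.attrs["bio"]["disease"] == "Alzheimer disease"
assert sdata.attrs["tech"]["assay"] == "Visium Spatial Gene Expression"
assert sdata.attrs["random_int"] == 20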
lamindb/core/datasets/_small.py
CHANGED
@@ -7,99 +7,6 @@ import numpy as np
 import pandas as pd
 
 
-def small_dataset1(
-    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
-    gene_symbols_in_index: bool = False,
-    with_typo: bool = False,
-    with_cell_type_synonym: bool = False,
-    with_cell_type_typo: bool = False,
-) -> pd.DataFrame | ad.AnnData:
-    # define the data in the dataset
-    # it's a mix of numerical measurements and observation-level metadata
-    ifng = "IFNJ" if with_typo else "IFNG"
-    if gene_symbols_in_index:
-        var_ids = ["CD8A", "CD4", "CD14"]
-    else:
-        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000170458"]
-    abt_cell = (
-        "CD8-pos alpha-beta T cell"
-        if with_cell_type_typo
-        else "CD8-positive, alpha-beta T cell"
-    )
-    dataset_dict = {
-        var_ids[0]: [1, 2, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [5, 6, 7],
-        "perturbation": pd.Categorical(["DMSO", ifng, "DMSO"]),
-        "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
-        "cell_type_by_expert": pd.Categorical(
-            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
-        ),
-        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
-        "concentration": ["0.1%", "200 nM", "0.1%"],
-        "treatment_time_h": [24, 24, 6],
-        "donor": ["D0001", "D0002", None],
-    }
-    # define the dataset-level metadata
-    metadata = {
-        "temperature": 21.6,
-        "experiment": "Experiment 1",
-        "date_of_study": "2024-12-01",
-        "study_note": "We had a great time performing this study and the results look compelling.",
-    }
-    # the dataset as DataFrame
-    dataset_df = pd.DataFrame(dataset_dict, index=["sample1", "sample2", "sample3"])
-    if otype == "DataFrame":
-        for key, value in metadata.items():
-            dataset_df.attrs[key] = value
-        return dataset_df
-    else:
-        dataset_ad = ad.AnnData(
-            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-        )
-        return dataset_ad
-
-
-def small_dataset2(
-    otype: Literal["DataFrame", "AnnData"],
-    gene_symbols_in_index: bool = False,
-) -> pd.DataFrame | ad.AnnData:
-    if gene_symbols_in_index:
-        var_ids = ["CD8A", "CD4", "CD38"]
-    else:
-        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
-    dataset_dict = {
-        var_ids[0]: [2, 3, 3],
-        var_ids[1]: [3, 4, 5],
-        var_ids[2]: [4, 2, 3],
-        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
-        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
-    }
-    metadata = {
-        "temperature": 22.6,
-        "experiment": "Experiment 2",
-        "date_of_study": "2025-02-13",
-    }
-    dataset_df = pd.DataFrame(
-        dataset_dict,
-        index=["sample4", "sample5", "sample6"],
-    )
-    ad.AnnData(
-        dataset_df[var_ids],
-        obs=dataset_df[["perturbation", "cell_type_by_model"]],
-    )
-    if otype == "DataFrame":
-        for key, value in metadata.items():
-            dataset_df.attrs[key] = value
-        return dataset_df
-    else:
-        dataset_ad = ad.AnnData(
-            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
-        )
-        return dataset_ad
-
-
 def small_dataset3_cellxgene(
     otype: Literal["DataFrame", "AnnData"] = "AnnData",
 ) -> tuple[pd.DataFrame, dict[str, Any]] | ad.AnnData:
lamindb/core/datasets/mini_immuno.py
ADDED
@@ -0,0 +1,172 @@
+"""The mini immuno dataset.
+
+.. autosummary::
+   :toctree: .
+
+   define_features_labels
+   get_dataset1
+   get_dataset2
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import anndata as ad
+import pandas as pd
+
+if TYPE_CHECKING:
+    from lamindb.models import Schema
+
+
+def define_features_labels() -> None:
+    """Features & labels to validate the mini immuno datasets.
+
+    .. literalinclude:: scripts/define_mini_immuno_features_labels.py
+        :language: python
+    """
+    import sys
+    from pathlib import Path
+
+    docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+    if str(docs_path) not in sys.path:
+        sys.path.append(str(docs_path))
+
+    import define_mini_immuno_features_labels  # noqa
+
+
+def define_mini_immuno_schema_flexible() -> Schema:
+    """Features & labels to validate the mini immuno datasets.
+
+    .. literalinclude:: scripts/define_mini_immuno_schema_flexible.py
+        :language: python
+    """
+    import sys
+    from pathlib import Path
+
+    from lamindb.models import Schema
+
+    docs_path = Path(__file__).parent.parent.parent.parent / "docs" / "scripts"
+    if str(docs_path) not in sys.path:
+        sys.path.append(str(docs_path))
+
+    define_features_labels()
+    import define_mini_immuno_schema_flexible  # noqa
+
+    return Schema.get(name="Mini immuno schema")
+
+
+def get_dataset1(
+    otype: Literal["DataFrame", "AnnData"] = "DataFrame",
+    gene_symbols_in_index: bool = False,
+    with_typo: bool = False,
+    with_cell_type_synonym: bool = False,
+    with_cell_type_typo: bool = False,
+    with_gene_typo: bool = False,
+    with_outdated_gene: bool = False,
+    with_wrong_subtype: bool = False,
+    with_index_type_mismatch: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    """A small tabular dataset measuring expression & metadata."""
+    # define the data in the dataset
+    # it's a mix of numerical measurements and observation-level metadata
+    ifng = "IFNJ" if with_typo else "IFNG"
+    thing = "ulabel_but_not_perturbation" if with_wrong_subtype else "DMSO"
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD14" if not with_gene_typo else "GeneTypo"]
+    else:
+        var_ids = [
+            "ENSG00000153563",
+            "ENSG00000010610",
+            "ENSG00000170458"
+            if not with_gene_typo
+            else "GeneTypo"
+            if not with_outdated_gene
+            else "ENSG00000278198",
+        ]
+    abt_cell = (
+        "CD8-pos alpha-beta T cell"
+        if with_cell_type_typo
+        else "CD8-positive, alpha-beta T cell"
+    )
+    dataset_dict = {
+        var_ids[0]: [1, 2, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [5, 6, 7],
+        "perturbation": pd.Categorical(["DMSO", ifng, thing]),
+        "sample_note": ["was ok", "looks naah", "pretty! 🤩"],
+        "cell_type_by_expert": pd.Categorical(
+            ["B-cell" if with_cell_type_synonym else "B cell", abt_cell, abt_cell]
+        ),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "assay_oid": pd.Categorical(["EFO:0008913", "EFO:0008913", "EFO:0008913"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0001", "D0002", None],
+    }
+    # define the dataset-level metadata
+    metadata = {
+        "temperature": 21.6,
+        "experiment": "Experiment 1",
+        "date_of_study": "2024-12-01",
+        "study_note": "We had a great time performing this study and the results look compelling.",
+    }
+    # the dataset as DataFrame
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample1", "sample2", 0]  # type: ignore
+        if with_index_type_mismatch
+        else ["sample1", "sample2", "sample3"],
+    )
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(
+            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+        )
+        return dataset_ad
+
+
+def get_dataset2(
+    otype: Literal["DataFrame", "AnnData"],
+    gene_symbols_in_index: bool = False,
+) -> pd.DataFrame | ad.AnnData:
+    if gene_symbols_in_index:
+        var_ids = ["CD8A", "CD4", "CD38"]
+    else:
+        var_ids = ["ENSG00000153563", "ENSG00000010610", "ENSG00000004468"]
+    dataset_dict = {
+        var_ids[0]: [2, 3, 3],
+        var_ids[1]: [3, 4, 5],
+        var_ids[2]: [4, 2, 3],
+        "perturbation": pd.Categorical(["DMSO", "IFNG", "IFNG"]),
+        "cell_type_by_model": pd.Categorical(["B cell", "T cell", "T cell"]),
+        "concentration": ["0.1%", "200 nM", "0.1%"],
+        "treatment_time_h": [24, 24, 6],
+        "donor": ["D0003", "D0003", "D0004"],
+    }
+    metadata = {
+        "temperature": 22.6,
+        "experiment": "Experiment 2",
+        "date_of_study": "2025-02-13",
+    }
+    dataset_df = pd.DataFrame(
+        dataset_dict,
+        index=["sample4", "sample5", "sample6"],
+    )
+    ad.AnnData(
+        dataset_df[var_ids],
+        obs=dataset_df[["perturbation", "cell_type_by_model"]],
+    )
+    if otype == "DataFrame":
+        for key, value in metadata.items():
+            dataset_df.attrs[key] = value
+        return dataset_df
+    else:
+        dataset_ad = ad.AnnData(
+            dataset_df.iloc[:, :3], obs=dataset_df.iloc[:, 3:], uns=metadata
+        )
+        return dataset_ad
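
The removed `small_dataset1`/`small_dataset2` live on here as `get_dataset1`/`get_dataset2`, now with extra failure-mode flags (`with_gene_typo`, `with_outdated_gene`, `with_wrong_subtype`, `with_index_type_mismatch`) for exercising curation errors. A hedged usage sketch, assuming the module is importable at the path listed in the summary above:

from lamindb.core.datasets import mini_immuno

df = mini_immuno.get_dataset1(otype="DataFrame")  # expression values + obs-level metadata
print(df.attrs["experiment"])  # dataset-level metadata lives in DataFrame.attrs

# an AnnData variant seeded with a gene typo, e.g. for testing curation
adata = mini_immuno.get_dataset1(otype="AnnData", with_gene_typo=True)
df2 = mini_immuno.get_dataset2(otype="DataFrame")  # a second, compatible batch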
lamindb/core/loaders.py
CHANGED
@@ -44,7 +44,7 @@ try:
 except ImportError:
 
     def load_zarr(storepath):  # type: ignore
-        raise ImportError("Please install zarr: pip install zarr<=2.18.4")
+        raise ImportError("Please install zarr: pip install 'zarr<=2.18.4'")
 
 
 is_run_from_ipython = getattr(builtins, "__IPYTHON__", False)
lamindb/core/storage/_backed_access.py
CHANGED
@@ -1,20 +1,26 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Literal
 
 from anndata._io.specs.registry import get_spec
 
 from ._anndata_accessor import AnnDataAccessor, StorageType, registry
-from .
+from ._polars_lazy_df import POLARS_SUFFIXES, _open_polars_lazy_df
+from ._pyarrow_dataset import PYARROW_SUFFIXES, _open_pyarrow_dataset
 from ._tiledbsoma import _open_tiledbsoma
 from .paths import filepath_from_artifact
 
 if TYPE_CHECKING:
+    from collections.abc import Iterator
+
     from fsspec.core import OpenFile
+    from polars import LazyFrame as PolarsLazyFrame
     from pyarrow.dataset import Dataset as PyArrowDataset
     from tiledbsoma import Collection as SOMACollection
     from tiledbsoma import Experiment as SOMAExperiment
+    from tiledbsoma import Measurement as SOMAMeasurement
     from upath import UPath
 
     from lamindb.models.artifact import Artifact
@@ -69,10 +75,17 @@ class BackedAccessor:
 def backed_access(
     artifact_or_filepath: Artifact | UPath,
     mode: str = "r",
+    engine: Literal["pyarrow", "polars"] = "pyarrow",
     using_key: str | None = None,
     **kwargs,
 ) -> (
-    AnnDataAccessor
+    AnnDataAccessor
+    | BackedAccessor
+    | SOMACollection
+    | SOMAExperiment
+    | SOMAMeasurement
+    | PyArrowDataset
+    | Iterator[PolarsLazyFrame]
 ):
     from lamindb.models import Artifact
 
@@ -97,12 +110,15 @@ def backed_access(
         conn, storage = registry.open("h5py", objectpath, mode=mode, **kwargs)
     elif suffix == ".zarr":
         conn, storage = registry.open("zarr", objectpath, mode=mode, **kwargs)
-    elif
-
+    elif len(df_suffixes := _flat_suffixes(objectpath)) == 1 and (
+        df_suffix := df_suffixes.pop()
+    ) in set(PYARROW_SUFFIXES).union(POLARS_SUFFIXES):
+        return _open_dataframe(objectpath, df_suffix, engine, **kwargs)
     else:
         raise ValueError(
             "The object should have .h5, .hdf5, .h5ad, .zarr, .tiledbsoma suffix "
-            f"
+            f"be compatible with pyarrow.dataset.dataset or polars.scan_* functions, "
+            f"instead of being {suffix} object."
         )
 
     is_anndata = suffix == ".h5ad" or get_spec(storage).encoding_type == "anndata"
@@ -112,3 +128,81 @@ def backed_access(
         return AnnDataAccessor(conn, storage, name)
     else:
         return BackedAccessor(conn, storage)
+
+
+def _flat_suffixes(paths: UPath | list[UPath]) -> set[str]:
+    # it is assumed here that the paths exist
+    # we don't check here that the filesystem is the same
+    # but this is a requirement for pyarrow.dataset.dataset
+    path_list = []
+    if isinstance(paths, Path):
+        paths = [paths]
+    for path in paths:
+        # assume http is always a file
+        if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+            path_list += [p for p in path.rglob("*") if p.suffix != ""]
+        else:
+            path_list.append(path)
+
+    suffixes = set()
+    for path in path_list:
+        path_suffixes = path.suffixes
+        # this doesn't work for externally gzipped files, REMOVE LATER
+        path_suffix = (
+            path_suffixes[-2]
+            if len(path_suffixes) > 1 and ".gz" in path_suffixes
+            else path.suffix
+        )
+        suffixes.add(path_suffix)
+    return suffixes
+
+
+def _open_dataframe(
+    paths: UPath | list[UPath],
+    suffix: str | None = None,
+    engine: Literal["pyarrow", "polars"] = "pyarrow",
+    **kwargs,
+) -> PyArrowDataset | Iterator[PolarsLazyFrame]:
+    df_suffix: str
+    if suffix is None:
+        df_suffixes = _flat_suffixes(paths)
+        if len(df_suffixes) > 1:
+            raise ValueError(
+                f"The artifacts in the collection have different file formats: {', '.join(df_suffixes)}.\n"
+                "It is not possible to open such stores with pyarrow or polars."
+            )
+        df_suffix = df_suffixes.pop()
+    else:
+        df_suffix = suffix
+
+    if engine == "pyarrow":
+        if df_suffix not in PYARROW_SUFFIXES:
+            raise ValueError(
+                f"{df_suffix} files are not supported by pyarrow, "
+                f"they should have one of these formats: {', '.join(PYARROW_SUFFIXES)}."
+            )
+        # this checks that the filesystem is the same for all paths
+        # this is a requirement of pyarrow.dataset.dataset
+        if not isinstance(paths, Path):  # is a list then
+            fs = getattr(paths[0], "fs", None)
+            for path in paths[1:]:
+                # this assumes that the filesystems are cached by fsspec
+                if getattr(path, "fs", None) is not fs:
+                    raise ValueError(
+                        "The collection has artifacts with different filesystems, "
+                        "this is not supported by pyarrow."
+                    )
+        dataframe = _open_pyarrow_dataset(paths, **kwargs)
+    elif engine == "polars":
+        if df_suffix not in POLARS_SUFFIXES:
+            raise ValueError(
+                f"{df_suffix} files are not supported by polars, "
+                f"they should have one of these formats: {', '.join(POLARS_SUFFIXES)}."
+            )
+        dataframe = _open_polars_lazy_df(paths, **kwargs)
+    else:
+        raise ValueError(
+            f"Unknown engine: {engine}. It should be 'pyarrow' or 'polars'."
+        )
+
+    return dataframe
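
`backed_access` now routes single-suffix tabular stores to `_open_dataframe`, which returns a `pyarrow.dataset.Dataset` directly, while the polars path returns a context manager (file handles are opened for the scan and closed in its `finally` clause, so the `LazyFrame` is only valid inside the `with` block). A sketch of the two call patterns; the local file name and column are illustrative:

from pathlib import Path
from lamindb.core.storage._backed_access import _open_dataframe

path = Path("measurements.parquet")  # hypothetical local file

ds = _open_dataframe(path, engine="pyarrow")  # PyArrowDataset, usable after return
print(ds.head(5))

with _open_dataframe(path, engine="polars") as lf:
    print(lf.select("perturbation").collect())  # collect while the handles are open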
lamindb/core/storage/_polars_lazy_df.py
ADDED
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from polars import LazyFrame as PolarsLazyFrame
+    from upath import UPath
+
+POLARS_SUFFIXES = (".parquet", ".csv", ".ndjson", ".ipc")
+
+
+@contextmanager
+def _open_polars_lazy_df(
+    paths: UPath | list[UPath], **kwargs
+) -> Iterator[PolarsLazyFrame]:
+    try:
+        import polars as pl
+    except ImportError as ie:
+        raise ImportError("Please install polars: pip install polars") from ie
+
+    scans = {
+        ".parquet": pl.scan_parquet,
+        ".csv": pl.scan_csv,
+        ".ndjson": pl.scan_ndjson,
+        ".ipc": pl.scan_ipc,
+    }
+
+    path_list = []
+    if isinstance(paths, Path):
+        paths = [paths]
+    for path in paths:
+        # assume http is always a file
+        if getattr(path, "protocol", None) not in {"http", "https"} and path.is_dir():
+            path_list += [p for p in path.rglob("*") if p.suffix != ""]
+        else:
+            path_list.append(path)
+
+    open_files = []
+
+    try:
+        for path in path_list:
+            open_files.append(path.open(mode="rb"))
+
+        yield scans[path_list[0].suffix](open_files, **kwargs)
+    finally:
+        for open_file in open_files:
+            open_file.close()
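
Because the polars scan functions here receive open file objects rather than paths, `_open_polars_lazy_df` is written as a `@contextmanager`: the handles must outlive the lazy scan and are closed deterministically on exit. A minimal direct-use sketch with hypothetical remote paths:

from upath import UPath
from lamindb.core.storage._polars_lazy_df import _open_polars_lazy_df

# hypothetical S3 locations
paths = [UPath("s3://bucket/part-0.parquet"), UPath("s3://bucket/part-1.parquet")]

with _open_polars_lazy_df(paths) as lf:
    result = lf.head(10).collect()  # materialize before the handles close
print(result)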
lamindb/core/storage/_pyarrow_dataset.py
CHANGED
@@ -13,41 +13,26 @@ if TYPE_CHECKING:
 PYARROW_SUFFIXES = (".parquet", ".csv", ".json", ".orc", ".arrow", ".feather", ".ipc")
 
 
-def _is_pyarrow_dataset(paths: UPath | list[UPath]) -> bool:
-    # it is assumed here that the paths exist
-    # we don't check here that the filesystem is the same
-    # but this is a requirement for pyarrow.dataset.dataset
-    if isinstance(paths, list):
-        path_list = paths
-    elif paths.is_dir():
-        path_list = [path for path in paths.rglob("*") if path.suffix != ""]
-    else:
-        path_list = [paths]
-    suffix = None
-    for path in path_list:
-        path_suffixes = path.suffixes
-        # this doesn't work for externally gzipped files, REMOVE LATER
-        path_suffix = (
-            path_suffixes[-2]
-            if len(path_suffixes) > 1 and ".gz" in path_suffixes
-            else path.suffix
-        )
-        if path_suffix not in PYARROW_SUFFIXES:
-            return False
-        elif suffix is None:
-            suffix = path_suffix
-        elif path_suffix != suffix:
-            return False
-    return True
-
-
 def _open_pyarrow_dataset(paths: UPath | list[UPath], **kwargs) -> PyArrowDataset:
     if isinstance(paths, list):
+        # a single path can be a directory, but a list of paths
+        # has to be a flat list of files
+        paths_str = []
         path0 = paths[0]
         if isinstance(path0, LocalPathClasses):
-
+            path_to_str = lambda p: p.as_posix()
+            filesystem = None
         else:
-
+            path_to_str = lambda p: p.path
+            filesystem = path0.fs
+        for path in paths:
+            if (
+                getattr(path, "protocol", None) not in {"http", "https"}
+                and path.is_dir()
+            ):
+                paths_str += [path_to_str(p) for p in path.rglob("*") if p.suffix != ""]
+            else:
+                paths_str.append(path_to_str(path))
     elif isinstance(paths, LocalPathClasses):
         paths_str, filesystem = paths.as_posix(), None
     else:
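
The suffix validation that `_is_pyarrow_dataset` used to perform now happens upstream in `_flat_suffixes`/`_open_dataframe` in `_backed_access.py`; `_open_pyarrow_dataset` itself instead gains directory expansion, so a list of paths may mix flat files and directories. A sketch under the assumption of local paths:

from upath import UPath
from lamindb.core.storage._pyarrow_dataset import _open_pyarrow_dataset

# hypothetical layout: one flat file plus a directory of part files
paths = [UPath("data/batch1.parquet"), UPath("data/batch2/")]
dataset = _open_pyarrow_dataset(paths)  # directories are expanded via rglob
print(dataset.count_rows())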
lamindb/core/storage/_tiledbsoma.py
CHANGED
@@ -110,7 +110,7 @@ def save_tiledbsoma_experiment(
 ) -> Artifact:
     """Write `AnnData` to `tiledbsoma.Experiment`.
 
-    Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an
+    Reads `AnnData` objects, writes them to `tiledbsoma.Experiment`, creates & saves an :class:`~lamindb.Artifact`.
 
     Populates a column `lamin_run_uid` column in `obs` with the current `run.uid`.
 
@@ -202,28 +202,44 @@ def save_tiledbsoma_experiment(
         context=ctx,
     )
 
+    prepare_experiment = False
     resize_experiment = False
     if registration_mapping is not None:
-
+        soma_version_parsed = version.parse(soma.__version__)
+        if soma_version_parsed < version.parse("1.15.0rc4"):
             n_observations = len(registration_mapping.obs_axis.data)
         else:
             n_observations = registration_mapping.get_obs_shape()
-
+        prepare_experiment = soma_version_parsed >= version.parse("1.16.2")
+        resize_experiment = not prepare_experiment
     else:  # happens only if not appending and only one adata passed
         assert len(adata_objects) == 1  # noqa: S101
        n_observations = adata_objects[0].n_obs
 
     logger.important(f"Writing the tiledbsoma store to {storepath_str}")
+    experiment_exists: bool | None = None
     for adata_obj in adata_objects:
-
-
-
-
-
-
-
-
-
+        # do not recheck if True
+        if not experiment_exists and (resize_experiment or prepare_experiment):
+            experiment_exists = soma.Experiment.exists(storepath_str, context=ctx)
+        if experiment_exists:
+            # both can only happen if registration_mapping is not None
+            if resize_experiment:
+                soma_io.resize_experiment(
+                    storepath_str,
+                    nobs=n_observations,
+                    nvars=registration_mapping.get_var_shapes(),
+                    context=ctx,
+                )
+                resize_experiment = False
+            elif prepare_experiment:
+                registration_mapping.prepare_experiment(storepath_str, context=ctx)
+                prepare_experiment = False
+        registration_mapping_write = (
+            registration_mapping.subset_for_anndata(adata_obj)
+            if hasattr(registration_mapping, "subset_for_anndata")
+            else registration_mapping
+        )
         soma_io.from_anndata(
             storepath_str,
             adata_obj,
@@ -231,7 +247,7 @@ def save_tiledbsoma_experiment(
             context=ctx,
             obs_id_name=obs_id_name,
             var_id_name=var_id_name,
-            registration_mapping=
+            registration_mapping=registration_mapping_write,
             **kwargs,
         )
 
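
The version gates read: tiledbsoma older than 1.15.0rc4 sizes the store from `obs_axis.data`, newer versions use `get_obs_shape()`, and from 1.16.2 on the experiment is prepared via `registration_mapping.prepare_experiment(...)` instead of resized. The caller-facing workflow is unchanged; a hedged sketch in which the import path and the `key` parameter are assumptions:

import lamindb as ln
from lamindb.core.storage import save_tiledbsoma_experiment  # assumed export path

adata = ln.core.datasets.mini_immuno.get_dataset1(otype="AnnData")
artifact = save_tiledbsoma_experiment(
    [adata],
    key="scrna/experiment.tiledbsoma",  # assumed parameter, analogous to Artifact's key
    measurement_name="RNA",
)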
lamindb/core/storage/objects.py
CHANGED
@@ -21,6 +21,7 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
     """Infer LaminDB storage file suffix from a data object."""
     if isinstance(dmem, AnnData):
         if format is not None:
+            # should be `.h5ad`, `.zarr`, or `.anndata.zarr`
             if format not in {"h5ad", "zarr", "anndata.zarr"}:
                 raise ValueError(
                     "Error when specifying AnnData storage format, it should be"
@@ -31,6 +32,8 @@ def infer_suffix(dmem: SupportedDataTypes, format: str | None = None):
         return ".h5ad"
 
     if isinstance(dmem, DataFrame):
+        if format == ".csv":
+            return ".csv"
         return ".parquet"
 
     if with_package_obj(
@@ -79,6 +82,9 @@ def write_to_disk(dmem: SupportedDataTypes, filepath: UPathStr) -> None:
         raise NotImplementedError
 
     if isinstance(dmem, DataFrame):
+        if filepath.suffix == ".csv":
+            dmem.to_csv(filepath)
+            return
         dmem.to_parquet(filepath)
         return
 
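
With both halves in place, a `DataFrame` can round-trip through CSV: `infer_suffix` honors `format=".csv"` (note the leading dot, unlike the dot-less AnnData formats above) and `write_to_disk` branches on the resulting suffix. A small local sketch:

from pathlib import Path
import pandas as pd
from lamindb.core.storage.objects import infer_suffix, write_to_disk

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
suffix = infer_suffix(df, format=".csv")  # ".csv" instead of the ".parquet" default
write_to_disk(df, Path(f"table{suffix}"))  # dispatches to DataFrame.to_csv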