sclab 0.1.7__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sclab/__init__.py +3 -1
- sclab/_io.py +83 -12
- sclab/_methods_registry.py +65 -0
- sclab/_sclab.py +241 -21
- sclab/dataset/_dataset.py +4 -6
- sclab/dataset/processor/_processor.py +41 -19
- sclab/dataset/processor/_results_panel.py +94 -0
- sclab/dataset/processor/step/_processor_step_base.py +12 -6
- sclab/examples/processor_steps/__init__.py +8 -0
- sclab/examples/processor_steps/_cluster.py +2 -2
- sclab/examples/processor_steps/_differential_expression.py +329 -0
- sclab/examples/processor_steps/_doublet_detection.py +68 -0
- sclab/examples/processor_steps/_gene_expression.py +125 -0
- sclab/examples/processor_steps/_integration.py +116 -0
- sclab/examples/processor_steps/_neighbors.py +26 -6
- sclab/examples/processor_steps/_pca.py +13 -8
- sclab/examples/processor_steps/_preprocess.py +52 -25
- sclab/examples/processor_steps/_qc.py +24 -8
- sclab/examples/processor_steps/_umap.py +2 -2
- sclab/gui/__init__.py +0 -0
- sclab/gui/components/__init__.py +7 -0
- sclab/gui/components/_guided_pseudotime.py +482 -0
- sclab/gui/components/_transfer_metadata.py +186 -0
- sclab/methods/__init__.py +50 -0
- sclab/preprocess/__init__.py +26 -0
- sclab/preprocess/_cca.py +176 -0
- sclab/preprocess/_cca_integrate.py +109 -0
- sclab/preprocess/_filter_obs.py +42 -0
- sclab/preprocess/_harmony.py +421 -0
- sclab/preprocess/_harmony_integrate.py +53 -0
- sclab/preprocess/_normalize_weighted.py +65 -0
- sclab/preprocess/_pca.py +51 -0
- sclab/preprocess/_preprocess.py +155 -0
- sclab/preprocess/_qc.py +38 -0
- sclab/preprocess/_rpca.py +116 -0
- sclab/preprocess/_subset.py +208 -0
- sclab/preprocess/_transfer_metadata.py +196 -0
- sclab/preprocess/_transform.py +82 -0
- sclab/preprocess/_utils.py +96 -0
- sclab/scanpy/__init__.py +0 -0
- sclab/scanpy/_compat.py +92 -0
- sclab/scanpy/_settings.py +526 -0
- sclab/scanpy/logging.py +290 -0
- sclab/scanpy/plotting/__init__.py +0 -0
- sclab/scanpy/plotting/_rcmod.py +73 -0
- sclab/scanpy/plotting/palettes.py +221 -0
- sclab/scanpy/readwrite.py +1108 -0
- sclab/tools/__init__.py +0 -0
- sclab/tools/cellflow/__init__.py +0 -0
- sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
- sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
- sclab/tools/cellflow/pseudotime/__init__.py +0 -0
- sclab/tools/cellflow/pseudotime/_pseudotime.py +336 -0
- sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
- sclab/tools/cellflow/utils/__init__.py +0 -0
- sclab/tools/cellflow/utils/density_nd.py +215 -0
- sclab/tools/cellflow/utils/interpolate.py +334 -0
- sclab/tools/cellflow/utils/periodic_genes.py +106 -0
- sclab/tools/cellflow/utils/smoothen.py +124 -0
- sclab/tools/cellflow/utils/times.py +55 -0
- sclab/tools/differential_expression/__init__.py +7 -0
- sclab/tools/differential_expression/_pseudobulk_edger.py +309 -0
- sclab/tools/differential_expression/_pseudobulk_helpers.py +290 -0
- sclab/tools/differential_expression/_pseudobulk_limma.py +257 -0
- sclab/tools/doublet_detection/__init__.py +5 -0
- sclab/tools/doublet_detection/_scrublet.py +64 -0
- sclab/tools/embedding/__init__.py +0 -0
- sclab/tools/imputation/__init__.py +0 -0
- sclab/tools/imputation/_alra.py +135 -0
- sclab/tools/labeling/__init__.py +6 -0
- sclab/tools/labeling/sctype.py +233 -0
- sclab/tools/utils/__init__.py +5 -0
- sclab/tools/utils/_aggregate_and_filter.py +290 -0
- sclab/utils/__init__.py +5 -0
- sclab/utils/_write_excel.py +510 -0
- {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/METADATA +29 -12
- sclab-0.3.4.dist-info/RECORD +93 -0
- {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/WHEEL +1 -1
- sclab-0.3.4.dist-info/licenses/LICENSE +29 -0
- sclab-0.1.7.dist-info/RECORD +0 -30
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from anndata import AnnData, ImplicitModificationWarning
|
|
6
|
+
from tqdm.auto import tqdm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def preprocess(
    adata: AnnData,
    counts_layer: str = "counts",
    group_by: str | None = None,
    min_cells: int = 5,
    min_genes: int = 5,
    compute_hvg: bool = True,
    regress_total_counts: bool = False,
    regress_n_genes: bool = False,
    normalization_method: Literal["library", "weighted", "none"] = "library",
    target_scale: float = 1e4,
    weighted_norm_quantile: float = 0.9,
    log1p: bool = True,
    scale: bool = True,
):
    """Filter, normalize and transform the counts of ``adata`` in place.

    ``adata.X`` is rebuilt from ``counts_layer`` and then passed through the
    stages below in a fixed order. The resulting matrix is additionally stored
    as a layer named ``counts_layer`` plus one suffix per applied stage
    (``_normt`` or ``_normw``, ``_log1p``, ``_regr``, ``_scale``), and
    ``adata.X`` is finally cast to ``float32``.

    Stages:

    1. Ensure ``layers[counts_layer]`` (copied from ``X`` when missing) and
       ``layers[f"{counts_layer}_log1p"]`` exist.
    2. Compute QC metrics, filter cells by ``min_genes`` and genes by
       ``min_cells``, then recompute the QC metrics.
    3. Optionally flag highly variable genes.
    4. Optionally normalize.
    5. Optionally apply ``log1p``.
    6. Optionally regress out covariates.
    7. Optionally scale (without zero-centering).

    Parameters
    ----------
    adata : AnnData
        Object to process; modified in place.
    counts_layer : str
        Name of the layer holding the counts.
    group_by : str | None
        Column of ``adata.obs``. When given, highly variable genes and scaling
        are computed separately per group, and the column is forwarded as
        ``batch_key`` to the weighted normalization.
    min_cells : int
        Forwarded to ``sc.pp.filter_genes``.
    min_genes : int
        Forwarded to ``sc.pp.filter_cells``.
    compute_hvg : bool
        Write ``adata.var["highly_variable"]`` as the union of the ``seurat``
        selection (on the log1p counts layer) and the ``seurat_v3_paper``
        selection (on the counts layer, asking for as many genes as ``seurat``
        selected). With ``group_by`` a ``highly_variable_<group>`` column is
        written per group and ``highly_variable`` is their union.
    regress_total_counts : bool
        Regress out ``total_counts`` (its ``log1p`` when ``log1p`` is true).
    regress_n_genes : bool
        Regress out ``n_genes_by_counts``.
    normalization_method : {"library", "weighted", "none"}
        ``"library"`` calls ``sc.pp.normalize_total``; ``"weighted"`` calls
        ``normalize_weighted``; any other value leaves ``X`` unnormalized.
    target_scale : float
        Target sum/scale handed to the normalization routine.
    weighted_norm_quantile : float
        Forwarded as ``q`` to ``normalize_weighted``.
    log1p : bool
        Apply ``sc.pp.log1p`` to ``X``.
    scale : bool
        Apply ``sc.pp.scale(..., zero_center=False)``.

    Returns
    -------
    None
    """
    import scanpy as sc

    from ._normalize_weighted import normalize_weighted

    log1p_counts_layer = f"{counts_layer}_log1p"

    # Every stage advances the bar by a fixed amount whether or not it is
    # enabled, so the bar always finishes at 100%.
    with tqdm(total=100, bar_format="{percentage:3.0f}%|{bar}|") as pbar:
        if counts_layer not in adata.layers:
            adata.layers[counts_layer] = adata.X.copy()

        if log1p_counts_layer not in adata.layers:
            adata.layers[log1p_counts_layer] = sc.pp.log1p(
                adata.layers[counts_layer].copy()
            )
        pbar.update(10)

        # Work on a private copy of the counts so the layer stays untouched.
        adata.X = adata.layers[counts_layer].copy()
        sc.pp.calculate_qc_metrics(
            adata,
            percent_top=None,
            log1p=False,
            inplace=True,
        )
        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_genes(adata, min_cells=min_cells)
        pbar.update(10)

        # Recompute so the metrics describe the filtered matrix.
        sc.pp.calculate_qc_metrics(
            adata,
            percent_top=None,
            log1p=False,
            inplace=True,
        )
        pbar.update(10)

        if compute_hvg:
            if group_by is not None:
                adata.var["highly_variable"] = False
                for name, idx in adata.obs.groupby(
                    group_by, observed=True
                ).groups.items():
                    hvg_seurat = sc.pp.highly_variable_genes(
                        adata[idx],
                        layer=log1p_counts_layer,
                        flavor="seurat",
                        inplace=False,
                    )["highly_variable"]

                    # Ask seurat_v3 for as many genes as seurat selected.
                    hvg_seurat_v3 = sc.pp.highly_variable_genes(
                        adata[idx],
                        layer=counts_layer,
                        flavor="seurat_v3_paper",
                        n_top_genes=hvg_seurat.sum(),
                        inplace=False,
                    )["highly_variable"]

                    adata.var[f"highly_variable_{name}"] = hvg_seurat | hvg_seurat_v3
                    adata.var["highly_variable"] |= adata.var[f"highly_variable_{name}"]

            else:
                sc.pp.highly_variable_genes(
                    adata, layer=log1p_counts_layer, flavor="seurat"
                )
                hvg_seurat = adata.var["highly_variable"]

                sc.pp.highly_variable_genes(
                    adata,
                    layer=counts_layer,
                    flavor="seurat_v3_paper",
                    n_top_genes=hvg_seurat.sum(),
                )
                hvg_seurat_v3 = adata.var["highly_variable"]

                adata.var["highly_variable"] = hvg_seurat | hvg_seurat_v3
        pbar.update(20)

        # The output layer name accumulates one suffix per applied stage.
        new_layer = counts_layer
        if normalization_method == "library":
            new_layer += "_normt"
            sc.pp.normalize_total(adata, target_sum=target_scale)
        elif normalization_method == "weighted":
            new_layer += "_normw"
            normalize_weighted(
                adata,
                target_scale=target_scale,
                batch_key=group_by,
                q=weighted_norm_quantile,
            )
        pbar.update(20)

        if log1p:
            new_layer += "_log1p"
            # Drop any earlier log1p record before transforming X again.
            adata.uns.pop("log1p", None)
            sc.pp.log1p(adata)
        pbar.update(10)

        vars_to_regress = []
        if regress_n_genes:
            vars_to_regress.append("n_genes_by_counts")

        if regress_total_counts and log1p:
            # X is on a log scale, so regress the log-scaled covariate.
            adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])
            vars_to_regress.append("log1p_total_counts")
        elif regress_total_counts:
            vars_to_regress.append("total_counts")

        if vars_to_regress:
            new_layer += "_regr"
            sc.pp.regress_out(adata, keys=vars_to_regress, n_jobs=1)
        pbar.update(10)

        if scale:
            new_layer += "_scale"
            if group_by is not None:
                for _, idx in adata.obs.groupby(group_by, observed=True).groups.items():
                    with warnings.catch_warnings():
                        # Writing through the view is intentional here.
                        warnings.filterwarnings(
                            "ignore",
                            category=ImplicitModificationWarning,
                            message="Modifying `X` on a view results in data being overridden",
                        )
                        adata[idx].X = sc.pp.scale(adata[idx].X, zero_center=False)
            else:
                sc.pp.scale(adata, zero_center=False)

        adata.layers[new_layer] = adata.X.copy()

        pbar.update(10)

    adata.X = adata.X.astype(np.float32)
|
sclab/preprocess/_qc.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from anndata import AnnData
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def qc(
    adata: AnnData,
    counts_layer: str = "counts",
    min_counts: int = 50,
    min_genes: int = 5,
    min_cells: int = 5,
    max_rank: int = 0,
):
    """Apply count-based quality-control filters to ``adata`` in place.

    The metrics are computed on ``counts_layer`` while the matrix found in
    ``adata.X`` on entry is parked in a temporary layer. That matrix is put
    back into ``adata.X`` before the optional rank filter and also when one of
    the filtering steps raises.

    Parameters
    ----------
    adata : AnnData
        Object to filter; modified in place.
    counts_layer : str
        Layer holding the counts. Created from ``adata.X`` when missing.
    min_counts : int
        Observations whose summed counts are below this value are removed.
    min_genes : int
        Forwarded to ``sc.pp.filter_cells``.
    min_cells : int
        Forwarded to ``sc.pp.filter_genes``.
    max_rank : int
        When greater than zero, keep only observations whose
        ``barcode_rank`` is strictly smaller than this value.

    Returns
    -------
    None

    Notes
    -----
    ``adata.obs["barcode_rank"]`` is written as the descending rank of
    ``total_counts``.
    """
    import scanpy as sc

    if counts_layer not in adata.layers:
        adata.layers[counts_layer] = adata.X.copy()

    # Park the current X in a layer so it is subset together with the data.
    adata.layers["qc_tmp_current_X"] = adata.X
    try:
        adata.X = adata.layers[counts_layer].copy()
        # ravel() keeps a 1-D result even for a single observation, where
        # squeeze() would collapse to a 0-d array that cannot act as a mask.
        rowsums = np.asarray(adata.X.sum(axis=1)).ravel()

        obs_idx = adata.obs_names[rowsums >= min_counts]
        adata._inplace_subset_obs(obs_idx)

        sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)

        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_genes(adata, min_cells=min_cells)
        # Recompute so the metrics describe the filtered matrix.
        sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
        adata.obs["barcode_rank"] = adata.obs["total_counts"].rank(ascending=False)
    finally:
        # Restore original X, also on failure, so the temporary layer never
        # leaks and a later call cannot overwrite the caller's matrix.
        adata.X = adata.layers.pop("qc_tmp_current_X")

    if max_rank > 0:
        series = adata.obs["barcode_rank"]
        index = series.loc[series < max_rank].index
        adata._inplace_subset_obs(index)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from anndata import AnnData
|
|
3
|
+
from numpy.typing import NDArray
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def rpca(
    adata: AnnData,
    key: str,
    *,
    basis: str = "X",
    adjusted_basis: str | None = None,
    reference_batch: str | list[str] | None = None,
    mask_var: str | None = None,
    n_components: int = 30,
    min_variance_ratio: float = 0.0005,
    svd_solver: str = "arpack",
    normalize: bool = True,
):
    """Project all observations onto per-batch PCA spaces.

    For each selected group of ``adata.obs[key]`` a PCA is fitted on that
    group's rows via ``pca_projection`` and every observation is projected
    onto it. Per group ``gr`` the function writes:

    * ``adata.obsm[f"{adjusted_basis}_{gr}"]`` - the projected coordinates;
    * ``adata.varm[f"{adjusted_basis}_{gr}_PCs"]`` - the loadings, with zeros
      for variables excluded by ``mask_var``;
    * ``adata.uns[adjusted_basis][gr]`` - number of retained components and
      their explained variance (ratio).

    Parameters
    ----------
    adata : AnnData
        Object to read from and annotate in place.
    key : str
        Column of ``adata.obs`` defining the batches.
    basis : str
        ``"X"``, a layer name or an ``obsm`` key; ``None`` is treated as
        ``"X"``.
    adjusted_basis : str | None
        Prefix of the output keys; defaults to ``basis + "_rpca"``.
    reference_batch : str | list[str] | None
        Batch or batches to fit on; all batches when ``None``.
    mask_var : str | None
        Boolean column of ``adata.var`` selecting the variables to use.
    n_components, min_variance_ratio, svd_solver, normalize
        Forwarded to ``pca_projection``.

    Returns
    -------
    None
    """
    if basis is None:
        basis = "X"
    if adjusted_basis is None:
        adjusted_basis = basis + "_rpca"

    var_mask = (
        np.ones(adata.n_vars, dtype=bool)
        if mask_var is None
        else adata.var[mask_var].values
    )

    # NOTE(review): when `basis` names an obsm entry the variable mask
    # presumably does not change its columns, so the loadings scatter below
    # only fits if its width equals the number of masked variables - confirm.
    X_all = _get_basis(adata[:, var_mask], basis)

    batches = adata.obs.groupby(key, observed=True).groups
    if reference_batch is None:
        reference_batch = list(batches.keys())
    elif isinstance(reference_batch, str):
        reference_batch = [reference_batch]

    selected = (
        (batch, rows) for batch, rows in batches.items() if batch in reference_batch
    )

    summary = {}
    for batch, rows in selected:
        scores, loadings, variance_ratio, variance = pca_projection(
            X_all,
            _get_basis(adata[rows, var_mask], basis),
            n_components=n_components,
            min_variance_ratio=min_variance_ratio,
            svd_solver=svd_solver,
            normalize=normalize,
        )
        n_kept = scores.shape[1]

        # Expand the loadings back to all variables; masked-out ones stay 0.
        full_loadings = np.zeros((n_kept, adata.n_vars))
        full_loadings[:, var_mask] = loadings

        adata.obsm[f"{adjusted_basis}_{batch}"] = scores
        adata.varm[f"{adjusted_basis}_{batch}_PCs"] = full_loadings.T

        summary[batch] = {
            "n_components": n_kept,
            "explained_variance_ratio": variance_ratio,
            "explained_variance": variance,
        }

    adata.uns[adjusted_basis] = summary
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def pca_projection(
    X: NDArray,
    X_reference: NDArray,
    n_components: int = 30,
    min_variance_ratio: float = 0.0005,
    svd_solver: str = "arpack",
    normalize: bool = False,
) -> tuple[NDArray, NDArray, NDArray, NDArray]:
    """Fit a PCA on ``X_reference`` and project ``X`` onto its components.

    Parameters
    ----------
    X : NDArray
        Matrix to project; must provide ``.dot``.
    X_reference : NDArray
        Matrix the PCA is fitted on with ``sc.pp.pca``.
    n_components : int
        Number of components requested from the PCA.
    min_variance_ratio : float
        Components whose explained-variance ratio is not strictly greater
        than this value are dropped.
    svd_solver : str
        Forwarded to ``sc.pp.pca``.
    normalize : bool
        Divide every projected row by its Euclidean norm.

    Returns
    -------
    tuple
        ``(X_pca, components, explained_variance_ratio, explained_variance)``
        restricted to the retained components.
    """
    import scanpy as sc

    _, loadings, variance_ratio, variance = sc.pp.pca(
        X_reference,
        n_comps=n_components,
        svd_solver=svd_solver,
        return_info=True,
    )

    # Keep only the components that explain enough variance.
    keep = variance_ratio > min_variance_ratio
    loadings, variance_ratio, variance = (
        loadings[keep],
        variance_ratio[keep],
        variance[keep],
    )

    projected = X.dot(loadings.T)
    if normalize:
        row_norms = np.linalg.norm(projected, axis=1, keepdims=True)
        projected = projected / row_norms

    return projected, loadings, variance_ratio, variance
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _get_basis(adata: AnnData, basis: str):
    """Return the matrix that ``basis`` refers to in ``adata``.

    Lookup order: the literal ``"X"`` (``adata.X``), then ``adata.layers``,
    then ``adata.obsm``.

    Raises
    ------
    ValueError
        If ``basis`` matches none of the above.
    """
    if basis == "X":
        return adata.X

    if basis in adata.layers:
        return adata.layers[basis]

    if basis in adata.obsm:
        return adata.obsm[basis]

    raise ValueError(f"Unknown basis {basis}")
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
from typing import Sequence
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from anndata import AnnData
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def subset_obs(
|
|
9
|
+
adata: AnnData,
|
|
10
|
+
subset: pd.Index | Sequence[str | int | bool] | str,
|
|
11
|
+
) -> None:
|
|
12
|
+
"""Subset observations (rows) in an AnnData object.
|
|
13
|
+
|
|
14
|
+
This function modifies the AnnData object in-place by selecting a subset of observations
|
|
15
|
+
based on the provided subset parameter. The subsetting can be done using observation
|
|
16
|
+
names, integer indices, a boolean mask, a query string, or a pandas Index.
|
|
17
|
+
|
|
18
|
+
Parameters
|
|
19
|
+
----------
|
|
20
|
+
adata : AnnData
|
|
21
|
+
The annotated data matrix to subset. Will be modified in-place.
|
|
22
|
+
subset : pd.Index | Sequence[str | int | bool] | str
|
|
23
|
+
The subset specification. Can be one of:
|
|
24
|
+
* A pandas Index containing observation names
|
|
25
|
+
* A sequence of observation names (strings)
|
|
26
|
+
* A sequence of integer indices
|
|
27
|
+
* A boolean mask of length `adata.n_obs`
|
|
28
|
+
* A query string to match observations by their metadata columns
|
|
29
|
+
|
|
30
|
+
Examples
|
|
31
|
+
--------
|
|
32
|
+
>>> # Create an example AnnData object
|
|
33
|
+
>>> import anndata
|
|
34
|
+
>>> import pandas as pd
|
|
35
|
+
>>> import numpy as np
|
|
36
|
+
>>>
|
|
37
|
+
>>> obs = pd.DataFrame(
|
|
38
|
+
... index=['A', 'B', 'C'],
|
|
39
|
+
... data={'cell_type': ['type1', 'type2', 'type2']})
|
|
40
|
+
>>> adata_ = anndata.AnnData(obs=obs)
|
|
41
|
+
>>>
|
|
42
|
+
>>> # Subset using pandas Index
|
|
43
|
+
>>> adata = adata_.copy()
|
|
44
|
+
>>> subset_obs(adata, pd.Index(['B', 'C']))
|
|
45
|
+
>>> adata.obs_names.tolist()
|
|
46
|
+
['B', 'C']
|
|
47
|
+
>>>
|
|
48
|
+
>>> # Subset using observation names
|
|
49
|
+
>>> adata = adata_.copy()
|
|
50
|
+
>>> subset_obs(adata, ['A', 'B'])
|
|
51
|
+
>>> adata.obs_names.tolist()
|
|
52
|
+
['A', 'B']
|
|
53
|
+
>>>
|
|
54
|
+
>>> # Subset using integer indices
|
|
55
|
+
>>> adata = adata_.copy()
|
|
56
|
+
>>> subset_obs(adata, [0, 1])
|
|
57
|
+
>>> adata.obs_names.tolist()
|
|
58
|
+
['A', 'B']
|
|
59
|
+
>>>
|
|
60
|
+
>>> # Subset using boolean mask
|
|
61
|
+
>>> adata = adata_.copy()
|
|
62
|
+
>>> subset_obs(adata, [True, False, True])
|
|
63
|
+
>>> adata.obs_names.tolist()
|
|
64
|
+
['A', 'C']
|
|
65
|
+
>>>
|
|
66
|
+
>>> # Subset using query string
|
|
67
|
+
>>> adata = adata_.copy()
|
|
68
|
+
>>> subset_obs(adata, 'cell_type == "type2"')
|
|
69
|
+
>>> adata.obs_names.tolist()
|
|
70
|
+
['B', 'C']
|
|
71
|
+
|
|
72
|
+
Notes
|
|
73
|
+
-----
|
|
74
|
+
- The function modifies the AnnData object in-place
|
|
75
|
+
- When using a boolean mask, its length must match the number of observations
|
|
76
|
+
- When using integer indices, they must be valid indices for the observations
|
|
77
|
+
- Invalid observation names or indices will raise KeyError or IndexError respectively
|
|
78
|
+
- The order of observations in the output will match the order in the subset parameter
|
|
79
|
+
"""
|
|
80
|
+
if isinstance(subset, str):
|
|
81
|
+
subset = adata.obs.query(subset).index
|
|
82
|
+
|
|
83
|
+
if not isinstance(subset, pd.Index):
|
|
84
|
+
subset = np.asarray(subset)
|
|
85
|
+
|
|
86
|
+
# Handle boolean mask
|
|
87
|
+
if subset.dtype.kind == "b":
|
|
88
|
+
if len(subset) != adata.n_obs:
|
|
89
|
+
raise IndexError(
|
|
90
|
+
f"Boolean mask length ({len(subset)}) does not match number of "
|
|
91
|
+
f"observations ({adata.n_obs})"
|
|
92
|
+
)
|
|
93
|
+
subset = adata.obs_names[subset]
|
|
94
|
+
|
|
95
|
+
# Handle integer indices
|
|
96
|
+
elif subset.dtype.kind in "iu":
|
|
97
|
+
if np.any(subset < 0) or np.any(subset >= adata.n_obs):
|
|
98
|
+
raise IndexError(f"Integer indices must be between 0 and {adata.n_obs - 1}")
|
|
99
|
+
subset = adata.obs_names[subset]
|
|
100
|
+
|
|
101
|
+
if adata.n_obs == subset.size and (subset == adata.obs_names).all():
|
|
102
|
+
# No need to subset, avoid making a copy. Useful for large AnnData objects
|
|
103
|
+
return
|
|
104
|
+
|
|
105
|
+
adata._inplace_subset_obs(subset)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def subset_var(
|
|
109
|
+
adata: AnnData,
|
|
110
|
+
subset: pd.Index | Sequence[str | int | bool] | str,
|
|
111
|
+
) -> None:
|
|
112
|
+
"""Subset variables (columns) in an AnnData object.
|
|
113
|
+
|
|
114
|
+
This function modifies the AnnData object in-place by selecting a subset of variables
|
|
115
|
+
based on the provided subset parameter. The subsetting can be done using variable
|
|
116
|
+
names, integer indices, a boolean mask, a query string, or a pandas Index.
|
|
117
|
+
|
|
118
|
+
Parameters
|
|
119
|
+
----------
|
|
120
|
+
adata : AnnData
|
|
121
|
+
The annotated data matrix to subset. Will be modified in-place.
|
|
122
|
+
subset : pd.Index | Sequence[str | int | bool] | str
|
|
123
|
+
The subset specification. Can be one of:
|
|
124
|
+
* A pandas Index containing variable names
|
|
125
|
+
* A sequence of variable names (strings)
|
|
126
|
+
* A sequence of integer indices
|
|
127
|
+
* A boolean mask of length `adata.n_vars`
|
|
128
|
+
* A query string to match variables by their metadata columns
|
|
129
|
+
|
|
130
|
+
Examples
|
|
131
|
+
--------
|
|
132
|
+
>>> # Create an example AnnData object
|
|
133
|
+
>>> import anndata
|
|
134
|
+
>>> import pandas as pd
|
|
135
|
+
>>> import numpy as np
|
|
136
|
+
>>>
|
|
137
|
+
>>> var = pd.DataFrame(
|
|
138
|
+
... index=['gene1', 'gene2', 'gene3'],
|
|
139
|
+
... data={'gene_type': ['type1', 'type2', 'type1']})
|
|
140
|
+
>>> adata_ = anndata.AnnData(var=var)
|
|
141
|
+
>>>
|
|
142
|
+
>>> # Subset using pandas Index
|
|
143
|
+
>>> adata = adata_.copy()
|
|
144
|
+
>>> subset_var(adata, pd.Index(['gene2', 'gene3']))
|
|
145
|
+
>>> adata.var_names.tolist()
|
|
146
|
+
['gene2', 'gene3']
|
|
147
|
+
>>>
|
|
148
|
+
>>> # Subset using variable names
|
|
149
|
+
>>> adata = adata_.copy()
|
|
150
|
+
>>> subset_var(adata, ['gene1', 'gene2'])
|
|
151
|
+
>>> adata.var_names.tolist()
|
|
152
|
+
['gene1', 'gene2']
|
|
153
|
+
>>>
|
|
154
|
+
>>> # Subset using integer indices
|
|
155
|
+
>>> adata = adata_.copy()
|
|
156
|
+
>>> subset_var(adata, [0, 1])
|
|
157
|
+
>>> adata.var_names.tolist()
|
|
158
|
+
['gene1', 'gene2']
|
|
159
|
+
>>>
|
|
160
|
+
>>> # Subset using boolean mask
|
|
161
|
+
>>> adata = adata_.copy()
|
|
162
|
+
>>> subset_var(adata, [True, False, True])
|
|
163
|
+
>>> adata.var_names.tolist()
|
|
164
|
+
['gene1', 'gene3']
|
|
165
|
+
>>>
|
|
166
|
+
>>> # Subset using query string
|
|
167
|
+
>>> adata = adata_.copy()
|
|
168
|
+
>>> subset_var(adata, 'gene_type == "type1"')
|
|
169
|
+
>>> adata.var_names.tolist()
|
|
170
|
+
['gene1', 'gene3']
|
|
171
|
+
|
|
172
|
+
Notes
|
|
173
|
+
-----
|
|
174
|
+
- The function modifies the AnnData object in-place
|
|
175
|
+
- When using a boolean mask, its length must match the number of variables
|
|
176
|
+
- When using integer indices, they must be valid indices for the variables
|
|
177
|
+
- Invalid variable names or indices will raise KeyError or IndexError respectively
|
|
178
|
+
- The order of variables in the output will match the order in the subset parameter
|
|
179
|
+
"""
|
|
180
|
+
|
|
181
|
+
if isinstance(subset, str):
|
|
182
|
+
subset = adata.var.query(subset).index
|
|
183
|
+
|
|
184
|
+
if not isinstance(subset, pd.Index):
|
|
185
|
+
subset = np.asarray(subset)
|
|
186
|
+
|
|
187
|
+
# Handle boolean mask
|
|
188
|
+
if subset.dtype.kind == "b":
|
|
189
|
+
if len(subset) != adata.n_vars:
|
|
190
|
+
raise IndexError(
|
|
191
|
+
f"Boolean mask length ({len(subset)}) does not match number of "
|
|
192
|
+
f"variables ({adata.n_vars})"
|
|
193
|
+
)
|
|
194
|
+
subset = adata.var_names[subset]
|
|
195
|
+
|
|
196
|
+
# Handle integer indices
|
|
197
|
+
elif subset.dtype.kind in "iu":
|
|
198
|
+
if np.any(subset < 0) or np.any(subset >= adata.n_vars):
|
|
199
|
+
raise IndexError(
|
|
200
|
+
f"Integer indices must be between 0 and {adata.n_vars - 1}"
|
|
201
|
+
)
|
|
202
|
+
subset = adata.var_names[subset]
|
|
203
|
+
|
|
204
|
+
if adata.n_vars == subset.size and (subset == adata.var_names).all():
|
|
205
|
+
# No need to subset, avoid making a copy. Useful for large AnnData objects
|
|
206
|
+
return
|
|
207
|
+
|
|
208
|
+
adata._inplace_subset_var(subset)
|