sclab 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sclab might be problematic. Click here for more details.
- sclab/__init__.py +1 -1
- sclab/_sclab.py +7 -3
- sclab/dataset/_dataset.py +1 -1
- sclab/dataset/processor/_processor.py +19 -4
- sclab/examples/processor_steps/__init__.py +2 -0
- sclab/examples/processor_steps/_doublet_detection.py +68 -0
- sclab/examples/processor_steps/_integration.py +47 -20
- sclab/examples/processor_steps/_neighbors.py +24 -4
- sclab/examples/processor_steps/_pca.py +11 -6
- sclab/examples/processor_steps/_preprocess.py +14 -1
- sclab/examples/processor_steps/_qc.py +22 -6
- sclab/gui/__init__.py +0 -0
- sclab/gui/components/__init__.py +7 -0
- sclab/gui/components/_guided_pseudotime.py +482 -0
- sclab/gui/components/_transfer_metadata.py +186 -0
- sclab/methods/__init__.py +16 -0
- sclab/preprocess/__init__.py +19 -0
- sclab/preprocess/_cca.py +154 -0
- sclab/preprocess/_cca_integrate.py +109 -0
- sclab/preprocess/_filter_obs.py +42 -0
- sclab/preprocess/_harmony.py +421 -0
- sclab/preprocess/_harmony_integrate.py +53 -0
- sclab/preprocess/_normalize_weighted.py +61 -0
- sclab/preprocess/_subset.py +208 -0
- sclab/preprocess/_transfer_metadata.py +137 -0
- sclab/preprocess/_transform.py +82 -0
- sclab/preprocess/_utils.py +96 -0
- sclab/tools/__init__.py +0 -0
- sclab/tools/cellflow/__init__.py +0 -0
- sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
- sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
- sclab/tools/cellflow/pseudotime/__init__.py +0 -0
- sclab/tools/cellflow/pseudotime/_pseudotime.py +332 -0
- sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
- sclab/tools/cellflow/utils/__init__.py +0 -0
- sclab/tools/cellflow/utils/density_nd.py +215 -0
- sclab/tools/cellflow/utils/interpolate.py +334 -0
- sclab/tools/cellflow/utils/smoothen.py +124 -0
- sclab/tools/cellflow/utils/times.py +55 -0
- sclab/tools/differential_expression/__init__.py +5 -0
- sclab/tools/differential_expression/_pseudobulk_edger.py +304 -0
- sclab/tools/differential_expression/_pseudobulk_helpers.py +277 -0
- sclab/tools/doublet_detection/__init__.py +5 -0
- sclab/tools/doublet_detection/_scrublet.py +64 -0
- sclab/tools/labeling/__init__.py +6 -0
- sclab/tools/labeling/sctype.py +233 -0
- sclab/utils/__init__.py +5 -0
- sclab/utils/_write_excel.py +510 -0
- {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/METADATA +6 -2
- sclab-0.3.1.dist-info/RECORD +82 -0
- sclab-0.2.5.dist-info/RECORD +0 -45
- {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/WHEEL +0 -0
- {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from anndata import AnnData
|
|
7
|
+
from numpy.typing import NDArray
|
|
8
|
+
from scipy import stats
|
|
9
|
+
from scipy.sparse import csc_matrix, csr_matrix, issparse
|
|
10
|
+
|
|
11
|
+
from ...preprocess import pool_neighbors
|
|
12
|
+
|
|
13
|
+
# Module-level logger, named after this module, used for progress and
# low-confidence messages emitted by the functions below.
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _get_classification_scores_matrix(
    adata: AnnData,
    markers: pd.DataFrame,
    marker_class_key: str,
    neighbors_key: Optional[str] = None,
    weighted_pooling: bool = False,
    directed_pooling: bool = True,
    layer: Optional[str] = None,
    penalize_non_specific: bool = True,
):
    """
    Compute a per-cell, per-class marker score matrix.

    Ianevski, A., Giri, A.K. & Aittokallio, T.
    Fully-automated and ultra-fast cell-type identification using specific
    marker combinations from single-cell transcriptomic data.
    Nat Commun 13, 1246 (2022).
    https://doi.org/10.1038/s41467-022-28803-w

    Parameters
    ----------
    adata
        AnnData object holding the expression values.
    markers
        Marker table. Must contain the columns `names` (gene names),
        `logfoldchanges` (sign selects up/down markers) and
        `marker_class_key` (categorical class labels). Not modified.
    marker_class_key
        Categorical column in `markers` holding the class of each marker.
    neighbors_key
        If provided, the expression values are first passed through
        `pool_neighbors` with this key.
    weighted_pooling
        Forwarded to `pool_neighbors` as `weighted`.
    directed_pooling
        Forwarded to `pool_neighbors` as `directed`.
    layer
        Layer to read expression values from. Defaults to `adata.X`.
    penalize_non_specific
        If `True`, genes that are markers of many classes are down-weighted
        (min-max scaled) and the genes with zero weight are dropped.

    Returns
    -------
    Array of shape `(adata.shape[0], n_classes)` where `n_classes` is the
    number of categories of `markers[marker_class_key]`. The column of a
    class named "Unknown" is left at zero.
    """
    expression = adata.layers[layer] if layer is not None else adata.X

    # A gene counts as detected in a cell when its value is above the global
    # minimum of the matrix; only genes detected in more than 5 cells are used.
    min_val: np.number = expression.min()
    detected = expression > min_val
    # ravel (not squeeze) so a single-gene matrix still yields a 1-D mask
    n_cells = np.asarray(detected.sum(axis=0)).ravel()
    mask = n_cells > 5
    logger.info("using %s genes", mask.sum())

    markers = markers.loc[markers["names"].isin(adata.var_names[mask])].copy()
    classes = markers[marker_class_key].cat.categories

    # Number of marker rows (classes) every gene appears in.
    x = markers[[marker_class_key, "names"]].groupby("names").count()[marker_class_key]
    uniform_weights = x * 0.0 + 1.0
    if penalize_non_specific:
        span = x.max() - x.min()
        if span > 0:
            S = 1.0 - (x - x.min()) / span
            S = S[S > 0]
        else:
            # All genes are equally specific (or there are no genes). The
            # min-max scaling would be 0/0 = NaN and the filter above would
            # discard every marker, so fall back to uniform weights.
            S = uniform_weights
    else:
        S = uniform_weights

    X: NDArray | csr_matrix | csc_matrix
    if neighbors_key is not None:
        X = pool_neighbors(
            adata[:, S.index],
            layer=layer,
            neighbors_key=neighbors_key,
            weighted=weighted_pooling,
            directed=directed_pooling,
            copy=True,
        )
    elif layer is not None:
        X = adata[:, S.index].layers[layer].copy()
    else:
        X = adata[:, S.index].X.copy()

    if issparse(X):
        X = np.asarray(X.todense("C"))

    # Standardize every gene across cells, then apply the specificity weights.
    Z: NDArray
    Z = stats.zscore(X, axis=0)
    Xp = Z * S.values

    Xc = np.zeros((adata.shape[0], len(classes)))
    for c, cell_class in enumerate(classes):
        if cell_class == "Unknown":
            continue
        in_class = markers[marker_class_key] == cell_class
        up_genes = markers.loc[in_class & (markers["logfoldchanges"] > 0), "names"]
        dw_genes = markers.loc[in_class & (markers["logfoldchanges"] < 0), "names"]
        x_up = Xp[:, S.index.isin(up_genes)]
        x_dw = Xp[:, S.index.isin(dw_genes)]
        # Up markers add to the class score, down markers subtract from it;
        # each sum is normalized by the square root of the marker count.
        if len(up_genes) > 0:
            Xc[:, c] += x_up.sum(axis=1) / np.sqrt(len(up_genes))
        if len(dw_genes) > 0:
            Xc[:, c] -= x_dw.sum(axis=1) / np.sqrt(len(dw_genes))

    return Xc
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def classify_cells(
    adata: AnnData,
    markers: pd.DataFrame,
    marker_class_key: Optional[str] = None,
    cluster_key: Optional[str] = None,
    layer: Optional[str] = None,
    key_added: Optional[str] = None,
    threshold: float = 0.25,
    penalize_non_specific: bool = True,
    neighbors_key: Optional[str] = None,
    save_scores: bool = False,
):
    """
    Classify cells based on a set of marker genes.

    Ianevski, A., Giri, A.K. & Aittokallio, T.
    Fully-automated and ultra-fast cell-type identification using specific
    marker combinations from single-cell transcriptomic data.
    Nat Commun 13, 1246 (2022).
    https://doi.org/10.1038/s41467-022-28803-w

    Parameters
    ----------
    adata
        AnnData object. Modified in place (see "Results" below).
    markers
        Marker genes. Must contain the columns `names` and `logfoldchanges`
        plus the class column. The caller's DataFrame is not modified.
    marker_class_key
        Column in `markers` that contains the cell type information. If not
        provided, `markers` must have exactly one categorical column, which
        is then used.
    cluster_key
        Categorical column in `adata.obs` that contains the cluster
        information. If provided, every cluster receives a single label and
        its confidence is the best summed class score divided by the number
        of cells in the cluster. If not provided, the classification is
        performed on a cell by cell basis.
    layer
        Layer to use for classification. Defaults to `X`.
    key_added
        Key under which to add the classification information. Defaults to
        `marker_class_key`.
    threshold
        Confidence threshold for classification. Defaults to `0.25`. In cell
        by cell mode the best score of a cell is compared against
        `threshold * n_neighbors`, with `n_neighbors` read from
        `adata.uns[neighbors_key]["params"]["n_neighbors"]` (1 when
        `neighbors_key` is `None`).
    penalize_non_specific
        Whether to penalize non-specific markers. Defaults to `True`.
    neighbors_key
        If provided, expression values are pooled across neighbor cells
        before scoring, using the neighbors information stored under this
        key. Defaults to `None` (no pooling).
    save_scores
        Whether to save the classification scores. Defaults to `False`

    Results
    -------
    Nothing is returned. The following entries are written to `adata`:

    - `adata.obs[key_added]`: labels, missing where the confidence is below
      the threshold.
    - `adata.obs[key_added + "_noNA"]`: best-scoring label for every cell,
      regardless of the threshold.
    - `adata.obs[key_added + "_score"]` and `adata.obsm[key_added + "_scores"]`:
      best score per cell and the full score matrix (only if `save_scores`).
    """
    # cite("10.1038/s41467-022-28803-w", __package__)

    if marker_class_key is not None:
        marker_class = markers[marker_class_key]
        if not isinstance(marker_class.dtype, pd.CategoricalDtype):
            # Work on a copy so the caller's DataFrame keeps its dtypes.
            markers = markers.copy()
            markers[marker_class_key] = marker_class.astype("category")
    else:
        col_mask = markers.dtypes == "category"
        assert col_mask.sum() == 1, (
            "markers_df must have exactly one column of type 'category'"
        )
        # Read the name from the column index; squeezing the one-column frame
        # would yield a scalar (without `.name`) when there is a single row.
        marker_class_key = markers.columns[col_mask.to_numpy()][0]

    classes = markers[marker_class_key].cat.categories
    dtype = markers[marker_class_key].dtype

    # if doing cell by cell classification, we should pool counts to use cell
    # neighborhood information. This allows to estimate the confidence of the
    # classification. We specify pooling by providing a neighbors_key.
    # Up and down markers are scored separately so that each set gets its own
    # gene filtering and specificity weights, then the two are combined.
    score_kwargs = dict(
        weighted_pooling=True,
        directed_pooling=True,
        layer=layer,
        penalize_non_specific=penalize_non_specific,
    )
    posXc = _get_classification_scores_matrix(
        adata,
        markers.query("logfoldchanges > 0"),
        marker_class_key,
        neighbors_key,
        **score_kwargs,
    )
    negXc = _get_classification_scores_matrix(
        adata,
        markers.query("logfoldchanges < 0"),
        marker_class_key,
        neighbors_key,
        **score_kwargs,
    )
    Xc = posXc + negXc

    if cluster_key is not None:
        clusters = adata.obs[cluster_key]
        mappings = {}
        mappings_nona = {}
        for c in clusters.cat.categories:
            cluster_scores_matrix = Xc[clusters == c]
            n_cells_in_cluster = cluster_scores_matrix.shape[0]
            if n_cells_in_cluster == 0:
                # Unused category: nothing to label, and the confidence
                # below would be 0/0.
                continue

            scores = cluster_scores_matrix.sum(axis=0)
            best_class = classes[np.argmax(scores)]
            confidence = scores.max() / n_cells_in_cluster
            if confidence >= threshold:
                mappings[c] = best_class
            else:
                mappings[c] = pd.NA
                logger.warning(
                    f"Cluster {str(c):>5} classified as Unknown with confidence score {confidence: 8.2f}"
                )
            mappings_nona[c] = best_class
        classifications = clusters.map(mappings).astype(dtype)
        classifications_nona = clusters.map(mappings_nona).astype(dtype)
    else:
        if neighbors_key is not None:
            n_neigs = adata.uns[neighbors_key]["params"]["n_neighbors"]
        else:
            n_neigs = 1
        index = adata.obs_names
        classifications = classes.values[Xc.argmax(axis=1)]
        classifications = pd.Series(classifications, index=index).astype(dtype)
        classifications_nona = classifications.copy()
        classifications.loc[Xc.max(axis=1) < threshold * n_neigs] = pd.NA

    N = len(classifications)
    n_unknowns = pd.isna(classifications).sum()
    n_estimated = N - n_unknowns

    logger.info(f"Estimated types for {n_estimated} cells ({n_estimated / N:.2%})")

    if key_added is None:
        key_added = marker_class_key

    adata.obs[key_added] = classifications
    adata.obs[key_added + "_noNA"] = classifications_nona

    if save_scores:
        adata.obs[key_added + "_score"] = Xc.max(axis=1)
        adata.obsm[key_added + "_scores"] = Xc
|