sclab 0.1.7__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sclab/__init__.py +3 -1
- sclab/_io.py +83 -12
- sclab/_methods_registry.py +65 -0
- sclab/_sclab.py +241 -21
- sclab/dataset/_dataset.py +4 -6
- sclab/dataset/processor/_processor.py +41 -19
- sclab/dataset/processor/_results_panel.py +94 -0
- sclab/dataset/processor/step/_processor_step_base.py +12 -6
- sclab/examples/processor_steps/__init__.py +8 -0
- sclab/examples/processor_steps/_cluster.py +2 -2
- sclab/examples/processor_steps/_differential_expression.py +329 -0
- sclab/examples/processor_steps/_doublet_detection.py +68 -0
- sclab/examples/processor_steps/_gene_expression.py +125 -0
- sclab/examples/processor_steps/_integration.py +116 -0
- sclab/examples/processor_steps/_neighbors.py +26 -6
- sclab/examples/processor_steps/_pca.py +13 -8
- sclab/examples/processor_steps/_preprocess.py +52 -25
- sclab/examples/processor_steps/_qc.py +24 -8
- sclab/examples/processor_steps/_umap.py +2 -2
- sclab/gui/__init__.py +0 -0
- sclab/gui/components/__init__.py +7 -0
- sclab/gui/components/_guided_pseudotime.py +482 -0
- sclab/gui/components/_transfer_metadata.py +186 -0
- sclab/methods/__init__.py +50 -0
- sclab/preprocess/__init__.py +26 -0
- sclab/preprocess/_cca.py +176 -0
- sclab/preprocess/_cca_integrate.py +109 -0
- sclab/preprocess/_filter_obs.py +42 -0
- sclab/preprocess/_harmony.py +421 -0
- sclab/preprocess/_harmony_integrate.py +53 -0
- sclab/preprocess/_normalize_weighted.py +65 -0
- sclab/preprocess/_pca.py +51 -0
- sclab/preprocess/_preprocess.py +155 -0
- sclab/preprocess/_qc.py +38 -0
- sclab/preprocess/_rpca.py +116 -0
- sclab/preprocess/_subset.py +208 -0
- sclab/preprocess/_transfer_metadata.py +196 -0
- sclab/preprocess/_transform.py +82 -0
- sclab/preprocess/_utils.py +96 -0
- sclab/scanpy/__init__.py +0 -0
- sclab/scanpy/_compat.py +92 -0
- sclab/scanpy/_settings.py +526 -0
- sclab/scanpy/logging.py +290 -0
- sclab/scanpy/plotting/__init__.py +0 -0
- sclab/scanpy/plotting/_rcmod.py +73 -0
- sclab/scanpy/plotting/palettes.py +221 -0
- sclab/scanpy/readwrite.py +1108 -0
- sclab/tools/__init__.py +0 -0
- sclab/tools/cellflow/__init__.py +0 -0
- sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
- sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
- sclab/tools/cellflow/pseudotime/__init__.py +0 -0
- sclab/tools/cellflow/pseudotime/_pseudotime.py +336 -0
- sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
- sclab/tools/cellflow/utils/__init__.py +0 -0
- sclab/tools/cellflow/utils/density_nd.py +215 -0
- sclab/tools/cellflow/utils/interpolate.py +334 -0
- sclab/tools/cellflow/utils/periodic_genes.py +106 -0
- sclab/tools/cellflow/utils/smoothen.py +124 -0
- sclab/tools/cellflow/utils/times.py +55 -0
- sclab/tools/differential_expression/__init__.py +7 -0
- sclab/tools/differential_expression/_pseudobulk_edger.py +309 -0
- sclab/tools/differential_expression/_pseudobulk_helpers.py +290 -0
- sclab/tools/differential_expression/_pseudobulk_limma.py +257 -0
- sclab/tools/doublet_detection/__init__.py +5 -0
- sclab/tools/doublet_detection/_scrublet.py +64 -0
- sclab/tools/embedding/__init__.py +0 -0
- sclab/tools/imputation/__init__.py +0 -0
- sclab/tools/imputation/_alra.py +135 -0
- sclab/tools/labeling/__init__.py +6 -0
- sclab/tools/labeling/sctype.py +233 -0
- sclab/tools/utils/__init__.py +5 -0
- sclab/tools/utils/_aggregate_and_filter.py +290 -0
- sclab/utils/__init__.py +5 -0
- sclab/utils/_write_excel.py +510 -0
- {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/METADATA +29 -12
- sclab-0.3.4.dist-info/RECORD +93 -0
- {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/WHEEL +1 -1
- sclab-0.3.4.dist-info/licenses/LICENSE +29 -0
- sclab-0.1.7.dist-info/RECORD +0 -30
@@ -0,0 +1,290 @@
+import random
+
+import numpy as np
+import pandas as pd
+from anndata import AnnData
+from numpy import ndarray
+from scipy.sparse import csr_matrix, issparse
+
+
+# code inspired from
+# https://www.sc-best-practices.org/conditions/differential_gene_expression.html
+def aggregate_and_filter(
+    adata: AnnData,
+    group_key: str = "batch",
+    cell_identity_key: str | None = None,
+    layer: str | None = None,
+    replicas_per_group: int = 3,
+    min_cells_per_group: int = 30,
+    bootstrap_sampling: bool = False,
+    use_cells: dict[str, list[str]] | None = None,
+    make_stats: bool = True,
+    make_dummies: bool = True,
+) -> AnnData:
+    """
+    Aggregate and filter cells in an AnnData object into cell populations.
+
+    Parameters
+    ----------
+    adata : AnnData
+        AnnData object to aggregate and filter.
+    group_key : str, optional
+        Key to group cells by. Defaults to 'batch'.
+    cell_identity_key : str, optional
+        Key to use to identify cell identities. Defaults to None.
+    layer : str, optional
+        Layer in AnnData object to use for aggregation. Defaults to None.
+    replicas_per_group : int, optional
+        Number of replicas to create for each group. Defaults to 3.
+    min_cells_per_group : int, optional
+        Minimum number of cells required for a group to be included. Defaults to 30.
+    bootstrap_sampling : bool, optional
+        Whether to use bootstrap sampling to create replicas. Defaults to False.
+    use_cells : dict[str, list[str]], optional
+        If not None, only use the specified cells. Defaults to None.
+    make_stats : bool, optional
+        Whether to create expression statistics for each group. Defaults to True.
+    make_dummies : bool, optional
+        Whether to make categorical columns into dummies. Defaults to True.
+
+    Returns
+    -------
+    AnnData
+        AnnData object with aggregated and filtered cells.
+    """
+    adata = _prepare_dataset(adata, use_cells)
+
+    grouping_keys = [group_key]
+    if cell_identity_key is not None:
+        grouping_keys.append(cell_identity_key)
+
+    groups_to_drop = _get_groups_to_drop(adata, grouping_keys, min_cells_per_group)
+
+    _prepare_categorical_column(adata, group_key)
+    group_dtype = adata.obs[group_key].dtype
+
+    if cell_identity_key is not None:
+        _prepare_categorical_column(adata, cell_identity_key)
+        cell_identity_dtype = adata.obs[cell_identity_key].dtype
+
+    if make_stats:
+        var_dataframe = _create_var_dataframe(
+            adata, layer, grouping_keys, groups_to_drop
+        )
+    else:
+        var_dataframe = pd.DataFrame(index=adata.var_names)
+
+    data = {}
+    meta = {}
+    groups = adata.obs.groupby(grouping_keys, observed=True).groups
+    for group, group_idxs in groups.items():
+        if not isinstance(group, tuple):
+            group = (group,)
+
+        if not _including(group, groups_to_drop):
+            continue
+
+        sample_id = "_".join(group)
+        match group:
+            case (gid, cid):
+                group_metadata = {group_key: gid, cell_identity_key: cid}
+            case (gid,):
+                group_metadata = {group_key: gid}
+
+        adata_group = adata[group_idxs]
+        indices = _get_replica_idxs(adata_group, replicas_per_group, bootstrap_sampling)
+        for i, rep_idx in enumerate(indices):
+            replica_number = i + 1
+            replica_size = len(rep_idx)
+            replica_sample_id = f"{sample_id}_rep{replica_number}"
+
+            adata_group_replica = adata_group[rep_idx]
+            X = _get_layer(adata_group_replica, layer)
+
+            data[replica_sample_id] = np.array(X.sum(axis=0)).flatten()
+            meta[replica_sample_id] = {
+                **group_metadata,
+                "replica": str(replica_number),
+                "replica_size": replica_size,
+            }
+
+    data = pd.DataFrame(data).T
+    meta = pd.DataFrame(meta).T
+    meta["replica"] = meta["replica"].astype("category")
+    meta["replica_size"] = meta["replica_size"].astype(int)
+    meta[group_key] = meta[group_key].astype(group_dtype)
+    if cell_identity_key is not None:
+        meta[cell_identity_key] = meta[cell_identity_key].astype(cell_identity_dtype)
+
+    aggr_adata = AnnData(
+        data.values,
+        obs=meta,
+        var=var_dataframe,
+    )
+
+    if make_dummies:
+        _join_dummies(aggr_adata, group_key)
+
+    return aggr_adata
+
+
+def _prepare_dataset(
+    adata: AnnData,
+    use_cells: dict[str, list[str]] | None,
+) -> AnnData:
+    if use_cells is not None:
+        for key, value in use_cells.items():
+            adata = adata[adata.obs[key].isin(value)]
+
+    return adata.copy()
+
+
+def _get_groups_to_drop(
+    adata: AnnData,
+    grouping_keys: str | list[str],
+    min_cells_per_group: int,
+):
+    group_sizes = adata.obs.groupby(grouping_keys, observed=True).size()
+    groups_to_drop = group_sizes[group_sizes < min_cells_per_group].index.to_list()
+
+    if len(groups_to_drop) > 0:
+        print("Dropping the following samples:")
+
+    groups_to_drop = groups_to_drop + [
+        (g,) for g in groups_to_drop if not isinstance(g, tuple)
+    ]
+
+    return groups_to_drop
+
+
+def _prepare_categorical_column(adata: AnnData, column: str) -> None:
+    if not isinstance(adata.obs[column].dtype, pd.CategoricalDtype):
+        adata.obs[column] = adata.obs[column].astype("category")
+
+
+def _create_var_dataframe(
+    adata: AnnData,
+    layer: str,
+    grouping_keys: list[str],
+    groups_to_drop: list[str],
+):
+    columns = _get_var_dataframe_columns(adata, grouping_keys, groups_to_drop)
+    var_dataframe = pd.DataFrame(index=adata.var_names, columns=columns, dtype=float)
+
+    groups = adata.obs.groupby(grouping_keys, observed=True).groups
+    for group, idx in groups.items():
+        if not isinstance(group, tuple):
+            group = (group,)
+
+        if not _including(group, groups_to_drop):
+            continue
+
+        sample_id = "_".join(group)
+        rest_id = f"not{sample_id}"
+
+        adata_subset = adata[idx]
+        rest_subset = adata[~adata.obs_names.isin(idx)]
+
+        X = _get_layer(adata_subset, layer, dense=True)
+        Y = _get_layer(rest_subset, layer, dense=True)
+
+        var_dataframe[f"pct_expr_{sample_id}"] = (X > 0).mean(axis=0)
+        var_dataframe[f"pct_expr_{rest_id}"] = (Y > 0).mean(axis=0)
+        var_dataframe[f"num_expr_{sample_id}"] = (X > 0).sum(axis=0)
+        var_dataframe[f"num_expr_{rest_id}"] = (Y > 0).sum(axis=0)
+        var_dataframe[f"tot_expr_{sample_id}"] = X.sum(axis=0)
+        var_dataframe[f"tot_expr_{rest_id}"] = Y.sum(axis=0)
+
+    return var_dataframe
+
+
+def _get_var_dataframe_columns(
+    adata: AnnData, grouping_keys: list[str], groups_to_drop: list[str]
+) -> list[str]:
+    columns = []
+
+    groups = adata.obs.groupby(grouping_keys, observed=True).groups
+    for group, _ in groups.items():
+        if not isinstance(group, tuple):
+            group = (group,)
+
+        if not _including(group, groups_to_drop):
+            continue
+
+        sample_id = "_".join(group)
+        rest_id = f"not{sample_id}"
+
+        columns.extend(
+            [
+                f"pct_expr_{sample_id}",
+                f"pct_expr_{rest_id}",
+                f"num_expr_{sample_id}",
+                f"num_expr_{rest_id}",
+                f"tot_expr_{sample_id}",
+                f"tot_expr_{rest_id}",
+            ]
+        )
+
+    return columns
+
+
+def _including(group: tuple | str, groups_to_drop: list[str]) -> bool:
+    match group:
+        case (gid, cid):
+            if isinstance(cid, float) and np.isnan(cid):
+                return False
+
+        case (gid,) | gid:
+            ...
+
+    if gid in groups_to_drop:
+        return False
+
+    return True
+
+
+def _get_replica_idxs(
+    adata_group: AnnData,
+    replicas_per_group: int,
+    bootstrap_sampling: bool,
+):
+    group_size = adata_group.n_obs
+    indices = list(adata_group.obs_names)
+    if bootstrap_sampling:
+        indices = np.array(
+            [
+                np.random.choice(indices, size=group_size, replace=True)
+                for _ in range(replicas_per_group)
+            ]
+        )
+
+    else:
+        random.shuffle(indices)
+        indices = np.array_split(np.array(indices), replicas_per_group)
+
+    return indices
+
+
+def _get_layer(adata: AnnData, layer: str | None, dense: bool = False):
+    X: ndarray | csr_matrix
+
+    if layer is None or layer == "X":
+        X = adata.X
+    else:
+        X = adata.layers[layer]
+
+    if dense:
+        if issparse(X):
+            X = np.asarray(X.todense())
+        else:
+            X = np.asarray(X)
+
+    return X
+
+
+def _join_dummies(aggr_adata: AnnData, group_key: str) -> None:
+    dummies = pd.get_dummies(aggr_adata.obs[group_key], prefix=group_key).astype(str)
+    dummies = dummies.astype(str).apply(lambda s: s.map({"True": "", "False": "not"}))
+    dummies = dummies + aggr_adata.obs[group_key].cat.categories
+
+    aggr_adata.obs = aggr_adata.obs.join(dummies)