sclab 0.1.7__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. sclab/__init__.py +3 -1
  2. sclab/_io.py +83 -12
  3. sclab/_methods_registry.py +65 -0
  4. sclab/_sclab.py +241 -21
  5. sclab/dataset/_dataset.py +4 -6
  6. sclab/dataset/processor/_processor.py +41 -19
  7. sclab/dataset/processor/_results_panel.py +94 -0
  8. sclab/dataset/processor/step/_processor_step_base.py +12 -6
  9. sclab/examples/processor_steps/__init__.py +8 -0
  10. sclab/examples/processor_steps/_cluster.py +2 -2
  11. sclab/examples/processor_steps/_differential_expression.py +329 -0
  12. sclab/examples/processor_steps/_doublet_detection.py +68 -0
  13. sclab/examples/processor_steps/_gene_expression.py +125 -0
  14. sclab/examples/processor_steps/_integration.py +116 -0
  15. sclab/examples/processor_steps/_neighbors.py +26 -6
  16. sclab/examples/processor_steps/_pca.py +13 -8
  17. sclab/examples/processor_steps/_preprocess.py +52 -25
  18. sclab/examples/processor_steps/_qc.py +24 -8
  19. sclab/examples/processor_steps/_umap.py +2 -2
  20. sclab/gui/__init__.py +0 -0
  21. sclab/gui/components/__init__.py +7 -0
  22. sclab/gui/components/_guided_pseudotime.py +482 -0
  23. sclab/gui/components/_transfer_metadata.py +186 -0
  24. sclab/methods/__init__.py +50 -0
  25. sclab/preprocess/__init__.py +26 -0
  26. sclab/preprocess/_cca.py +176 -0
  27. sclab/preprocess/_cca_integrate.py +109 -0
  28. sclab/preprocess/_filter_obs.py +42 -0
  29. sclab/preprocess/_harmony.py +421 -0
  30. sclab/preprocess/_harmony_integrate.py +53 -0
  31. sclab/preprocess/_normalize_weighted.py +65 -0
  32. sclab/preprocess/_pca.py +51 -0
  33. sclab/preprocess/_preprocess.py +155 -0
  34. sclab/preprocess/_qc.py +38 -0
  35. sclab/preprocess/_rpca.py +116 -0
  36. sclab/preprocess/_subset.py +208 -0
  37. sclab/preprocess/_transfer_metadata.py +196 -0
  38. sclab/preprocess/_transform.py +82 -0
  39. sclab/preprocess/_utils.py +96 -0
  40. sclab/scanpy/__init__.py +0 -0
  41. sclab/scanpy/_compat.py +92 -0
  42. sclab/scanpy/_settings.py +526 -0
  43. sclab/scanpy/logging.py +290 -0
  44. sclab/scanpy/plotting/__init__.py +0 -0
  45. sclab/scanpy/plotting/_rcmod.py +73 -0
  46. sclab/scanpy/plotting/palettes.py +221 -0
  47. sclab/scanpy/readwrite.py +1108 -0
  48. sclab/tools/__init__.py +0 -0
  49. sclab/tools/cellflow/__init__.py +0 -0
  50. sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
  51. sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
  52. sclab/tools/cellflow/pseudotime/__init__.py +0 -0
  53. sclab/tools/cellflow/pseudotime/_pseudotime.py +336 -0
  54. sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
  55. sclab/tools/cellflow/utils/__init__.py +0 -0
  56. sclab/tools/cellflow/utils/density_nd.py +215 -0
  57. sclab/tools/cellflow/utils/interpolate.py +334 -0
  58. sclab/tools/cellflow/utils/periodic_genes.py +106 -0
  59. sclab/tools/cellflow/utils/smoothen.py +124 -0
  60. sclab/tools/cellflow/utils/times.py +55 -0
  61. sclab/tools/differential_expression/__init__.py +7 -0
  62. sclab/tools/differential_expression/_pseudobulk_edger.py +309 -0
  63. sclab/tools/differential_expression/_pseudobulk_helpers.py +290 -0
  64. sclab/tools/differential_expression/_pseudobulk_limma.py +257 -0
  65. sclab/tools/doublet_detection/__init__.py +5 -0
  66. sclab/tools/doublet_detection/_scrublet.py +64 -0
  67. sclab/tools/embedding/__init__.py +0 -0
  68. sclab/tools/imputation/__init__.py +0 -0
  69. sclab/tools/imputation/_alra.py +135 -0
  70. sclab/tools/labeling/__init__.py +6 -0
  71. sclab/tools/labeling/sctype.py +233 -0
  72. sclab/tools/utils/__init__.py +5 -0
  73. sclab/tools/utils/_aggregate_and_filter.py +290 -0
  74. sclab/utils/__init__.py +5 -0
  75. sclab/utils/_write_excel.py +510 -0
  76. {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/METADATA +29 -12
  77. sclab-0.3.4.dist-info/RECORD +93 -0
  78. {sclab-0.1.7.dist-info → sclab-0.3.4.dist-info}/WHEEL +1 -1
  79. sclab-0.3.4.dist-info/licenses/LICENSE +29 -0
  80. sclab-0.1.7.dist-info/RECORD +0 -30
@@ -0,0 +1,309 @@
1
+ import pandas as pd
2
+ from anndata import AnnData
3
+
4
+ from ._pseudobulk_helpers import aggregate_and_filter
5
+
6
+
7
def pseudobulk_edger(
    adata_: AnnData,
    group_key: str,
    condition_group: str | list[str] | None = None,
    reference_group: str | None = None,
    cell_identity_key: str | None = None,
    batch_key: str | None = None,
    layer: str | None = None,
    replicas_per_group: int = 5,
    min_cells_per_group: int = 30,
    bootstrap_sampling: bool = False,
    use_cells: dict[str, list[str]] | None = None,
    aggregate: bool = True,
    verbosity: int = 0,
) -> dict[str, pd.DataFrame]:
    """
    Fit an edgeR model on pseudobulk samples and compute top tags per contrast.

    Cells are optionally aggregated into pseudobulk replicas, the data are
    handed to R, and for every condition group a model is fitted with the R
    function ``fit_edger_model`` (defined by ``_fit_model_r_script``). Each
    contrast is then tested with ``glmQLFTest`` and the full ``topTags`` table
    is returned, annotated with per-group expression statistics.

    Parameters
    ----------
    adata_ : AnnData
        Annotated data matrix.
    group_key : str
        Key in ``adata_.obs`` used to group cells.
    condition_group : str | list[str] | None, optional
        Condition group(s) to contrast. If None, every group found in
        ``group_key`` is contrasted in turn. Defaults to None.
    reference_group : str | None, optional
        Reference group to compare the condition group(s) to. If None, each
        condition group is compared to the rest of the cells. A condition group
        equal to the reference group is skipped. Defaults to None.
    cell_identity_key : str | None, optional
        If provided, separate contrasts are computed for each identity.
        Defaults to None.
    batch_key : str | None, optional
        Key in ``obs`` that is added as a ``batch`` factor to the design
        matrix. Defaults to None.
    layer : str | None, optional
        Layer in AnnData object to use. EdgeR requires raw counts.
        Defaults to None.
    replicas_per_group : int, optional
        Number of replicas to create for each group. Defaults to 5.
    min_cells_per_group : int, optional
        Minimum number of cells required for a group to be included.
        Defaults to 30.
    bootstrap_sampling : bool, optional
        Whether to use bootstrap sampling to create replicas.
        Defaults to False.
    use_cells : dict[str, list[str]] | None, optional
        If not None, only use the specified cells. Dictionary key is a
        categorical variable in the obs dataframe and the dictionary value is a
        list of categories to include. Defaults to None.
    aggregate : bool, optional
        Whether to aggregate cells before fitting the model. EdgeR requires a
        small number of samples, so if ``adata_`` is a single-cell experiment,
        the cells should be aggregated. When False, a copy of ``adata_`` is
        used as is. Defaults to True.
    verbosity : int, optional
        Verbosity level. Defaults to 0.

    Returns
    -------
    dict[str, pd.DataFrame]
        One dataframe per contrast. Keys look like ``"<cnd>_vs_<ref>"`` (or
        ``"<cnd>_vs_rest"`` without a reference group) and are prefixed with
        ``"<cell_identity_key>:<identity>|"`` when identities are used. Each
        dataframe is indexed by ``gene_ids`` and holds the edgeR ``topTags``
        columns (logFC, logCPM, F, PValue, FDR) plus:

        * pct_expr_cnd, pct_expr_ref
            ``pct_expr_*`` statistic of the condition / reference group.
        * num_expr_cnd, num_expr_ref
            ``num_expr_*`` statistic of the condition / reference group.
        * tot_expr_cnd, tot_expr_ref
            ``tot_expr_*`` statistic of the condition / reference group.
        * mean_cnd, mean_ref
            ``tot_expr`` divided by ``num_expr`` for each side.

        The statistics are read from the ``var`` dataframe of the (aggregated)
        data.

    Notes
    -----
    If fitting the model raises an R runtime error for a condition group, the
    error is printed and that group is skipped.
    """
    _try_imports()
    import anndata2ri  # noqa: F401
    import rpy2.robjects as robjects
    from rpy2.rinterface_lib.embedded import RRuntimeError  # noqa: F401
    from rpy2.robjects import pandas2ri  # noqa: F401
    from rpy2.robjects.conversion import localconverter  # noqa: F401

    R = robjects.r

    if aggregate:
        aggr_adata = aggregate_and_filter(
            adata_,
            group_key,
            cell_identity_key,
            layer,
            replicas_per_group,
            min_cells_per_group,
            bootstrap_sampling,
            use_cells,
        )
    else:
        aggr_adata = adata_.copy()

    with localconverter(anndata2ri.converter):
        R.assign("aggr_adata", aggr_adata)

    # defines the R function for fitting the model with edgeR
    R(_fit_model_r_script)

    if condition_group is None:
        condition_group_list = aggr_adata.obs[group_key].unique()
    elif isinstance(condition_group, str):
        condition_group_list = [condition_group]
    else:
        condition_group_list = condition_group

    if cell_identity_key is not None:
        cids = aggr_adata.obs[cell_identity_key].unique()
    else:
        cids = [""]

    tt_dict = {}
    # A dedicated loop variable keeps the ``condition_group`` argument intact.
    for cnd_group in condition_group_list:
        if reference_group is not None and cnd_group == reference_group:
            continue

        if verbosity > 0:
            print(f"Fitting model for {cnd_group}...")

        # Without a reference group the model is fitted on the per-group dummy
        # column ("<group>" vs "not<group>") instead of the raw grouping column.
        if reference_group is not None:
            gk = group_key
        else:
            gk = f"{group_key}_{cnd_group}"

        try:
            R(
                f"""
                outs <- fit_edger_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
                fit <- outs$fit
                y <- outs$y
                """
            )

        except RRuntimeError as e:
            print("Error fitting model for", cnd_group)
            print("Error:", e)
            print("Skipping...", flush=True)
            continue

        if reference_group is None:
            # "<group>" vs "not<group>": the group name is a shared suffix.
            prefix, cnd, ref = cnd_group, "", "not"
        else:
            prefix, cnd, ref = "", cnd_group, reference_group

        for cid in cids:
            # Factor level names as built on the R side (without the leading
            # "group"); the same ids select the statistics columns in ``var``.
            cnd_level = f"{cnd}{prefix}_{cid}".rstrip("_")
            ref_level = f"{ref}{prefix}_{cid}".rstrip("_")
            contrast = f"group{cnd_level}-group{ref_level}"

            if ref == "not":
                key_cnd, key_ref = "", "rest"
            else:
                key_cnd, key_ref = cnd, ref

            contrast_key = f"{prefix}{key_cnd}_vs_{key_ref}"
            if cid:
                contrast_key = f"{cell_identity_key}:{cid}|{contrast_key}"

            if verbosity > 0:
                print(f"Computing contrast: {contrast_key}... ({contrast})")

            R(f"myContrast <- makeContrasts('{contrast}', levels = y$design)")
            R("qlf <- glmQLFTest(fit, contrast=myContrast)")
            R("tt <- topTags(qlf, n = Inf)$table")
            tt: pd.DataFrame = pandas2ri.rpy2py(R("tt"))
            tt.index.name = "gene_ids"

            genes = tt.index
            var = aggr_adata.var
            for stat in ("pct_expr", "num_expr", "tot_expr"):
                tt[f"{stat}_cnd"] = var[f"{stat}_{cnd_level}"].loc[genes]
                tt[f"{stat}_ref"] = var[f"{stat}_{ref_level}"].loc[genes]
            tt["mean_cnd"] = tt["tot_expr_cnd"] / tt["num_expr_cnd"]
            tt["mean_ref"] = tt["tot_expr_ref"] / tt["num_expr_ref"]
            tt_dict[contrast_key] = tt

    return tt_dict
209
+
210
+
211
# R source evaluated by `pseudobulk_edger` through `R(_fit_model_r_script)`.
# It loads edgeR and MAST and defines `fit_edger_model(adata_, group_key,
# cell_identity_key, batch_key, verbosity)`, which:
#   * builds the `group` factor (group, or "<group>_<cell identity>"),
#   * builds the design matrix `~ 0 + group + replica` (plus `batch` if given),
#   * filters genes with `filterByExpr`, normalizes, estimates dispersions,
#   * fits `glmQLFit` and returns `list(fit, design, y)`.
# The string "None" (what Python's `None` formats to) is the sentinel meaning
# "no cell identity key" / "no batch key".
_fit_model_r_script = """
suppressPackageStartupMessages({
library(edgeR)
library(MAST)
})

fit_edger_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){

if (verbosity > 0){
cat("Group key:", group_key, "\n")
cat("Cell identity key:", cell_identity_key, "\n")
}

# create a vector that is concatentation of condition and cell type that we will later use with contrasts
if (cell_identity_key == "None"){
group <- colData(adata_)[[group_key]]
} else {
group <- paste0(colData(adata_)[[group_key]], "_", colData(adata_)[[cell_identity_key]])
}

if (verbosity > 1){
cat("Group(s):", group, "\n")
}

group <- factor(group)
replica <- factor(colData(adata_)$replica)

# create a design matrix
if (batch_key == "None"){
design <- model.matrix(~ 0 + group + replica)
} else {
batch <- factor(colData(adata_)[[batch_key]])
design <- model.matrix(~ 0 + group + replica + batch)
}
colnames(design) <- make.names(colnames(design))

# create an edgeR object with counts and grouping factor
y <- DGEList(assay(adata_, "X"), group = group)

# filter out genes with low counts
if (verbosity > 1){
cat("Dimensions before subsetting:", dim(y), "\n")
}

keep <- filterByExpr(y, design = design)
y <- y[keep, , keep.lib.sizes=FALSE]
if (verbosity > 1){
cat("Dimensions after subsetting:", dim(y), "\n")
}

# normalize
y <- calcNormFactors(y)

# estimate dispersion
y <- estimateDisp(y, design = design)
# fit the model
fit <- glmQLFit(y, design)

return(list("fit"=fit, "design"=design, "y"=y))
}
"""
272
+
273
+
274
+ def _try_imports():
275
+ try:
276
+ import rpy2.robjects as robjects
277
+ from rpy2.robjects.packages import PackageNotInstalledError, importr
278
+
279
+ robjects.r("options(warn=-1)")
280
+ import anndata2ri # noqa: F401
281
+ from rpy2.rinterface_lib.embedded import RRuntimeError # noqa: F401
282
+ from rpy2.robjects import numpy2ri, pandas2ri # noqa: F401
283
+ from rpy2.robjects.conversion import localconverter # noqa: F401
284
+
285
+ importr("edgeR")
286
+ importr("MAST")
287
+ importr("SingleCellExperiment")
288
+
289
+ except ModuleNotFoundError:
290
+ message = (
291
+ "edger_pseudobulk requires rpy2 and anndata2ri to be installed.\n"
292
+ "please install with one of the following:\n"
293
+ "$ pip install rpy2 anndata2ri\n"
294
+ "or\n"
295
+ "$ conda install -c conda-forge rpy2 anndata2ri\n"
296
+ )
297
+ print(message)
298
+ raise ModuleNotFoundError(message)
299
+
300
+ except PackageNotInstalledError:
301
+ message = (
302
+ "edger_pseudobulk requires the following R packages to be installed: edgeR, MAST, and SingleCellExperiment.\n"
303
+ "> \n"
304
+ "> if (!require('BiocManager', quietly = TRUE)) install.packages('BiocManager');\n"
305
+ "> BiocManager::install(c('edgeR', 'MAST', 'SingleCellExperiment'));\n"
306
+ "> \n"
307
+ )
308
+ print(message)
309
+ raise ImportError(message)
@@ -0,0 +1,290 @@
1
+ import random
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from anndata import AnnData
6
+ from numpy import ndarray
7
+ from scipy.sparse import csr_matrix, issparse
8
+
9
+
10
+ # code inspired from
11
+ # https://www.sc-best-practices.org/conditions/differential_gene_expression.html
12
def aggregate_and_filter(
    adata: AnnData,
    group_key: str = "batch",
    cell_identity_key: str | None = None,
    layer: str | None = None,
    replicas_per_group: int = 3,
    min_cells_per_group: int = 30,
    bootstrap_sampling: bool = False,
    use_cells: dict[str, list[str]] | None = None,
    make_stats: bool = True,
    make_dummies: bool = True,
) -> AnnData:
    """
    Aggregate and filter cells in an AnnData object into cell populations.

    Cells are grouped by ``group_key`` (and ``cell_identity_key`` if given).
    Every retained group is split into ``replicas_per_group`` pseudobulk
    replicas whose counts are the column sums of the selected layer.

    Parameters
    ----------
    adata : AnnData
        AnnData object to aggregate and filter. It is copied, not modified.
    group_key : str, optional
        Key to group cells by. Defaults to 'batch'.
    cell_identity_key : str, optional
        Key to use to identify cell identities. Defaults to None.
    layer : str, optional
        Layer in AnnData object to use for aggregation. Defaults to None.
    replicas_per_group : int, optional
        Number of replicas to create for each group. Defaults to 3.
    min_cells_per_group : int, optional
        Minimum number of cells required for a group to be included.
        Defaults to 30.
    bootstrap_sampling : bool, optional
        Whether to use bootstrap sampling to create replicas.
        Defaults to False.
    use_cells : dict[str, list[str]], optional
        If not None, only use the specified cells. Defaults to None.
    make_stats : bool, optional
        Whether to create expression statistics for each group.
        Defaults to True.
    make_dummies : bool, optional
        Whether to make categorical columns into dummies. Defaults to True.

    Returns
    -------
    AnnData
        AnnData object with one observation per replica. ``obs`` holds the
        grouping columns plus ``replica`` and ``replica_size``.

    Raises
    ------
    ValueError
        If no group is left to aggregate after filtering.
    """
    adata = _prepare_dataset(adata, use_cells)

    grouping_keys = [group_key]
    if cell_identity_key is not None:
        grouping_keys.append(cell_identity_key)

    groups_to_drop = _get_groups_to_drop(adata, grouping_keys, min_cells_per_group)

    _prepare_categorical_column(adata, group_key)
    group_dtype = adata.obs[group_key].dtype

    if cell_identity_key is not None:
        _prepare_categorical_column(adata, cell_identity_key)
        cell_identity_dtype = adata.obs[cell_identity_key].dtype

    if make_stats:
        var_dataframe = _create_var_dataframe(
            adata, layer, grouping_keys, groups_to_drop
        )
    else:
        var_dataframe = pd.DataFrame(index=adata.var_names)

    data = {}
    meta = {}
    groups = adata.obs.groupby(grouping_keys, observed=True).groups
    for group, group_idxs in groups.items():
        if not isinstance(group, tuple):
            group = (group,)

        if not _including(group, groups_to_drop):
            continue

        sample_id = "_".join(group)
        # ``group`` has one value per grouping key, in the same order.
        group_metadata = dict(zip(grouping_keys, group))

        adata_group = adata[group_idxs]
        indices = _get_replica_idxs(adata_group, replicas_per_group, bootstrap_sampling)
        for i, rep_idx in enumerate(indices):
            replica_number = i + 1
            replica_size = len(rep_idx)
            replica_sample_id = f"{sample_id}_rep{replica_number}"

            adata_group_replica = adata_group[rep_idx]
            X = _get_layer(adata_group_replica, layer)

            # Pseudobulk profile: per-gene sum over the replica's cells.
            data[replica_sample_id] = np.array(X.sum(axis=0)).flatten()
            meta[replica_sample_id] = {
                **group_metadata,
                "replica": str(replica_number),
                "replica_size": replica_size,
            }

    if not data:
        # Without this guard the metadata frame is empty and the column
        # look-ups below fail with an uninformative KeyError.
        raise ValueError(
            "No groups left to aggregate: every group was dropped "
            f"(fewer than {min_cells_per_group} cells or missing cell identity)."
        )

    data = pd.DataFrame(data).T
    meta = pd.DataFrame(meta).T
    meta["replica"] = meta["replica"].astype("category")
    meta["replica_size"] = meta["replica_size"].astype(int)
    meta[group_key] = meta[group_key].astype(group_dtype)
    if cell_identity_key is not None:
        meta[cell_identity_key] = meta[cell_identity_key].astype(cell_identity_dtype)

    aggr_adata = AnnData(
        data.values,
        obs=meta,
        var=var_dataframe,
    )

    if make_dummies:
        _join_dummies(aggr_adata, group_key)

    return aggr_adata
129
+
130
+
131
+ def _prepare_dataset(
132
+ adata: AnnData,
133
+ use_cells: dict[str, list[str]] | None,
134
+ ) -> AnnData:
135
+ if use_cells is not None:
136
+ for key, value in use_cells.items():
137
+ adata = adata[adata.obs[key].isin(value)]
138
+
139
+ return adata.copy()
140
+
141
+
142
+ def _get_groups_to_drop(
143
+ adata: AnnData,
144
+ grouping_keys: str | list[str],
145
+ min_cells_per_group: int,
146
+ ):
147
+ group_sizes = adata.obs.groupby(grouping_keys, observed=True).size()
148
+ groups_to_drop = group_sizes[group_sizes < min_cells_per_group].index.to_list()
149
+
150
+ if len(groups_to_drop) > 0:
151
+ print("Dropping the following samples:")
152
+
153
+ groups_to_drop = groups_to_drop + [
154
+ (g,) for g in groups_to_drop if not isinstance(g, tuple)
155
+ ]
156
+
157
+ return groups_to_drop
158
+
159
+
160
+ def _prepare_categorical_column(adata: AnnData, column: str) -> None:
161
+ if not isinstance(adata.obs[column].dtype, pd.CategoricalDtype):
162
+ adata.obs[column] = adata.obs[column].astype("category")
163
+
164
+
165
def _create_var_dataframe(
    adata: AnnData,
    layer: str,
    grouping_keys: list[str],
    groups_to_drop: list[str],
):
    """
    Build the per-gene expression statistics for every retained group.

    For each group (and for the complementary "not<group>" set made of all
    other cells) three columns are filled: ``pct_expr_*`` (mean of ``X > 0``),
    ``num_expr_*`` (count of ``X > 0``) and ``tot_expr_*`` (sum of ``X``).
    The result is indexed by ``adata.var_names``.
    """
    stat_columns = _get_var_dataframe_columns(adata, grouping_keys, groups_to_drop)
    stats = pd.DataFrame(index=adata.var_names, columns=stat_columns, dtype=float)

    cells_by_group = adata.obs.groupby(grouping_keys, observed=True).groups
    for key, members in cells_by_group.items():
        key = key if isinstance(key, tuple) else (key,)
        if not _including(key, groups_to_drop):
            continue

        inside_id = "_".join(key)
        outside_id = f"not{inside_id}"

        # Dense matrices for the group's cells and for every other cell.
        inside = _get_layer(adata[members], layer, dense=True)
        outside = _get_layer(adata[~adata.obs_names.isin(members)], layer, dense=True)

        per_gene = {
            f"pct_expr_{inside_id}": (inside > 0).mean(axis=0),
            f"pct_expr_{outside_id}": (outside > 0).mean(axis=0),
            f"num_expr_{inside_id}": (inside > 0).sum(axis=0),
            f"num_expr_{outside_id}": (outside > 0).sum(axis=0),
            f"tot_expr_{inside_id}": inside.sum(axis=0),
            f"tot_expr_{outside_id}": outside.sum(axis=0),
        }
        for column, values in per_gene.items():
            stats[column] = values

    return stats
199
+
200
+
201
def _get_var_dataframe_columns(
    adata: AnnData, grouping_keys: list[str], groups_to_drop: list[str]
) -> list[str]:
    """
    Name the statistics columns created for every retained group.

    For each group id (and its "not<id>" complement) the ``pct_expr_``,
    ``num_expr_`` and ``tot_expr_`` column names are emitted, in that order.
    """
    names: list[str] = []

    for key in adata.obs.groupby(grouping_keys, observed=True).groups:
        key = key if isinstance(key, tuple) else (key,)
        if not _including(key, groups_to_drop):
            continue

        inside_id = "_".join(key)
        outside_id = f"not{inside_id}"
        names += [
            f"{stat}_expr_{label}"
            for stat in ("pct", "num", "tot")
            for label in (inside_id, outside_id)
        ]

    return names
229
+
230
+
231
+ def _including(group: tuple | str, groups_to_drop: list[str]) -> bool:
232
+ match group:
233
+ case (gid, cid):
234
+ if isinstance(cid, float) and np.isnan(cid):
235
+ return False
236
+
237
+ case (gid,) | gid:
238
+ ...
239
+
240
+ if gid in groups_to_drop:
241
+ return False
242
+
243
+ return True
244
+
245
+
246
+ def _get_replica_idxs(
247
+ adata_group: AnnData,
248
+ replicas_per_group: int,
249
+ bootstrap_sampling: bool,
250
+ ):
251
+ group_size = adata_group.n_obs
252
+ indices = list(adata_group.obs_names)
253
+ if bootstrap_sampling:
254
+ indices = np.array(
255
+ [
256
+ np.random.choice(indices, size=group_size, replace=True)
257
+ for _ in range(replicas_per_group)
258
+ ]
259
+ )
260
+
261
+ else:
262
+ random.shuffle(indices)
263
+ indices = np.array_split(np.array(indices), replicas_per_group)
264
+
265
+ return indices
266
+
267
+
268
+ def _get_layer(adata: AnnData, layer: str | None, dense: bool = False):
269
+ X: ndarray | csr_matrix
270
+
271
+ if layer is None or layer == "X":
272
+ X = adata.X
273
+ else:
274
+ X = adata.layers[layer]
275
+
276
+ if dense:
277
+ if issparse(X):
278
+ X = np.asarray(X.todense())
279
+ else:
280
+ X = np.asarray(X)
281
+
282
+ return X
283
+
284
+
285
+ def _join_dummies(aggr_adata: AnnData, group_key: str) -> None:
286
+ dummies = pd.get_dummies(aggr_adata.obs[group_key], prefix=group_key).astype(str)
287
+ dummies = dummies.astype(str).apply(lambda s: s.map({"True": "", "False": "not"}))
288
+ dummies = dummies + aggr_adata.obs[group_key].cat.categories
289
+
290
+ aggr_adata.obs = aggr_adata.obs.join(dummies)