sclab 0.2.5__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sclab might be problematic. Click here for more details.

Files changed (50) hide show
  1. sclab/__init__.py +1 -1
  2. sclab/dataset/_dataset.py +1 -1
  3. sclab/examples/processor_steps/__init__.py +2 -0
  4. sclab/examples/processor_steps/_doublet_detection.py +68 -0
  5. sclab/examples/processor_steps/_integration.py +37 -4
  6. sclab/examples/processor_steps/_neighbors.py +24 -4
  7. sclab/examples/processor_steps/_pca.py +5 -5
  8. sclab/examples/processor_steps/_preprocess.py +14 -1
  9. sclab/examples/processor_steps/_qc.py +22 -6
  10. sclab/gui/__init__.py +0 -0
  11. sclab/gui/components/__init__.py +5 -0
  12. sclab/gui/components/_guided_pseudotime.py +482 -0
  13. sclab/methods/__init__.py +25 -1
  14. sclab/preprocess/__init__.py +18 -0
  15. sclab/preprocess/_cca.py +154 -0
  16. sclab/preprocess/_cca_integrate.py +77 -0
  17. sclab/preprocess/_filter_obs.py +42 -0
  18. sclab/preprocess/_harmony.py +421 -0
  19. sclab/preprocess/_harmony_integrate.py +50 -0
  20. sclab/preprocess/_normalize_weighted.py +61 -0
  21. sclab/preprocess/_subset.py +208 -0
  22. sclab/preprocess/_transfer_metadata.py +137 -0
  23. sclab/preprocess/_transform.py +82 -0
  24. sclab/preprocess/_utils.py +96 -0
  25. sclab/tools/__init__.py +0 -0
  26. sclab/tools/cellflow/__init__.py +0 -0
  27. sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
  28. sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
  29. sclab/tools/cellflow/pseudotime/__init__.py +0 -0
  30. sclab/tools/cellflow/pseudotime/_pseudotime.py +332 -0
  31. sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
  32. sclab/tools/cellflow/utils/__init__.py +0 -0
  33. sclab/tools/cellflow/utils/density_nd.py +136 -0
  34. sclab/tools/cellflow/utils/interpolate.py +334 -0
  35. sclab/tools/cellflow/utils/smoothen.py +124 -0
  36. sclab/tools/cellflow/utils/times.py +55 -0
  37. sclab/tools/differential_expression/__init__.py +5 -0
  38. sclab/tools/differential_expression/_pseudobulk_edger.py +304 -0
  39. sclab/tools/differential_expression/_pseudobulk_helpers.py +277 -0
  40. sclab/tools/doublet_detection/__init__.py +5 -0
  41. sclab/tools/doublet_detection/_scrublet.py +64 -0
  42. sclab/tools/labeling/__init__.py +6 -0
  43. sclab/tools/labeling/sctype.py +233 -0
  44. sclab/utils/__init__.py +5 -0
  45. sclab/utils/_write_excel.py +510 -0
  46. {sclab-0.2.5.dist-info → sclab-0.3.0.dist-info}/METADATA +6 -2
  47. sclab-0.3.0.dist-info/RECORD +81 -0
  48. sclab-0.2.5.dist-info/RECORD +0 -45
  49. {sclab-0.2.5.dist-info → sclab-0.3.0.dist-info}/WHEEL +0 -0
  50. {sclab-0.2.5.dist-info → sclab-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,124 @@
1
+ import logging
2
+ from typing import Callable
3
+
4
+ import numpy as np
5
+ from numpy import bool_, floating, integer
6
+ from numpy.typing import NDArray
7
+ from tqdm.auto import tqdm
8
+
9
+ logger = logging.getLogger(__name__)
10
+ PIX2 = 2 * np.pi
11
+
12
+
13
def count_empty_intervals(t: NDArray[floating], t_grid: NDArray[floating]) -> int:
    """Return how many consecutive intervals of ``t_grid`` contain no point of ``t``.

    The per-interval occupancy comes from ``count_data_in_intervals``; an
    interval is considered empty when that count is exactly zero.
    """
    is_empty = count_data_in_intervals(t, t_grid) == 0
    return is_empty.sum()
17
+
18
+
19
def count_data_in_intervals(
    t: NDArray[floating], t_grid: NDArray[floating]
) -> NDArray[integer]:
    """Count the points of ``t`` falling in each interval of ``t_grid``.

    Interval ``k`` is ``[t_grid[k], t_grid[k + 1]]`` with both ends included,
    so a point sitting exactly on an interior grid node is counted in the two
    intervals that share it.

    Returns
    -------
    NDArray[integer]
        One count per interval, i.e. ``t_grid.size - 1`` entries.
    """
    left_edges = t_grid[:-1]
    right_edges = t_grid[1:]
    # Column vector so the comparisons broadcast to (n_points, n_intervals).
    column = t.reshape(-1, 1)
    inside = (left_edges <= column) & (column <= right_edges)
    return inside.sum(axis=0)
24
+
25
+
26
def choose_grid_size(t: NDArray[floating], t_range: tuple[float, float]) -> int:
    """Pick the largest power-of-two grid size that leaves no interval empty.

    Candidate sizes are tried from ``2**10`` down to ``2**1``. For each one,
    ``t_range`` is split into that many equal intervals and the first size
    for which every interval holds at least one point of ``t`` is returned.

    Raises
    ------
    ValueError
        If even the coarsest candidate (two intervals) has an empty interval.
    """
    for exponent in range(10, 0, -1):
        grid_size = 2**exponent
        t_grid: NDArray[floating] = np.linspace(*t_range, grid_size + 1)
        if count_empty_intervals(t, t_grid) == 0:
            return grid_size

    raise ValueError("Could not find a suitable grid size")
38
+
39
+
40
+ def smoothen_data(
41
+ t: NDArray[floating],
42
+ X: NDArray[floating],
43
+ t_range: tuple[float, float] | None = None,
44
+ t_grid: NDArray[floating] | None = None,
45
+ fn: Callable[[NDArray[floating]], NDArray[floating]] = np.average,
46
+ window_width: float | None = None,
47
+ weights: NDArray[floating] | None = None,
48
+ zero_weight: float = 1,
49
+ periodic: bool = False,
50
+ quiet: bool = False,
51
+ progress: bool = False,
52
+ ) -> NDArray[floating]:
53
+ if t_grid is None:
54
+ # no grid provided. We will have one output point for each input point
55
+ t_grid = t
56
+ is_grid = False
57
+ else:
58
+ # grid is provided
59
+ is_grid = True
60
+ empty_intervals = count_empty_intervals(t, t_grid)
61
+ if empty_intervals > 0 and not quiet:
62
+ logger.warning(f"Provided grid has {empty_intervals} empty intervals")
63
+
64
+ if t_range is not None:
65
+ # we used a specific t values range
66
+ tmin, tmax = t_range
67
+ else:
68
+ tmin, tmax = t_grid.min(), t_grid.max()
69
+
70
+ # full time window size
71
+ tspan = tmax - tmin
72
+
73
+ if window_width is None and not is_grid:
74
+ window_width = tspan * 0.05
75
+ elif window_width is None and is_grid:
76
+ window_width = tspan / (t_grid.size - 1) * 2
77
+
78
+ # initialize the output matrix with NaNs
79
+ X_smooth: NDArray[floating] = np.full((t_grid.size,) + X.shape[1:], np.nan)
80
+
81
+ generator = enumerate(t_grid)
82
+ if progress:
83
+ generator = tqdm(
84
+ generator,
85
+ total=t_grid.size,
86
+ bar_format="{desc} {percentage:3.0f}%|{bar}|",
87
+ desc="Smoothing data",
88
+ )
89
+
90
+ X = X.astype(float)
91
+ eps = np.finfo(float).eps
92
+ for i, m in generator:
93
+ low = m - window_width / 2
94
+ hig = m + window_width / 2
95
+
96
+ mask: NDArray[bool_] = (t >= low) & (t <= hig)
97
+ if periodic:
98
+ # include points beyond the periodic boundaries
99
+ mask = (
100
+ mask
101
+ | (t >= low + tspan) & (t <= hig + tspan)
102
+ | (t >= low - tspan) & (t <= hig - tspan)
103
+ )
104
+
105
+ if mask.sum() == 0:
106
+ continue
107
+
108
+ x = X[mask] + eps
109
+ if fn == np.average and weights is not None:
110
+ w = weights[mask]
111
+ X_smooth[i] = np.average(x, axis=0, weights=w)
112
+
113
+ elif fn == np.average and zero_weight == 1:
114
+ X_smooth[i] = np.mean(x, axis=0)
115
+
116
+ elif fn == np.average and zero_weight != 1:
117
+ w = np.ones_like(x)
118
+ w[x == eps] = zero_weight + eps
119
+ X_smooth[i] = fn(x, axis=0, weights=w)
120
+
121
+ else:
122
+ X_smooth[i] = fn(x, axis=0)
123
+
124
+ return X_smooth - eps
@@ -0,0 +1,55 @@
1
+ import itertools
2
+
3
+ import numpy as np
4
+ from numpy import floating
5
+ from numpy.typing import NDArray
6
+
7
+
8
def guess_trange(
    times: NDArray[floating], verbose: bool = False
) -> tuple[float, float]:
    """Guess a "round" range that encloses the observed times.

    The observed span is rounded up to a multiple of 1/100 of the next power
    of ten, and the minimum/maximum are then snapped outwards (floor/ceil) to
    a multiple of 1/100 of the power of ten enclosing that rounded span.

    Parameters
    ----------
    times : NDArray[floating]
        Observed time values. Must be non-empty.
    verbose : bool, optional
        Print the rounded span, the observed bounds and the guessed bounds.

    Returns
    -------
    tuple[float, float]
        Guessed ``(tmin, tmax)`` as plain Python floats. When all times are
        equal there is no scale to round to, and the observed value is
        returned for both bounds.
    """
    tmin, tmax = times.min(), times.max()
    tspan = tmax - tmin

    if tspan > 0:
        scale = 10.0 ** np.ceil(np.log10(tspan)) / 100
        tspan = np.ceil(tspan / scale) * scale

        scale = 10.0 ** np.ceil(np.log10(tspan)) / 100
        g_tmin = np.floor(tmin / scale) * scale
        g_tmax = np.ceil(tmax / scale) * scale
    else:
        # Zero span: log10(0) is -inf and the rounding would yield NaN bounds.
        g_tmin, g_tmax = tmin, tmax

    # Normalise negative zero and always hand back built-in floats.
    g_tmin = 0.0 if g_tmin == -0.0 else float(g_tmin)
    g_tmax = 0.0 if g_tmax == -0.0 else float(g_tmax)

    if verbose:
        print(
            f"tspan: {tspan:10.4f} min-max: {tmin:10.4f} - {tmax:10.4f} | {g_tmin:>8} - {g_tmax:>8}"
        )

    return g_tmin, g_tmax
30
+
31
+
32
def test_guess_trange(N: int = 1000, verbose: bool = False) -> None:
    """Statistical check of ``guess_trange`` on uniform random samples.

    For a set of scales and three reference ranges (negative, centred on
    zero, positive), ``N`` uniform samples are drawn 500 times per range. A
    trial succeeds when both guessed bounds are within 1% of the span of the
    true bounds. With ``verbose`` the success rate is printed; otherwise it is
    asserted to exceed 95%.
    """

    def _recovers_range(trange: tuple[float, float]) -> bool:
        true_min, true_max = trange
        width = true_max - true_min
        sample = np.random.uniform(*trange, N)
        g_tmin, g_tmax = guess_trange(sample)
        rel_err_min = np.abs(g_tmin - true_min) / width
        rel_err_max = np.abs(g_tmax - true_max) / width
        return rel_err_min <= 0.01 and rel_err_max <= 0.01

    magnitudes = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    multipliers = [1, 2, 3, 5, 7]
    unit_ranges = [(-2, -1), (-1 / 2, 1 / 2), (1, 2)]

    for s1 in magnitudes:
        for s2 in multipliers:
            scale = s1 * s2
            for lw, hg in unit_ranges:
                trange = lw * scale, hg * scale
                outcomes = [_recovers_range(trange) for _ in range(500)]
                acc1 = np.mean(outcomes)
                report = f"scale: {scale: 9.3f} | lw-hg: {lw: 5.1f} - {hg: 5.1f} | {acc1: 8.2%}"
                if verbose:
                    print(report)
                else:
                    assert acc1 > 0.95, report
@@ -0,0 +1,5 @@
1
+ from ._pseudobulk_edger import pseudobulk_edger
2
+
3
+ __all__ = [
4
+ "pseudobulk_edger",
5
+ ]
@@ -0,0 +1,304 @@
1
+ import pandas as pd
2
+ from anndata import AnnData
3
+
4
+ from ._pseudobulk_helpers import aggregate_and_filter
5
+
6
+
7
def pseudobulk_edger(
    adata_: AnnData,
    group_key: str,
    condition_group: str | list[str] | None = None,
    reference_group: str | None = None,
    cell_identity_key: str | None = None,
    batch_key: str | None = None,
    layer: str | None = None,
    replicas_per_group: int = 10,
    min_cells_per_group: int = 30,
    bootstrap_sampling: bool = True,
    use_cells: dict[str, list[str]] | None = None,
    aggregate: bool = True,
    verbosity: int = 0,
) -> dict[str, pd.DataFrame]:
    """
    Fits a model using edgeR and computes top tags for a given condition vs
    reference group.

    Parameters
    ----------
    adata_ : AnnData
        Annotated data matrix.
    group_key : str
        Key in AnnData object to use to group cells.
    condition_group : str | list[str] | None, optional
        Condition group to compare to reference group. If None, each group will be
        contrasted to the corresponding reference group.
    reference_group : str | None, optional
        Reference group to compare condition group(s) to. If None, the condition group
        is compared to the rest of the cells.
    cell_identity_key : str | None, optional
        If provided, separate contrasts will be computed for each identity. Defaults to None.
    batch_key : str | None, optional
        Key forwarded to the R ``fit_model`` function. When provided, that
        column is added as an extra term of the design matrix. Defaults to None.
    layer : str | None, optional
        Layer in AnnData object to use. EdgeR requires raw counts. Defaults to None.
    replicas_per_group : int, optional
        Number of replicas to create for each group. Defaults to 10.
    min_cells_per_group : int, optional
        Minimum number of cells required for a group to be included. Defaults to 30.
    bootstrap_sampling : bool, optional
        Whether to use bootstrap sampling to create replicas. Defaults to True.
    use_cells : dict[str, list[str]] | None, optional
        If not None, only use the specified cells. Defaults to None. Dictionary key
        is a categorical variable in the obs dataframe and the dictionary value is a
        list of categories to include.
    aggregate : bool, optional
        Whether to aggregate cells before fitting the model. EdgeR requires a small
        number of samples, so if adata_ is a single-cell experiment, the cells should
        be aggregated. Defaults to True.
    verbosity : int, optional
        Verbosity level. Defaults to 0.

    Returns
    -------
    dict[str, pd.DataFrame]
        Dictionary of dataframes, one for each contrast. Keys look like
        ``"<condition>_vs_<reference>"`` (``"<condition>_vs_rest"`` when no
        reference group is given), prefixed with
        ``"<cell_identity_key>:<identity>|"`` when a cell identity key is used.
        Groups whose model fit fails in R are reported on stdout and skipped.
        Each dataframe has the following columns:

        * gene_ids : str
            Gene IDs.
        * logFC : float
            Log2 fold change.
        * logCPM : float
            Log2 CPM.
        * F: float
            F-statistic.
        * PValue : float
            p-value.
        * FDR : float
            False discovery rate.
        * pct_expr_cnd : float
            Percentage of cells in condition group expressing the gene.
        * pct_expr_ref : float
            Percentage of cells in reference group expressing the gene.
        * num_expr_cnd, num_expr_ref
            Copied from the ``num_expr_<group>`` columns of the aggregated
            ``var`` table (presumably the number of expressing cells).
        * tot_expr_cnd, tot_expr_ref
            Copied from the ``tot_expr_<group>`` columns of the aggregated
            ``var`` table (presumably the total expression).
        * mean_cnd, mean_ref : float
            ``tot_expr`` divided by ``num_expr`` for the respective group.
    """
    _try_imports()
    import anndata2ri
    import rpy2.robjects as robjects
    from rpy2.rinterface_lib.embedded import RRuntimeError
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.conversion import localconverter

    R = robjects.r

    if aggregate:
        aggr_adata = aggregate_and_filter(
            adata_,
            group_key,
            cell_identity_key,
            layer,
            replicas_per_group,
            min_cells_per_group,
            bootstrap_sampling,
            use_cells,
        )
    else:
        aggr_adata = adata_.copy()

    with localconverter(anndata2ri.converter):
        R.assign("aggr_adata", aggr_adata)

    # defines the R function for fitting the model with edgeR
    R(_fit_model_r_script)

    if condition_group is None:
        condition_groups = aggr_adata.obs[group_key].unique()
    elif isinstance(condition_group, str):
        condition_groups = [condition_group]
    else:
        condition_groups = condition_group

    if cell_identity_key is not None:
        cids = aggr_adata.obs[cell_identity_key].unique()
    else:
        cids = [""]

    var = aggr_adata.var
    r_prefix = "group"  # model.matrix prefixes design columns with the factor name

    tt_dict = {}
    for cnd_group in condition_groups:
        if reference_group is not None and cnd_group == reference_group:
            continue

        if verbosity > 0:
            print(f"Fitting model for {cnd_group}...")

        if reference_group is not None:
            gk = group_key
            ref_level = reference_group
            ref_label = reference_group
        else:
            # One-vs-rest: a dedicated obs column holds "<group>" / "not<group>".
            gk = f"{group_key}_{cnd_group}"
            ref_level = f"not{cnd_group}"
            ref_label = "rest"

        try:
            R(f"""
                outs <- fit_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
                fit <- outs$fit
                y <- outs$y
            """)

        except RRuntimeError as e:
            print("Error fitting model for", cnd_group)
            print("Error:", e)
            print("Skipping...", flush=True)
            continue

        for cid in cids:
            # Design-matrix column names; the trailing "_" is dropped when
            # there is no cell identity.
            cnd_term = f"{r_prefix}{cnd_group}_{cid}".strip("_")
            ref_term = f"{r_prefix}{ref_level}_{cid}".strip("_")
            contrast = f"{cnd_term}-{ref_term}"

            contrast_key = f"{cnd_group}_vs_{ref_label}"
            if cid:
                contrast_key = f"{cell_identity_key}:{cid}|{contrast_key}"

            if verbosity > 0:
                print(f"Computing contrast: {contrast_key}... ({contrast})")

            R(f"myContrast <- makeContrasts('{contrast}', levels = y$design)")
            R("qlf <- glmQLFTest(fit, contrast=myContrast)")
            R("tt <- topTags(qlf, n = Inf)$table")
            tt: pd.DataFrame = pandas2ri.rpy2py(R("tt"))
            tt.index.name = "gene_ids"

            genes = tt.index
            # Per-group statistics live in var under "<stat>_<level>[_<cid>]".
            cnd_name = cnd_term[len(r_prefix):]
            ref_name = ref_term[len(r_prefix):]
            for stat in ("pct_expr", "num_expr", "tot_expr"):
                tt[f"{stat}_cnd"] = var[f"{stat}_{cnd_name}"].loc[genes]
                tt[f"{stat}_ref"] = var[f"{stat}_{ref_name}"].loc[genes]
            tt["mean_cnd"] = tt["tot_expr_cnd"] / tt["num_expr_cnd"]
            tt["mean_ref"] = tt["tot_expr_ref"] / tt["num_expr_ref"]
            tt_dict[contrast_key] = tt

    return tt_dict
209
+
210
+
211
# R source evaluated by pseudobulk_edger through rpy2 before any model is fit.
# It loads edgeR and MAST and defines
#   fit_model(adata_, group_key, cell_identity_key, batch_key, verbosity)
# which builds a DGEList from assay "X", filters genes with filterByExpr,
# normalizes, builds a design matrix (~ 0 + group + replica [+ batch]),
# estimates dispersion and returns list(fit, design, y).
# The Python caller interpolates missing keys as the string "None", which is
# what the R code compares against.
# NOTE: this is a regular (non-raw) Python string, so each "\n" below reaches
# R as a literal newline character inside the R string literal.
_fit_model_r_script = """
suppressPackageStartupMessages({
library(edgeR)
library(MAST)
})

fit_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){

if (verbosity > 0){
cat("Group key:", group_key, "\n")
cat("Cell identity key:", cell_identity_key, "\n")
}

# create an edgeR object with counts and grouping factor
y <- DGEList(assay(adata_, "X"), group = colData(adata_)[[group_key]])
# filter out genes with low counts
if (verbosity > 1){
cat("Dimensions before subsetting:", dim(y), "\n")
}
keep <- filterByExpr(y)
y <- y[keep, , keep.lib.sizes=FALSE]
if (verbosity > 1){
cat("Dimensions after subsetting:", dim(y), "\n")
}

# normalize
y <- calcNormFactors(y)
# create a vector that is concatentation of condition and cell type that we will later use with contrasts
if (cell_identity_key == "None"){
group <- colData(adata_)[[group_key]]
} else {
group <- paste0(colData(adata_)[[group_key]], "_", colData(adata_)[[cell_identity_key]])
}
if (verbosity > 1){
cat("Group(s):", group, "\n")
}

replica <- colData(adata_)$replica

# create a design matrix
if (batch_key == "None"){
design <- model.matrix(~ 0 + group + replica)
} else {
batch <- colData(adata_)[[batch_key]]
design <- model.matrix(~ 0 + group + replica + batch)
}

# estimate dispersion
y <- estimateDisp(y, design = design)
# fit the model
fit <- glmQLFit(y, design)
return(list("fit"=fit, "design"=design, "y"=y))
}
"""
265
+
266
+
267
+ def _try_imports():
268
+ try:
269
+ import rpy2.robjects as robjects
270
+ from rpy2.robjects.packages import PackageNotInstalledError, importr
271
+
272
+ robjects.r("options(warn=-1)")
273
+ import anndata2ri # noqa: F401
274
+ from rpy2.rinterface_lib.embedded import RRuntimeError # noqa: F401
275
+ from rpy2.robjects import numpy2ri, pandas2ri # noqa: F401
276
+ from rpy2.robjects.conversion import localconverter # noqa: F401
277
+
278
+ importr("edgeR")
279
+ importr("MAST")
280
+ importr("SingleCellExperiment")
281
+
282
+ except ModuleNotFoundError:
283
+ message = (
284
+ "edger_pseudobulk requires rpy2 and anndata2ri to be installed.\n"
285
+ "or\n"
286
+ "$ pip install rpy2 sclab-tools[r]\n"
287
+ "or\n"
288
+ "$ pip install rpy2 anndata2ri\n"
289
+ "or\n"
290
+ "$ conda install -c conda-forge rpy2 anndata2ri\n"
291
+ )
292
+ print(message)
293
+ raise ModuleNotFoundError(message)
294
+
295
+ except PackageNotInstalledError:
296
+ message = (
297
+ "edger_pseudobulk requires the following R packages to be installed: edgeR, MAST, and SingleCellExperiment.\n"
298
+ "> \n"
299
+ "> if (!require('BiocManager', quietly = TRUE)) install.packages('BiocManager');\n"
300
+ "> BiocManager::install(c('edgeR', 'MAST', 'SingleCellExperiment'));\n"
301
+ "> \n"
302
+ )
303
+ print(message)
304
+ raise ImportError(message)