sclab 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sclab might be problematic. Click here for more details.

sclab/__init__.py CHANGED
@@ -6,4 +6,4 @@ __all__ = [
6
6
  "SCLabDashboard",
7
7
  ]
8
8
 
9
- __version__ = "0.3.2"
9
+ __version__ = "0.3.3"
@@ -2,6 +2,9 @@ from ._cca_integrate import cca_integrate, cca_integrate_pair
2
2
  from ._filter_obs import filter_obs
3
3
  from ._harmony_integrate import harmony_integrate
4
4
  from ._normalize_weighted import normalize_weighted
5
+ from ._pca import pca
6
+ from ._preprocess import preprocess
7
+ from ._qc import qc
5
8
  from ._subset import subset_obs, subset_var
6
9
  from ._transfer_metadata import transfer_metadata
7
10
  from ._transform import pool_neighbors
@@ -12,7 +15,10 @@ __all__ = [
12
15
  "filter_obs",
13
16
  "harmony_integrate",
14
17
  "normalize_weighted",
18
+ "pca",
15
19
  "pool_neighbors",
20
+ "preprocess",
21
+ "qc",
16
22
  "subset_obs",
17
23
  "subset_var",
18
24
  "transfer_metadata",
sclab/preprocess/_cca.py CHANGED
@@ -1,24 +1,31 @@
1
1
  import logging
2
+ import os
2
3
  from typing import Literal
3
4
 
4
5
  import numpy as np
6
+ from joblib import Parallel, delayed
5
7
  from numpy import matrix
6
8
  from numpy.typing import NDArray
7
9
  from scipy.linalg import svd
8
10
  from scipy.sparse import csc_matrix, csr_matrix, issparse
11
+ from scipy.sparse import vstack as sparse_vstack
9
12
  from scipy.sparse.linalg import svds
10
13
  from sklearn.utils.extmath import randomized_svd
11
14
 
12
15
  logger = logging.getLogger(__name__)
13
16
 
14
17
 
18
+ N_CPUS = os.cpu_count()
19
+
20
+
15
21
  def cca(
16
22
  X: NDArray | csr_matrix | csc_matrix,
17
23
  Y: NDArray | csr_matrix | csc_matrix,
18
24
  n_components=None,
19
- svd_solver: Literal["full", "partial", "randomized"] = "partial",
25
+ svd_solver: Literal["full", "partial", "randomized"] = "randomized",
20
26
  normalize: bool = False,
21
27
  random_state=42,
28
+ n_jobs: int = N_CPUS,
22
29
  ) -> tuple[NDArray, NDArray, NDArray]:
23
30
  """
24
31
  CCA-style integration for two single-cell matrices with unequal numbers of cells.
@@ -50,7 +57,7 @@ def cca(
50
57
  k = n_components or min(n1, n2)
51
58
 
52
59
  if issparse(X):
53
- C = _cross_covariance_sparse(X, Y)
60
+ C = _cross_covariance_sparse(X, Y, n_jobs=n_jobs)
54
61
  else:
55
62
  C = _cross_covariance_dense(X, Y)
56
63
 
@@ -103,7 +110,7 @@ def _svd_decomposition(
103
110
  return Uc, s, Vct
104
111
 
105
112
 
106
- def _cross_covariance_sparse(X: csr_matrix, Y: csr_matrix) -> NDArray:
113
+ def _cross_covariance_sparse(X: csr_matrix, Y: csr_matrix, n_jobs=N_CPUS) -> NDArray:
107
114
  _, p1 = X.shape
108
115
  _, p2 = Y.shape
109
116
  if p1 != p2:
@@ -118,7 +125,7 @@ def _cross_covariance_sparse(X: csr_matrix, Y: csr_matrix) -> NDArray:
118
125
  mux: matrix = X.mean(axis=0)
119
126
  muy: matrix = Y.mean(axis=0)
120
127
 
121
- XYt: csr_matrix = X.dot(Y.T)
128
+ XYt: csr_matrix = _spmm_parallel(X, Y.T, n_jobs=n_jobs)
122
129
  Xmuyt: matrix = X.dot(muy.T)
123
130
  muxYt: matrix = Y.dot(mux.T).T
124
131
  muxmuyt: float = (mux @ muy.T)[0, 0]
@@ -152,3 +159,18 @@ def _dense_scale(A: NDArray) -> NDArray:
152
159
  A = np.asarray(A)
153
160
  eps = np.finfo(A.dtype).eps
154
161
  return A / (A.std(axis=0, ddof=1, keepdims=True) + eps)
162
+
163
+
164
+ def _spmm_chunk(A_csr, X, start, stop):
165
+ return A_csr[start:stop, :] @ X
166
+
167
+
168
def _spmm_parallel(A_csr: csr_matrix, X_csc: csc_matrix, n_jobs=N_CPUS):
    """Compute ``A_csr @ X_csc`` by splitting the rows of ``A_csr`` across workers.

    Parameters
    ----------
    A_csr
        Left operand; it is sliced into contiguous row bands.
    X_csc
        Right operand, passed unchanged to every worker.
    n_jobs
        Number of workers. ``None`` means "all CPUs"; negative values follow
        the joblib convention (``-1`` all CPUs, ``-2`` all but one, ...).

    Returns
    -------
    The stacked product: sparse when the per-band products are sparse,
    a dense array otherwise.

    Raises
    ------
    ValueError
        If ``n_jobs`` is 0.
    """
    n = A_csr.shape[0]

    # os.cpu_count() may return None, so never rely on it being an int.
    cpu_total = os.cpu_count() or 1
    if n_jobs is None:
        n_jobs = cpu_total
    elif n_jobs < 0:
        n_jobs = max(cpu_total + 1 + n_jobs, 1)
    elif n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning")

    # Never create more row bands than there are rows.
    n_chunks = min(n_jobs, n)
    if n_chunks <= 1:
        # A single band needs no worker pool.
        return A_csr @ X_csc

    bounds = np.linspace(0, n, n_chunks + 1, dtype=int)
    Ys = Parallel(n_jobs=n_chunks, prefer="processes")(
        delayed(_spmm_chunk)(A_csr, X_csc, bounds[i], bounds[i + 1])
        for i in range(n_chunks)
    )

    if all(issparse(Y) for Y in Ys):
        return sparse_vstack(Ys)
    # Dense products must be stacked with numpy, not the sparse helper.
    return np.vstack([Y.toarray() if issparse(Y) else np.asarray(Y) for Y in Ys])
@@ -13,8 +13,8 @@ def cca_integrate(
13
13
  reference_batch: str | list[str] | None = None,
14
14
  mask_var: str | None = None,
15
15
  n_components: int = 30,
16
- svd_solver: str = "partial",
17
- normalize: bool = False,
16
+ svd_solver: str = "randomized",
17
+ normalize: bool = True,
18
18
  random_state: int | None = None,
19
19
  ):
20
20
  n_groups = adata.obs[key].nunique()
@@ -46,8 +46,8 @@ def cca_integrate_pair(
46
46
  adjusted_basis: str | None = None,
47
47
  mask_var: str | None = None,
48
48
  n_components: int = 30,
49
- svd_solver: str = "partial",
50
- normalize: bool = False,
49
+ svd_solver: str = "randomized",
50
+ normalize: bool = True,
51
51
  random_state: int | None = None,
52
52
  ):
53
53
  if basis is None:
@@ -9,6 +9,7 @@ def normalize_weighted(
9
9
  adata: AnnData,
10
10
  target_scale: float | None = None,
11
11
  batch_key: str | None = None,
12
+ q: float = 0.99,
12
13
  ) -> None:
13
14
  if batch_key is not None:
14
15
  for _, idx in adata.obs.groupby(batch_key, observed=True).groups.items():
@@ -22,6 +23,8 @@ def normalize_weighted(
22
23
 
23
24
  return
24
25
 
26
+ target_scale = None
27
+
25
28
  X: csr_matrix
26
29
  Y: csr_matrix
27
30
  Z: csr_matrix
@@ -38,6 +41,7 @@ def normalize_weighted(
38
41
  Y.eliminate_zeros()
39
42
  Y.data = -Y.data * np.log(Y.data)
40
43
  entropy = Y.sum(axis=0)
44
+ entropy[:, entropy.A1 < np.quantile(entropy.A1, q)] *= 0.0
41
45
 
42
46
  Z = X.multiply(entropy)
43
47
  Z = Z.tocsr()
@@ -48,7 +52,7 @@ def normalize_weighted(
48
52
  "ignore", category=RuntimeWarning, message="divide by zero"
49
53
  )
50
54
  scale = Z.sum(axis=1)
51
- Z = Z.multiply(1 / scale)
55
+ Z = X.multiply(1 / scale)
52
56
  Z = Z.tocsr()
53
57
 
54
58
  if target_scale is None:
@@ -0,0 +1,51 @@
1
+ from anndata import AnnData
2
+
3
+
4
def pca(
    adata: AnnData,
    layer: str | None = None,
    n_comps: int = 30,
    mask_var: str | None = None,
    batch_key: str | None = None,
    reference_batch: str | None = None,
    zero_center: bool = False,
):
    """Run scanpy PCA and store uncentered scores in ``adata.obsm["X_pca"]``.

    The loadings are computed with ``sc.pp.pca`` (``svd_solver="arpack"``),
    either on the whole object or, when ``reference_batch`` is given, on the
    cells of that batch only. The scores are then recomputed for all cells as
    the product of the data matrix with the loadings.

    Parameters
    ----------
    adata
        Object to update in place (``obsm["X_pca"]``, ``varm["PCs"]``,
        ``uns["pca"]``).
    layer
        Layer used to fit the components and to compute the scores; ``None``
        uses ``adata.X``.
    n_comps
        Number of components.
    mask_var
        Forwarded to ``sc.pp.pca``. With ``"highly_variable"`` and a
        reference batch, the mask is recomputed on the reference cells as the
        union of the ``seurat`` and ``seurat_v3_paper`` selections.
    batch_key
        ``adata.obs`` column holding the batch labels; required together with
        ``reference_batch``.
    reference_batch
        Batch whose cells define the components.
    zero_center
        If true, subtract the per-component mean from the stored scores.

    Raises
    ------
    ValueError
        If ``reference_batch`` is given without ``batch_key``.
    """
    import scanpy as sc

    if reference_batch and batch_key is None:
        raise ValueError("`batch_key` is required when `reference_batch` is given")

    pca_kwargs = dict(
        n_comps=n_comps,
        layer=layer,
        mask_var=mask_var,
        svd_solver="arpack",
    )

    if reference_batch:
        obs_mask = adata.obs[batch_key] == reference_batch
        adata_ref = adata[obs_mask].copy()
        if mask_var == "highly_variable":
            sc.pp.highly_variable_genes(
                adata_ref, layer=f"{layer if layer else 'X'}_log1p", flavor="seurat"
            )
            # Copy: the next call rewrites the same ``var`` column.
            hvg_seurat = adata_ref.var["highly_variable"].copy()
            sc.pp.highly_variable_genes(
                adata_ref,
                layer=layer,
                flavor="seurat_v3_paper",
                n_top_genes=hvg_seurat.sum(),
            )
            hvg_seurat_v3 = adata_ref.var["highly_variable"]
            adata_ref.var["highly_variable"] = hvg_seurat | hvg_seurat_v3

        sc.pp.pca(adata_ref, **pca_kwargs)
        uns_pca = adata_ref.uns["pca"]
        uns_pca["reference_batch"] = reference_batch
        PCs = adata_ref.varm["PCs"]
        adata.uns["pca"] = uns_pca
        adata.varm["PCs"] = PCs
    else:
        sc.pp.pca(adata, **pca_kwargs)
        PCs = adata.varm["PCs"]

    # Project the same matrix the components were fitted on.
    X = adata.X if layer is None else adata.layers[layer]
    adata.obsm["X_pca"] = X.dot(PCs)

    if zero_center:
        adata.obsm["X_pca"] -= adata.obsm["X_pca"].mean(axis=0, keepdims=True)
@@ -0,0 +1,155 @@
1
+ import warnings
2
+ from typing import Literal
3
+
4
+ import numpy as np
5
+ from anndata import AnnData, ImplicitModificationWarning
6
+ from tqdm.auto import tqdm
7
+
8
+
9
def preprocess(
    adata: AnnData,
    counts_layer: str = "counts",
    group_by: str | None = None,
    min_cells: int = 5,
    min_genes: int = 5,
    compute_hvg: bool = True,
    regress_total_counts: bool = False,
    regress_n_genes: bool = False,
    normalization_method: Literal["library", "weighted", "none"] = "library",
    target_scale: float = 1e4,
    weighted_norm_quantile: float = 0.9,
    log1p: bool = True,
    scale: bool = True,
):
    """Filter, normalize and transform ``adata`` in place with a progress bar.

    Steps, in order: ensure the counts layer and its log1p layer exist; reset
    ``X`` to the counts; compute QC metrics and filter cells/genes; optionally
    flag highly variable genes (union of the ``seurat`` and
    ``seurat_v3_paper`` selections, per group when ``group_by`` is set);
    normalize; optionally log1p, regress out covariates and scale. The final
    matrix is stored in a layer whose name is ``counts_layer`` plus one suffix
    per applied step (``_normt``/``_normw``, ``_log1p``, ``_regr``,
    ``_scale``) and ``X`` is cast to ``float32``.

    Parameters
    ----------
    adata
        Object to modify in place.
    counts_layer
        Layer holding the raw counts; created from ``X`` when missing.
    group_by
        ``adata.obs`` column used for per-group HVG selection, weighted
        normalization batches and per-group scaling.
    min_cells, min_genes
        Thresholds passed to ``sc.pp.filter_genes`` / ``sc.pp.filter_cells``.
    compute_hvg
        Whether to compute ``adata.var["highly_variable"]``.
    regress_total_counts, regress_n_genes
        Covariates to regress out (log1p of total counts when ``log1p``).
    normalization_method
        ``"library"`` (``sc.pp.normalize_total``), ``"weighted"``
        (``normalize_weighted``) or ``"none"``. Other values trigger a warning
        and no normalization.
    target_scale
        Target sum / scale forwarded to the normalization routine.
    weighted_norm_quantile
        ``q`` forwarded to ``normalize_weighted``.
    log1p
        Whether to apply ``sc.pp.log1p``.
    scale
        Whether to apply ``sc.pp.scale`` with ``zero_center=False``.
    """
    import scanpy as sc

    from ._normalize_weighted import normalize_weighted

    if normalization_method not in ("library", "weighted", "none"):
        # Previously an unknown value silently skipped normalization.
        warnings.warn(
            f"Unknown normalization_method {normalization_method!r}; "
            "no normalization will be applied.",
            stacklevel=2,
        )

    log1p_layer = f"{counts_layer}_log1p"

    # Every stage advances the bar by a fixed amount, even when its work is
    # skipped, so the bar always finishes at 100%.
    with tqdm(total=100, bar_format="{percentage:3.0f}%|{bar}|") as pbar:
        # Make sure raw counts and their log1p transform exist as layers.
        if counts_layer not in adata.layers:
            adata.layers[counts_layer] = adata.X.copy()

        if log1p_layer not in adata.layers:
            adata.layers[log1p_layer] = sc.pp.log1p(
                adata.layers[counts_layer].copy()
            )
        pbar.update(10)

        # Start from raw counts and drop low-quality cells and genes.
        adata.X = adata.layers[counts_layer].copy()
        sc.pp.calculate_qc_metrics(
            adata,
            percent_top=None,
            log1p=False,
            inplace=True,
        )
        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_genes(adata, min_cells=min_cells)
        pbar.update(10)

        # Refresh the metrics on the filtered object.
        sc.pp.calculate_qc_metrics(
            adata,
            percent_top=None,
            log1p=False,
            inplace=True,
        )
        pbar.update(10)

        if compute_hvg:
            if group_by is not None:
                adata.var["highly_variable"] = False
                groups = adata.obs.groupby(group_by, observed=True).groups
                for name, idx in groups.items():
                    hvg_seurat = sc.pp.highly_variable_genes(
                        adata[idx],
                        layer=log1p_layer,
                        flavor="seurat",
                        inplace=False,
                    )["highly_variable"]

                    hvg_seurat_v3 = sc.pp.highly_variable_genes(
                        adata[idx],
                        layer=counts_layer,
                        flavor="seurat_v3_paper",
                        n_top_genes=hvg_seurat.sum(),
                        inplace=False,
                    )["highly_variable"]

                    adata.var[f"highly_variable_{name}"] = hvg_seurat | hvg_seurat_v3
                    adata.var["highly_variable"] |= adata.var[f"highly_variable_{name}"]

            else:
                sc.pp.highly_variable_genes(
                    adata, layer=log1p_layer, flavor="seurat"
                )
                # Copy: the next call rewrites the same ``var`` column.
                hvg_seurat = adata.var["highly_variable"].copy()

                sc.pp.highly_variable_genes(
                    adata,
                    layer=counts_layer,
                    flavor="seurat_v3_paper",
                    n_top_genes=hvg_seurat.sum(),
                )
                hvg_seurat_v3 = adata.var["highly_variable"]

                adata.var["highly_variable"] = hvg_seurat | hvg_seurat_v3
        pbar.update(20)

        # The output layer name records every transformation applied to X.
        new_layer = counts_layer
        if normalization_method == "library":
            new_layer += "_normt"
            sc.pp.normalize_total(adata, target_sum=target_scale)
        elif normalization_method == "weighted":
            new_layer += "_normw"
            normalize_weighted(
                adata,
                target_scale=target_scale,
                batch_key=group_by,
                q=weighted_norm_quantile,
            )
        pbar.update(20)

        if log1p:
            new_layer += "_log1p"
            # Drop any previous marker before transforming again.
            adata.uns.pop("log1p", None)
            sc.pp.log1p(adata)
        pbar.update(10)

        vars_to_regress = []
        if regress_n_genes:
            vars_to_regress.append("n_genes_by_counts")

        if regress_total_counts and log1p:
            adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])
            vars_to_regress.append("log1p_total_counts")
        elif regress_total_counts:
            vars_to_regress.append("total_counts")

        if vars_to_regress:
            new_layer += "_regr"
            sc.pp.regress_out(adata, keys=vars_to_regress, n_jobs=1)
        pbar.update(10)

        if scale:
            new_layer += "_scale"
            if group_by is not None:
                groups = adata.obs.groupby(group_by, observed=True).groups
                with warnings.catch_warnings():
                    # Assigning to a view's X is intentional here.
                    warnings.filterwarnings(
                        "ignore",
                        category=ImplicitModificationWarning,
                        message="Modifying `X` on a view results in data being overridden",
                    )
                    for _, idx in groups.items():
                        adata[idx].X = sc.pp.scale(adata[idx].X, zero_center=False)
            else:
                sc.pp.scale(adata, zero_center=False)

        adata.layers[new_layer] = adata.X.copy()

        pbar.update(10)

    adata.X = adata.X.astype(np.float32)
@@ -0,0 +1,38 @@
1
+ import numpy as np
2
+ from anndata import AnnData
3
+
4
+
5
def qc(
    adata: AnnData,
    counts_layer: str = "counts",
    min_counts: int = 50,
    min_genes: int = 5,
    min_cells: int = 5,
    max_rank: int = 0,
):
    """Filter cells and genes of ``adata`` in place using the raw counts.

    The counts layer temporarily replaces ``X`` while cells with fewer than
    ``min_counts`` total counts are removed, scanpy QC metrics are computed,
    and cells/genes are filtered by ``min_genes`` / ``min_cells``. A
    ``barcode_rank`` column (rank of ``total_counts``, largest first) is added
    to ``adata.obs``, the previous ``X`` is restored, and, when
    ``max_rank > 0``, only cells with ``barcode_rank < max_rank`` are kept.

    Parameters
    ----------
    adata
        Object to filter in place.
    counts_layer
        Layer holding the raw counts; created from ``X`` when missing.
    min_counts
        Minimum total counts per cell.
    min_genes
        Passed to ``sc.pp.filter_cells``.
    min_cells
        Passed to ``sc.pp.filter_genes``.
    max_rank
        Barcode-rank cutoff; ``0`` disables it.
    """
    import scanpy as sc

    if counts_layer not in adata.layers:
        adata.layers[counts_layer] = adata.X.copy()

    # Park the current X in a temporary layer so it goes through the same
    # subsetting as the rest of the object and can be put back afterwards.
    adata.layers["qc_tmp_current_X"] = adata.X
    try:
        adata.X = adata.layers[counts_layer].copy()
        # ravel (not squeeze) keeps a 1-d array even for a single cell.
        rowsums = np.asarray(adata.X.sum(axis=1)).ravel()

        obs_idx = adata.obs_names[rowsums >= min_counts]
        adata._inplace_subset_obs(obs_idx)

        sc.pp.calculate_qc_metrics(
            adata, percent_top=None, log1p=False, inplace=True
        )

        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_genes(adata, min_cells=min_cells)
        sc.pp.calculate_qc_metrics(
            adata, percent_top=None, log1p=False, inplace=True
        )
        adata.obs["barcode_rank"] = adata.obs["total_counts"].rank(ascending=False)
    finally:
        # Restore original X even if a step above failed, and never leave the
        # temporary layer behind.
        adata.X = adata.layers.pop("qc_tmp_current_X")

    if max_rank > 0:
        # NOTE(review): the comparison is strict, so the cell ranked exactly
        # ``max_rank`` is dropped -- confirm this is intended.
        series = adata.obs["barcode_rank"]
        index = series.loc[series < max_rank].index
        adata._inplace_subset_obs(index)
@@ -23,18 +23,19 @@ def transfer_metadata(
23
23
  min_neighs: int = 5,
24
24
  weight_by: Literal["connectivity", "distance", "constant"] = "connectivity",
25
25
  ):
26
- D: csr_matrix = adata.obsp["distances"]
27
- C: csr_matrix = adata.obsp["connectivities"]
26
+ D: csr_matrix = adata.obsp["distances"].copy()
27
+ C: csr_matrix = adata.obsp["connectivities"].copy()
28
28
  D = D.tocsr()
29
+ W: csr_matrix
29
30
 
30
31
  match weight_by:
31
32
  case "connectivity":
32
- W = C.tocsr()
33
+ W = C.tocsr().copy()
33
34
  case "distance":
34
- W = D.tocsr()
35
+ W = D.tocsr().copy()
35
36
  W.data = 1.0 / W.data
36
37
  case "constant":
37
- W = D.tocsr()
38
+ W = D.tocsr().copy()
38
39
  W.data[:] = 1.0
39
40
  case _:
40
41
  raise ValueError(f"Unsupported weight_by {weight_by}")
@@ -280,6 +280,7 @@ def estimate_periodic_pseudotime_start(
280
280
  time_key: str = "pseudotime",
281
281
  bandwidth: float = 1 / 64,
282
282
  show_plot: bool = False,
283
+ nth_root: int = 1,
283
284
  ):
284
285
  # TODO: Test implementation
285
286
  pseudotime = adata.obs[time_key].values.copy()
@@ -316,7 +317,10 @@ def estimate_periodic_pseudotime_start(
316
317
  roots = (x[idx] + x[1:][idx]) / 2
317
318
  heights = yp[idx]
318
319
 
319
- max_peak_x = roots[heights.argmin()]
320
+ roots = roots[heights.argsort()]
321
+ heights = heights[heights.argsort()]
322
+
323
+ max_peak_x = roots[nth_root - 1]
320
324
 
321
325
  if show_plot:
322
326
  plt.hist(
@@ -1,5 +1,7 @@
1
1
  from ._pseudobulk_edger import pseudobulk_edger
2
+ from ._pseudobulk_limma import pseudobulk_limma
2
3
 
3
4
  __all__ = [
4
5
  "pseudobulk_edger",
6
+ "pseudobulk_limma",
5
7
  ]
@@ -12,9 +12,9 @@ def pseudobulk_edger(
12
12
  cell_identity_key: str | None = None,
13
13
  batch_key: str | None = None,
14
14
  layer: str | None = None,
15
- replicas_per_group: int = 10,
15
+ replicas_per_group: int = 5,
16
16
  min_cells_per_group: int = 30,
17
- bootstrap_sampling: bool = True,
17
+ bootstrap_sampling: bool = False,
18
18
  use_cells: dict[str, list[str]] | None = None,
19
19
  aggregate: bool = True,
20
20
  verbosity: int = 0,
@@ -134,7 +134,7 @@ def pseudobulk_edger(
134
134
 
135
135
  try:
136
136
  R(f"""
137
- outs <- fit_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
137
+ outs <- fit_edger_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
138
138
  fit <- outs$fit
139
139
  y <- outs$y
140
140
  """)
@@ -214,33 +214,20 @@ suppressPackageStartupMessages({
214
214
  library(MAST)
215
215
  })
216
216
 
217
- fit_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){
217
+ fit_edger_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){
218
218
 
219
219
  if (verbosity > 0){
220
220
  cat("Group key:", group_key, "\n")
221
221
  cat("Cell identity key:", cell_identity_key, "\n")
222
222
  }
223
223
 
224
- # create an edgeR object with counts and grouping factor
225
- y <- DGEList(assay(adata_, "X"), group = colData(adata_)[[group_key]])
226
- # filter out genes with low counts
227
- if (verbosity > 1){
228
- cat("Dimensions before subsetting:", dim(y), "\n")
229
- }
230
- keep <- filterByExpr(y)
231
- y <- y[keep, , keep.lib.sizes=FALSE]
232
- if (verbosity > 1){
233
- cat("Dimensions after subsetting:", dim(y), "\n")
234
- }
235
-
236
- # normalize
237
- y <- calcNormFactors(y)
238
224
  # create a vector that is concatentation of condition and cell type that we will later use with contrasts
239
225
  if (cell_identity_key == "None"){
240
226
  group <- colData(adata_)[[group_key]]
241
227
  } else {
242
228
  group <- paste0(colData(adata_)[[group_key]], "_", colData(adata_)[[cell_identity_key]])
243
229
  }
230
+
244
231
  if (verbosity > 1){
245
232
  cat("Group(s):", group, "\n")
246
233
  }
@@ -255,10 +242,28 @@ fit_model <- function(adata_, group_key, cell_identity_key = "None", batch_key =
255
242
  design <- model.matrix(~ 0 + group + replica + batch)
256
243
  }
257
244
 
245
+ # create an edgeR object with counts and grouping factor
246
+ y <- DGEList(assay(adata_, "X"), group = colData(adata_)[[group_key]])
247
+
248
+ # filter out genes with low counts
249
+ if (verbosity > 1){
250
+ cat("Dimensions before subsetting:", dim(y), "\n")
251
+ }
252
+
253
+ keep <- filterByExpr(y)
254
+ y <- y[keep, , keep.lib.sizes=FALSE]
255
+ if (verbosity > 1){
256
+ cat("Dimensions after subsetting:", dim(y), "\n")
257
+ }
258
+
259
+ # normalize
260
+ y <- calcNormFactors(y)
261
+
258
262
  # estimate dispersion
259
263
  y <- estimateDisp(y, design = design)
260
264
  # fit the model
261
265
  fit <- glmQLFit(y, design)
266
+
262
267
  return(list("fit"=fit, "design"=design, "y"=y))
263
268
  }
264
269
  """
@@ -282,9 +287,7 @@ def _try_imports():
282
287
  except ModuleNotFoundError:
283
288
  message = (
284
289
  "edger_pseudobulk requires rpy2 and anndata2ri to be installed.\n"
285
- "or\n"
286
- "$ pip install rpy2 sclab-tools[r]\n"
287
- "or\n"
290
+ "please install with one of the following:\n"
288
291
  "$ pip install rpy2 anndata2ri\n"
289
292
  "or\n"
290
293
  "$ conda install -c conda-forge rpy2 anndata2ri\n"
@@ -0,0 +1,257 @@
1
+ import pandas as pd
2
+ from anndata import AnnData
3
+
4
+ from ._pseudobulk_helpers import aggregate_and_filter
5
+
6
+
7
def pseudobulk_limma(
    adata_: AnnData,
    group_key: str,
    condition_group: str | list[str] | None = None,
    reference_group: str | None = None,
    cell_identity_key: str | None = None,
    batch_key: str | None = None,
    layer: str | None = None,
    replicas_per_group: int = 5,
    min_cells_per_group: int = 30,
    bootstrap_sampling: bool = False,
    use_cells: dict[str, list[str]] | None = None,
    aggregate: bool = True,
    verbosity: int = 0,
) -> dict[str, pd.DataFrame]:
    """Run pseudobulk differential expression with limma-voom through rpy2.

    The data are optionally aggregated into pseudobulk replicas with
    ``aggregate_and_filter``, handed to R, and fitted with the
    ``fit_limma_model`` R function. For every condition group (and every cell
    identity when ``cell_identity_key`` is set) one contrast is evaluated and
    its ``topTable`` is returned.

    Parameters
    ----------
    adata_
        Input data; it is not modified (aggregated or copied first).
    group_key
        ``obs`` column with the condition labels.
    condition_group
        Condition(s) to test; ``None`` tests every value of ``group_key``.
    reference_group
        Condition used as reference. When ``None`` each condition is compared
        against the rest, using the ``f"{group_key}_{condition}"`` column.
    cell_identity_key
        Optional ``obs`` column; contrasts are computed per identity.
    batch_key
        Optional ``obs`` column added to the design as a batch covariate.
    layer, replicas_per_group, min_cells_per_group, bootstrap_sampling, use_cells
        Forwarded to ``aggregate_and_filter``.
    aggregate
        If false, ``adata_`` is assumed to be aggregated already.
    verbosity
        Values above 0 print progress; also forwarded to the R function.

    Returns
    -------
    dict
        Contrast key -> ``topTable`` DataFrame indexed by ``gene_ids``, with
        ``pct_expr_*``, ``num_expr_*``, ``tot_expr_*`` and ``mean_*`` columns
        added for the condition and the reference (read from
        ``aggr_adata.var``; presumably written by ``aggregate_and_filter``).
        Conditions whose model fit fails in R are skipped.
    """
    _try_imports()
    import anndata2ri  # noqa: F401
    import rpy2.robjects as robjects
    from rpy2.rinterface_lib.embedded import RRuntimeError  # noqa: F401
    from rpy2.robjects import pandas2ri  # noqa: F401
    from rpy2.robjects.conversion import localconverter  # noqa: F401

    R = robjects.r

    if aggregate:
        aggr_adata = aggregate_and_filter(
            adata_,
            group_key,
            cell_identity_key,
            layer,
            replicas_per_group,
            min_cells_per_group,
            bootstrap_sampling,
            use_cells,
        )
    else:
        aggr_adata = adata_.copy()

    with localconverter(anndata2ri.converter):
        R.assign("aggr_adata", aggr_adata)

    # defines the R function for fitting the model with limma
    R(_fit_model_r_script)

    if condition_group is None:
        condition_group_list = aggr_adata.obs[group_key].unique()
    elif isinstance(condition_group, str):
        condition_group_list = [condition_group]
    else:
        condition_group_list = condition_group

    if cell_identity_key is not None:
        cids = aggr_adata.obs[cell_identity_key].unique()
    else:
        cids = [""]

    tt_dict = {}
    for cond in condition_group_list:
        if reference_group is not None and cond == reference_group:
            continue

        if verbosity > 0:
            print(f"Fitting model for {cond}...")

        if reference_group is not None:
            gk = group_key
        else:
            gk = f"{group_key}_{cond}"

        try:
            # batch_key must be forwarded, otherwise the R function falls back
            # to its "None" default and the batch covariate is dropped.
            R(f"""
            outs <- fit_limma_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
            fit <- outs$fit
            v <- outs$v
            """)

        except RRuntimeError as e:
            print("Error fitting model for", cond)
            print("Error:", e)
            print("Skipping...", flush=True)
            continue

        # (common prefix, condition group, reference group, cell identity)
        if reference_group is None:
            contrast_tuples = [(cond, "", "not", cid) for cid in cids]
        else:
            contrast_tuples = [("", cond, reference_group, cid) for cid in cids]

        for prefix, cnd, ref, cid in contrast_tuples:
            # Group labels as they appear (after the "group" prefix) in the
            # design columns and in the ``aggr_adata.var`` statistics.
            cnd_label = f"{cnd}{prefix}_{cid}".rstrip("_")
            ref_label = f"{ref}{prefix}_{cid}".rstrip("_")
            contrast = f"group{cnd_label}-group{ref_label}"

            if ref == "not":
                cnd, ref = "", "rest"

            contrast_key = f"{prefix}{cnd}_vs_{ref}"
            if cid:
                contrast_key = f"{cell_identity_key}:{cid}|{contrast_key}"

            if verbosity > 0:
                print(f"Computing contrast: {contrast_key}... ({contrast})")

            R(f"myContrast <- makeContrasts('{contrast}', levels = v$design)")
            R("fit2 <- contrasts.fit(fit, myContrast)")
            R("fit2 <- eBayes(fit2)")
            R("tt <- topTable(fit2, n = Inf)")
            tt: pd.DataFrame = pandas2ri.rpy2py(R("tt"))
            tt.index.name = "gene_ids"

            genes = tt.index
            var = aggr_adata.var
            tt["pct_expr_cnd"] = var[f"pct_expr_{cnd_label}"].loc[genes]
            tt["pct_expr_ref"] = var[f"pct_expr_{ref_label}"].loc[genes]
            tt["num_expr_cnd"] = var[f"num_expr_{cnd_label}"].loc[genes]
            tt["num_expr_ref"] = var[f"num_expr_{ref_label}"].loc[genes]
            tt["tot_expr_cnd"] = var[f"tot_expr_{cnd_label}"].loc[genes]
            tt["tot_expr_ref"] = var[f"tot_expr_{ref_label}"].loc[genes]
            tt["mean_cnd"] = tt["tot_expr_cnd"] / tt["num_expr_cnd"]
            tt["mean_ref"] = tt["tot_expr_ref"] / tt["num_expr_ref"]
            tt_dict[contrast_key] = tt

    return tt_dict
151
+
152
+
153
# R source evaluated by ``pseudobulk_limma`` to define ``fit_limma_model``.
# The R function builds a ``group`` factor (condition, optionally pasted with
# the cell identity), a design matrix ``~ 0 + group + replica`` (plus
# ``batch`` when ``batch_key`` is not "None"), filters genes with
# ``filterByExpr``, normalizes with ``calcNormFactors``, applies ``voom`` and
# fits with ``lmFit``/``eBayes``. It returns a list with "fit", "design", "v".
_fit_model_r_script = """
suppressPackageStartupMessages({
library(edgeR)
library(limma)
library(MAST)
})

fit_limma_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){

if (verbosity > 0){
cat("Group key:", group_key, "\n")
cat("Cell identity key:", cell_identity_key, "\n")
}

# create a vector that is concatentation of condition and cell type that we will later use with contrasts
if (cell_identity_key == "None"){
group <- colData(adata_)[[group_key]]
} else {
group <- paste0(colData(adata_)[[group_key]], "_", colData(adata_)[[cell_identity_key]])
}

if (verbosity > 1){
cat("Group(s):", group, "\n")
}

group <- factor(group)
replica <- factor(colData(adata_)$replica)

# create a design matrix
if (batch_key == "None"){
design <- model.matrix(~ 0 + group + replica)
} else {
batch <- factor(colData(adata_)[[batch_key]])
design <- model.matrix(~ 0 + group + replica + batch)
}
colnames(design) <- make.names(colnames(design))

# create an edgeR object with counts and grouping factor
y <- DGEList(assay(adata_, "X"), group = group)

# filter out genes with low counts
if (verbosity > 1){
cat("Dimensions before subsetting:", dim(y), "\n")
}

keep <- filterByExpr(y, design = design)
y <- y[keep, , keep.lib.sizes=FALSE]
if (verbosity > 1){
cat("Dimensions after subsetting:", dim(y), "\n")
}

# normalize
y <- calcNormFactors(y)

# Apply voom transformation to prepare for linear modeling
v <- voom(y, design, plot = verbosity > 1)

# Fit the linear model
fit <- lmFit(v, design)
ne <- limma::nonEstimable(design)
if (!is.null(ne) && verbosity > 0) cat("Non-estimable:", ne, "\n")
fit <- eBayes(fit)

return(list("fit"=fit, "design"=design, "v"=v))
}
"""
219
+
220
+
221
+ def _try_imports():
222
+ try:
223
+ import rpy2.robjects as robjects
224
+ from rpy2.robjects.packages import PackageNotInstalledError, importr
225
+
226
+ robjects.r("options(warn=-1)")
227
+ import anndata2ri # noqa: F401
228
+ from rpy2.rinterface_lib.embedded import RRuntimeError # noqa: F401
229
+ from rpy2.robjects import numpy2ri, pandas2ri # noqa: F401
230
+ from rpy2.robjects.conversion import localconverter # noqa: F401
231
+
232
+ importr("edgeR")
233
+ importr("limma")
234
+ importr("MAST")
235
+ importr("SingleCellExperiment")
236
+
237
+ except ModuleNotFoundError:
238
+ message = (
239
+ "pseudobulk_limma requires rpy2 and anndata2ri to be installed.\n"
240
+ "please install with one of the following:\n"
241
+ "$ pip install rpy2 anndata2ri\n"
242
+ "or\n"
243
+ "$ conda install -c conda-forge rpy2 anndata2ri\n"
244
+ )
245
+ print(message)
246
+ raise ModuleNotFoundError(message)
247
+
248
+ except PackageNotInstalledError:
249
+ message = (
250
+ "pseudobulk_limma requires the following R packages to be installed: limma, edgeR, MAST, and SingleCellExperiment.\n"
251
+ "> \n"
252
+ "> if (!require('BiocManager', quietly = TRUE)) install.packages('BiocManager');\n"
253
+ "> BiocManager::install(c('limma', 'edgeR', 'MAST', 'SingleCellExperiment'));\n"
254
+ "> \n"
255
+ )
256
+ print(message)
257
+ raise ImportError(message)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sclab
3
- Version: 0.3.2
3
+ Version: 0.3.3
4
4
  Summary: sclab
5
5
  Author-email: Argenis Arriojas <ArriojasMaldonado001@umb.edu>
6
6
  Requires-Python: >=3.10,<3.13
@@ -65,7 +65,6 @@ Open a Jupyter Notebook and run the following:
65
65
  ```python
66
66
  from IPython.display import display
67
67
  from sclab import SCLabDashboard
68
- from sclab.examples.processor_steps import QC, Preprocess, PCA, Neighbors, UMAP, Cluster
69
68
  import scanpy as sc
70
69
 
71
70
  # Load your data
@@ -73,8 +72,6 @@ adata = sc.read_10x_h5("your_data.h5")
73
72
 
74
73
  # Create dashboard
75
74
  dashboard = SCLabDashboard(adata, name="My Analysis")
76
- # Add desired processing steps to the interface
77
- dashboard.pr.add_steps({"Processing": [QC, Preprocess, PCA, Neighbors, UMAP, Cluster]})
78
75
 
79
76
  # Display dashboard
80
77
  display(dashboard)
@@ -84,8 +81,10 @@ display(dashboard)
84
81
  # dashboard.pl # Plotter
85
82
  # dashboard.pr # Processor
86
83
 
87
- # the resulting AnnData object is found within the dataset object:
84
+ # the active AnnData object is found within the dataset object:
88
85
  # dashboard.ds.adata
86
+
87
+ # by default, the dashboard will update the loaded AnnData object in-place
89
88
  ```
90
89
 
91
90
  ## Components
@@ -94,6 +93,7 @@ display(dashboard)
94
93
 
95
94
  The main interface that integrates all components with a tabbed layout:
96
95
  - Main graph for visualizations
96
+ - Results panel
97
97
  - Observations table
98
98
  - Genes table
99
99
  - Event logs
@@ -1,4 +1,4 @@
1
- sclab/__init__.py,sha256=3ni3MpADkty43xRMRFsxvB_jIrmOjxyMKFGhHsYx8Ho,132
1
+ sclab/__init__.py,sha256=HL5i9EwSQftjS4e4qKLVbTHNbKXTGgFhLoY0r1pf938,132
2
2
  sclab/_io.py,sha256=5ISxIPbE233UiOt3QEs9fkLO8DLLEe5HrMnZoR-KLYE,2662
3
3
  sclab/_methods_registry.py,sha256=RcffyRuuLzHqsnAdbBL4W1GmZx80d9AxdGjUnx1mbNg,1704
4
4
  sclab/_sclab.py,sha256=m9y2EgDxFO5JHZAZIK1098bHdrZxaeWfBZNyGQkFCdA,9143
@@ -36,15 +36,18 @@ sclab/gui/components/__init__.py,sha256=X0-cGJmII76qpWHEBe49miS2gPw3esMAwp61z23w
36
36
  sclab/gui/components/_guided_pseudotime.py,sha256=sxI0jmZxD4fxV9CsTJONhINWzIL--YF3nDg0nku2yp8,17670
37
37
  sclab/gui/components/_transfer_metadata.py,sha256=o9t9bQ7tr3G2-vFptUu1IiMxGhvMq-QJb_lF7AsDhQQ,6236
38
38
  sclab/methods/__init__.py,sha256=d_n5SCyzwMEBZttXwnhgkx2FnD7AxhAK9yBre6Rynfk,1215
39
- sclab/preprocess/__init__.py,sha256=zGt-TIDRx3qoKDpFxqnZ9yf2AfP4HBy8ZZSFLx-rGj0,547
40
- sclab/preprocess/_cca.py,sha256=etDk1s3rKX-r0nkRuUaLTJ7NyalauCpWXyKRq1_EwTc,4400
41
- sclab/preprocess/_cca_integrate.py,sha256=14Tu6TyH7wfZYAM2EsII1R92PTxfhaYbGx4bLWlMLP0,2706
39
+ sclab/preprocess/__init__.py,sha256=NrOFnk9olVkwC9mR5orduyY7rMxAgi9Bgywo9-_Sfkk,664
40
+ sclab/preprocess/_cca.py,sha256=77J_v5IJnHsLJnnhYmKtq2e_mJwQlsdGgB6lSIftf4Q,5080
41
+ sclab/preprocess/_cca_integrate.py,sha256=eIvEdUon7OkNY-kbEqRJeuIaj_m6wk72PcRFow1kH9g,2710
42
42
  sclab/preprocess/_filter_obs.py,sha256=uYlcljuaq85G44Si8oxrNRcCCX2nFRdT3RN3ArqnwaY,1166
43
43
  sclab/preprocess/_harmony.py,sha256=wpFQXpr13BvljT04I_Rw5JdBvhzvAuinkrRs152CfvQ,13747
44
44
  sclab/preprocess/_harmony_integrate.py,sha256=cKN_MyYq9FtwgQZyhgxiFNTZl36YuKLQdEoc7ky-ea4,1737
45
- sclab/preprocess/_normalize_weighted.py,sha256=h86rQakNoXoRKOepAEoBbBtHV_F-VMG2-uW_LcaSdWs,1587
45
+ sclab/preprocess/_normalize_weighted.py,sha256=Y_Tk3Dvv-Dd4s6D7JkAsuPLCCucrgmnAZBSrDssNFMU,1696
46
+ sclab/preprocess/_pca.py,sha256=fFR03B1_V7CLcZXSn5Ek9HqG_zSLwVK7DUf8XgPZIoQ,1606
47
+ sclab/preprocess/_preprocess.py,sha256=-Ve6HLbyzuqxwoUW1rd1JGb_ZWRwo4KV6ri7l-hvVjY,5200
48
+ sclab/preprocess/_qc.py,sha256=CblkoK0CB2bkjgLuGAxcyYb89ZjhhS5NEhO4RgIQI4I,1159
46
49
  sclab/preprocess/_subset.py,sha256=8Vc5jty8WzIf8NZ1mleqNJLAp5CRWvEXGVevlT6ekNk,7066
47
- sclab/preprocess/_transfer_metadata.py,sha256=HA11JHHpq4ueFTeXlU4K3kHDDUzcUjyvfxpzdPBRNTo,4307
50
+ sclab/preprocess/_transfer_metadata.py,sha256=loNsBG2HnQCE2-miu7cVPD6AHfj7yAMgxKWWndoDfA8,4360
48
51
  sclab/preprocess/_transform.py,sha256=n2xHJR3T-rRxZneCFH2oMw9RaQcGGBfHpOu1-YP1c1E,2312
49
52
  sclab/preprocess/_utils.py,sha256=dLeS_fIvQGZZfesEtbJKtnPmqjqy3fmyTC4GewRD3Fc,3078
50
53
  sclab/scanpy/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -60,23 +63,24 @@ sclab/tools/cellflow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
60
63
  sclab/tools/cellflow/density_dynamics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
61
64
  sclab/tools/cellflow/density_dynamics/_density_dynamics.py,sha256=xzmeIAHLV5xIVERpWMClZViDpJcge_dPsx6GWI0j0R8,11038
62
65
  sclab/tools/cellflow/pseudotime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
- sclab/tools/cellflow/pseudotime/_pseudotime.py,sha256=RiwbOduu0InNdh2Rp8DUXG6-r5wL_vVUGTyTkwnYhTY,9842
66
+ sclab/tools/cellflow/pseudotime/_pseudotime.py,sha256=Wud0ooPtNmManLsNiUgMHtx5TzIHs6vIXsWTu2M1hE0,9940
64
67
  sclab/tools/cellflow/pseudotime/timeseries.py,sha256=ZuMAm9LOKksJy2FzsQg3rdYKtLm1G0rbgO6dOdQuIV0,6326
65
68
  sclab/tools/cellflow/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
69
  sclab/tools/cellflow/utils/density_nd.py,sha256=wwYoXOcF2CRxOArW_CQxJjWDE90wiui4NO7EIBG2RGM,6648
67
70
  sclab/tools/cellflow/utils/interpolate.py,sha256=HnpYEBdc4KSPC4QYOglJ2MpipLx3a5ENJQ8uhMnuwRc,9755
68
71
  sclab/tools/cellflow/utils/smoothen.py,sha256=yg2_zBrYKGRmXZY8C3pKmX3xGm0GGMI365IKqhgCmP0,3738
69
72
  sclab/tools/cellflow/utils/times.py,sha256=lV5ZRjCdBaYELGJ1pGdEBeA0w-WD77lOzPC6R7_kUxo,1811
70
- sclab/tools/differential_expression/__init__.py,sha256=KKaDzeGGgE1LxnC5aBcPQYcVX_e2h8qAbfMPQVvYlSA,87
71
- sclab/tools/differential_expression/_pseudobulk_edger.py,sha256=WWqKEA8I1I_YsgjafeznX3tyZoXyu0HV13QkxiYGFgQ,10538
73
+ sclab/tools/differential_expression/__init__.py,sha256=xdmdaCYdJDxiY5g4o8mQHCk4Z6gUZv92gvLiF81nA-M,159
74
+ sclab/tools/differential_expression/_pseudobulk_edger.py,sha256=W5rKnFuLCKTRiuUe3ugRaKfXUvmMS0uAFrYD2bNAq9w,10525
72
75
  sclab/tools/differential_expression/_pseudobulk_helpers.py,sha256=raQ0DBBrmrxBbGTKhOyZpZLmeJRX_tWcn3_mzuQctkw,8424
76
+ sclab/tools/differential_expression/_pseudobulk_limma.py,sha256=Hf864a424CGvPBmogjcwEA-7eJKLeVFU44JFWKX51cY,8416
73
77
  sclab/tools/doublet_detection/__init__.py,sha256=zWyAPScrHVRaBqWaizVsm2H3oi6yr0OQ5gF-fGY2ZrA,63
74
78
  sclab/tools/doublet_detection/_scrublet.py,sha256=koi6MRUS1lWVvdpeNbzpR8njqVFrWEuWoKNMFXQLFec,1953
75
79
  sclab/tools/labeling/__init__.py,sha256=o-FJWonGNr2h_pB0o3YfnGl_y1kKU06_rYLmTt8ktlQ,57
76
80
  sclab/tools/labeling/sctype.py,sha256=jCsCFnqUgb_s1nTSK-N_5pEL_ZvZw-zUo12fUy9RLfs,8164
77
81
  sclab/utils/__init__.py,sha256=Py3dPN9ptMs6D-f7IGYisoxOS2YuX0O1oyw75nci3Os,72
78
82
  sclab/utils/_write_excel.py,sha256=DBZg9Kx7Ex6VqFrZFDZbSgvzMtu84iEwKo4nI3I2AT0,17017
79
- sclab-0.3.2.dist-info/licenses/LICENSE,sha256=LO7qldZoHIo9hc-HMBqclBh5800kZ9US9xTbLAQdHpg,1523
80
- sclab-0.3.2.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
81
- sclab-0.3.2.dist-info/METADATA,sha256=YOX9WvDuWL1ew3JOIaAwY6MvpCHCEHxOdTc6dTCp9jM,4437
82
- sclab-0.3.2.dist-info/RECORD,,
83
+ sclab-0.3.3.dist-info/licenses/LICENSE,sha256=LO7qldZoHIo9hc-HMBqclBh5800kZ9US9xTbLAQdHpg,1523
84
+ sclab-0.3.3.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
85
+ sclab-0.3.3.dist-info/METADATA,sha256=E1rrLRdCRHpdTXbHDas0mhtJwO50BsfDgGJCcISo8Q0,4301
86
+ sclab-0.3.3.dist-info/RECORD,,
File without changes