sclab 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  from collections import Counter
  from functools import partial
- from typing import Literal
+ from typing import Callable, Literal
 
  import numpy as np
  import pandas as pd
@@ -23,45 +23,79 @@ def transfer_metadata(
  min_neighs: int = 5,
  weight_by: Literal["connectivity", "distance", "constant"] = "connectivity",
  ):
- D: csr_matrix = adata.obsp["distances"]
- C: csr_matrix = adata.obsp["connectivities"]
- D = D.tocsr()
-
- match weight_by:
- case "connectivity":
- W = C.tocsr()
- case "distance":
- W = D.tocsr()
- W.data = 1.0 / W.data
- case "constant":
- W = D.tocsr()
- W.data[:] = 1.0
- case _:
- raise ValueError(f"Unsupported weight_by {weight_by}")
-
- meta_values: pd.Series
- new_values: pd.Series
+ new_values, new_values_err = _propagate_metadata(
+ adata,
+ column=column,
+ periodic=periodic,
+ vmin=vmin,
+ vmax=vmax,
+ min_neighs=min_neighs,
+ weight_by=weight_by,
+ mask=adata.obs[group_key] != source_group,
+ )
+
+ adata.obs[f"transferred_{new_values.name}"] = new_values
+ adata.obs[f"transferred_{new_values_err.name}"] = new_values_err
+
+
+ def propagate_metadata(
+ adata: AnnData,
+ column: str,
+ periodic: bool = False,
+ vmin: float = 0,
+ vmax: float = 1,
+ min_neighs: int = 5,
+ weight_by: Literal["connectivity", "distance", "constant"] = "connectivity",
+ ):
+ new_values, new_values_err = _propagate_metadata(
+ adata,
+ column=column,
+ periodic=periodic,
+ vmin=vmin,
+ vmax=vmax,
+ min_neighs=min_neighs,
+ weight_by=weight_by,
+ )
+
+ mask = adata.obs[column].isna()
+ adata.obs.loc[mask, column] = new_values.loc[mask]
+ adata.obs.loc[mask, new_values_err.name] = new_values_err.loc[mask]
+
+
+ def _propagate_metadata(
+ adata: AnnData,
+ column: str,
+ periodic: bool = False,
+ vmin: float = 0,
+ vmax: float = 1,
+ min_neighs: int = 5,
+ weight_by: Literal["connectivity", "distance", "constant"] = "connectivity",
+ mask: np.ndarray | pd.Series | None = None,
+ ) -> tuple[pd.Series, pd.Series]:
+ D, W = _get_neighbors_and_weights(adata, weight_by=weight_by)
 
+ assign_value_fn: Callable
  series = adata.obs[column]
  if isinstance(series.dtype, pd.CategoricalDtype) or is_bool_dtype(series.dtype):
  assign_value_fn = _assign_categorical
- new_column = f"transferred_{column}"
- new_column_err = f"transferred_{column}_proportion"
  elif is_numeric_dtype(series.dtype) and periodic:
  assign_value_fn = partial(_assign_numerical_periodic, vmin=vmin, vmax=vmax)
- new_column = f"transferred_{column}"
- new_column_err = f"transferred_{column}_error"
  elif is_numeric_dtype(series.dtype):
  assign_value_fn = _assign_numerical
- new_column = f"transferred_{column}"
- new_column_err = f"transferred_{column}_error"
  else:
  raise ValueError(f"Unsupported dtype {series.dtype} for column {column}")
 
- meta_values = series.copy()
- meta_values[adata.obs[group_key] != source_group] = np.nan
- new_values = pd.Series(index=series.index, dtype=series.dtype, name=new_column)
- new_values_err = pd.Series(index=series.index, dtype=float, name=new_column_err)
+ if isinstance(series.dtype, pd.CategoricalDtype) or is_bool_dtype(series.dtype):
+ column_err = f"{column}_proportion"
+ else:
+ column_err = f"{column}_error"
+
+ meta_values: pd.Series = series.copy()
+ if mask is not None:
+ meta_values[mask] = pd.NA
+
+ new_values = pd.Series(index=series.index, dtype=series.dtype, name=column)
+ new_values_err = pd.Series(index=series.index, dtype=float, name=column_err)
 
  for i, (d, w) in tqdm(enumerate(zip(D, W)), total=D.shape[0]):
  if not pd.isna(meta_values.iloc[i]):
@@ -86,8 +120,33 @@ def transfer_metadata(
  new_values.iloc[i] = assigned_value
  new_values_err.iloc[i] = assigned_value_err
 
- adata.obs[new_column] = new_values.copy()
- adata.obs[new_column_err] = new_values_err.copy()
+ new_values = pd.concat([new_values, meta_values], axis=1).bfill(axis=1).iloc[:, 0]
+
+ return new_values, new_values_err
+
+
+ def _get_neighbors_and_weights(
+ adata: AnnData,
+ weight_by: Literal["connectivity", "distance", "constant"] = "connectivity",
+ ):
+ D: csr_matrix = adata.obsp["distances"].copy()
+ C: csr_matrix = adata.obsp["connectivities"].copy()
+ D = D.tocsr()
+ W: csr_matrix
+
+ match weight_by:
+ case "connectivity":
+ W = C.tocsr().copy()
+ case "distance":
+ W = D.tocsr().copy()
+ W.data = 1.0 / W.data
+ case "constant":
+ W = D.tocsr().copy()
+ W.data[:] = 1.0
+ case _:
+ raise ValueError(f"Unsupported weight_by {weight_by}")
+
+ return D, W
 
 
  def _assign_categorical(values: pd.Series, weights: NDArray):
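
Note: the refactor above splits the neighbor-weighting logic out of transfer_metadata into _propagate_metadata and _get_neighbors_and_weights, and adds a public propagate_metadata that fills missing entries of an existing obs column in place. A minimal usage sketch, assuming a scanpy-style neighbors graph already exists in adata.obsp and using placeholder column and group names:

    import scanpy as sc  # assumed; any routine that fills obsp["distances"]/"connectivities" works

    sc.pp.neighbors(adata, n_neighbors=15)  # writes obsp["distances"] and obsp["connectivities"]

    # fill NaN entries of an annotated obs column from graph neighbors (new in 0.3.4)
    propagate_metadata(adata, column="cell_type", weight_by="connectivity", min_neighs=5)

    # transfer labels from a reference group to the remaining cells; writes
    # obs["transferred_cell_type"] and obs["transferred_cell_type_proportion"]
    transfer_metadata(
        adata,
        group_key="dataset",       # placeholder column
        source_group="reference",  # placeholder group label
        column="cell_type",
        weight_by="connectivity",
    )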
@@ -280,6 +280,7 @@ def estimate_periodic_pseudotime_start(
  time_key: str = "pseudotime",
  bandwidth: float = 1 / 64,
  show_plot: bool = False,
+ nth_root: int = 1,
  ):
  # TODO: Test implementation
  pseudotime = adata.obs[time_key].values.copy()
@@ -316,7 +317,10 @@ def estimate_periodic_pseudotime_start(
  roots = (x[idx] + x[1:][idx]) / 2
  heights = yp[idx]
 
- max_peak_x = roots[heights.argmin()]
+ roots = roots[heights.argsort()]
+ heights = heights[heights.argsort()]
+
+ max_peak_x = roots[nth_root - 1]
 
  if show_plot:
  plt.hist(
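
The nth_root parameter added here picks among the candidate start points after ordering them by the height of the density derivative at each root; nth_root=1 reproduces the previous argmin behaviour. A small self-contained illustration of that selection, with made-up numbers:

    import numpy as np

    roots = np.array([0.12, 0.47, 0.81])    # candidate pseudotime start points
    heights = np.array([-2.0, -5.0, -1.0])  # derivative heights at those roots

    order = heights.argsort()               # deepest (most negative) first
    roots, heights = roots[order], heights[order]

    nth_root = 1
    start = roots[nth_root - 1]             # 0.47, the deepest candidate (old argmin behaviour)
    alternative = roots[2 - 1]              # nth_root=2 would pick 0.12, the next deepest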
@@ -0,0 +1,106 @@
+ import numpy as np
+ import pandas as pd
+ from anndata import AnnData
+ from numpy.typing import NDArray
+ from scipy.signal import get_window, periodogram
+ from scipy.sparse import spmatrix
+
+ from sclab.tools.utils import aggregate_and_filter
+
+
+ def periodic_genes(
+ adata: AnnData,
+ time_key: str,
+ tmin: float,
+ tmax: float,
+ period: float,
+ n: int,
+ min_pct_power_below: float = 0.75,
+ layer: str | None = None,
+ ):
+ times = adata.obs[time_key].values.copy()
+ if layer is None or layer == "X":
+ X = adata.X
+ else:
+ X = adata.layers[layer]
+
+ _assert_integer_counts(X)
+
+ tmp_adata = AnnData(X, obs=adata.obs[[time_key]], var=adata.var[[]])
+
+ w = (tmax - tmin) / n
+ bins = np.arange(-w / 2 + tmin, tmax, w)
+ labels = list(map(lambda x: f"{x:.2f}", bins[:-1] + w / 2))
+
+ times[times >= bins.max()] = times[times >= bins.max()] - tmax
+ tmp_adata.obs["timepoint"] = pd.cut(times, bins=bins, labels=labels)
+ aggregated = aggregate_and_filter(
+ tmp_adata,
+ "timepoint",
+ replicas_per_group=1,
+ make_stats=False,
+ make_dummies=False,
+ )
+ log_cnts = np.log1p(aggregated.X)
+ profiles = pd.DataFrame(log_cnts, index=labels, columns=aggregated.var_names)
+ ps = power_spectrum_df(profiles)
+ pp = pct_power_below(ps, 1 / period)
+
+ adata.varm["profile"] = profiles.T
+ adata.varm["periodogram"] = ps.T
+ adata.var["pct_power_below"] = pp
+ adata.var["periodic"] = pp > min_pct_power_below
+
+
+ def _assert_integer_counts(X: spmatrix | NDArray):
+ message = "Periodic genes requires raw integer counts. E.g. `layer = 'counts'`."
+ if isinstance(X, spmatrix):
+ assert all(X.data % 1 == 0), message
+ else:
+ assert all(X % 1 == 0), message
+
+
+ def infer_dt_from_index(idx: pd.Index) -> float:
+ # Works for numeric or datetime indexes
+ if isinstance(idx, pd.DatetimeIndex):
+ dt = np.median(np.diff(idx.view("i8"))) / 1e9 # seconds
+ else:
+ dt = float(np.median(np.diff(idx.values.astype(float))))
+ return dt
+
+
+ def power_spectrum_df(X: pd.DataFrame, window: str = "hann", detrend: str = "constant"):
+ # X: rows=timepoints, columns=variables
+ Xd = X - X.mean() # remove DC so percent computations are stable
+ dt = infer_dt_from_index(X.index) if X.index.size > 1 else 1.0
+ fs = 1.0 / dt
+ win = get_window(window, X.shape[0], fftbins=True)
+
+ # Build a tidy dataframe of periodograms for all columns
+ out = {}
+ for c in Xd.columns:
+ f, Pxx = periodogram(
+ Xd[c].values,
+ fs=fs,
+ window=win,
+ detrend=detrend,
+ scaling="spectrum", # integrates to variance
+ return_onesided=True,
+ )
+ out[c] = Pxx
+ ps = pd.DataFrame(out, index=pd.Index(f, name="frequency"))
+ return ps # units: (data units)^2, integrates (sum * df) to variance per column
+
+
+ def pct_power_below(ps: pd.DataFrame, max_freq: float) -> pd.Series:
+ # ps is spectrum from power_spectrum_df (one-sided, DC included but we demeaned)
+ # Compute integrals via the rectangle rule: sum * df (df = freq spacing)
+ if len(ps.index) < 2:
+ return pd.Series({c: np.nan for c in ps.columns}, name="pct_power_at_low_freq")
+ df = ps.index[1] - ps.index[0]
+ mask_low = ps.index <= max_freq
+ num: pd.Series = ps.loc[mask_low].sum() * df
+ den: pd.Series = ps.sum() * df
+ s = num / den
+ s.name = "pct_power_at_low_freq"
+ return s
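
The new module bins cells along a periodic pseudotime, builds pseudobulk expression profiles per bin, and flags genes whose spectral power is concentrated at frequencies at or below 1/period. A hedged usage sketch, assuming a pseudotime in [0, 1) stored in obs["pseudotime"] and raw integer counts in layers["counts"] (all names are placeholders):

    periodic_genes(
        adata,
        time_key="pseudotime",
        tmin=0.0,
        tmax=1.0,
        period=1.0,                  # one full cycle over the pseudotime range
        n=32,                        # number of time bins for the pseudobulk profiles
        min_pct_power_below=0.75,
        layer="counts",              # must hold raw integer counts
    )

    # results are written back onto adata
    candidates = adata.var_names[adata.var["periodic"]]
    profiles = adata.varm["profile"]       # binned log1p pseudobulk profiles (genes x bins)
    spectra = adata.varm["periodogram"]    # one-sided power spectra (genes x frequencies)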
@@ -1,5 +1,7 @@
  from ._pseudobulk_edger import pseudobulk_edger
+ from ._pseudobulk_limma import pseudobulk_limma
 
  __all__ = [
  "pseudobulk_edger",
+ "pseudobulk_limma",
  ]
@@ -12,9 +12,9 @@ def pseudobulk_edger(
  cell_identity_key: str | None = None,
  batch_key: str | None = None,
  layer: str | None = None,
- replicas_per_group: int = 10,
+ replicas_per_group: int = 5,
  min_cells_per_group: int = 30,
- bootstrap_sampling: bool = True,
+ bootstrap_sampling: bool = False,
  use_cells: dict[str, list[str]] | None = None,
  aggregate: bool = True,
  verbosity: int = 0,
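
Two pseudobulk_edger defaults change here (replicas_per_group 10 to 5, bootstrap_sampling True to False); passing the old values explicitly restores the 0.3.2 behaviour. A sketch; any parameter not shown in this hunk, such as the grouping column, is an assumption:

    pseudobulk_edger(
        adata,
        group_key="condition",     # assumed parameter and placeholder column, not shown in this hunk
        replicas_per_group=10,     # 0.3.2 default
        bootstrap_sampling=True,   # 0.3.2 default
    )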
@@ -134,7 +134,7 @@ def pseudobulk_edger(
 
  try:
  R(f"""
- outs <- fit_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
+ outs <- fit_edger_model(aggr_adata, "{gk}", "{cell_identity_key}", "{batch_key}", verbosity = {verbosity})
  fit <- outs$fit
  y <- outs$y
  """)
@@ -214,51 +214,58 @@ suppressPackageStartupMessages({
  library(MAST)
  })
 
- fit_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){
+ fit_edger_model <- function(adata_, group_key, cell_identity_key = "None", batch_key = "None", verbosity = 0){
 
  if (verbosity > 0){
  cat("Group key:", group_key, "\n")
  cat("Cell identity key:", cell_identity_key, "\n")
  }
 
- # create an edgeR object with counts and grouping factor
- y <- DGEList(assay(adata_, "X"), group = colData(adata_)[[group_key]])
- # filter out genes with low counts
- if (verbosity > 1){
- cat("Dimensions before subsetting:", dim(y), "\n")
- }
- keep <- filterByExpr(y)
- y <- y[keep, , keep.lib.sizes=FALSE]
- if (verbosity > 1){
- cat("Dimensions after subsetting:", dim(y), "\n")
- }
-
- # normalize
- y <- calcNormFactors(y)
  # create a vector that is concatentation of condition and cell type that we will later use with contrasts
  if (cell_identity_key == "None"){
  group <- colData(adata_)[[group_key]]
  } else {
  group <- paste0(colData(adata_)[[group_key]], "_", colData(adata_)[[cell_identity_key]])
  }
+
  if (verbosity > 1){
  cat("Group(s):", group, "\n")
  }
 
- replica <- colData(adata_)$replica
+ group <- factor(group)
+ replica <- factor(colData(adata_)$replica)
 
  # create a design matrix
  if (batch_key == "None"){
  design <- model.matrix(~ 0 + group + replica)
  } else {
- batch <- colData(adata_)[[batch_key]]
+ batch <- factor(colData(adata_)[[batch_key]])
  design <- model.matrix(~ 0 + group + replica + batch)
  }
+ colnames(design) <- make.names(colnames(design))
+
+ # create an edgeR object with counts and grouping factor
+ y <- DGEList(assay(adata_, "X"), group = group)
+
+ # filter out genes with low counts
+ if (verbosity > 1){
+ cat("Dimensions before subsetting:", dim(y), "\n")
+ }
+
+ keep <- filterByExpr(y, design = design)
+ y <- y[keep, , keep.lib.sizes=FALSE]
+ if (verbosity > 1){
+ cat("Dimensions after subsetting:", dim(y), "\n")
+ }
+
+ # normalize
+ y <- calcNormFactors(y)
 
  # estimate dispersion
  y <- estimateDisp(y, design = design)
  # fit the model
  fit <- glmQLFit(y, design)
+
  return(list("fit"=fit, "design"=design, "y"=y))
  }
  """
@@ -282,9 +289,7 @@ def _try_imports():
  except ModuleNotFoundError:
  message = (
  "edger_pseudobulk requires rpy2 and anndata2ri to be installed.\n"
- "or\n"
- "$ pip install rpy2 sclab-tools[r]\n"
- "or\n"
+ "please install with one of the following:\n"
  "$ pip install rpy2 anndata2ri\n"
  "or\n"
  "$ conda install -c conda-forge rpy2 anndata2ri\n"
@@ -18,6 +18,8 @@ def aggregate_and_filter(
  min_cells_per_group: int = 30,
  bootstrap_sampling: bool = False,
  use_cells: dict[str, list[str]] | None = None,
+ make_stats: bool = True,
+ make_dummies: bool = True,
  ) -> AnnData:
  """
  Aggregate and filter cells in an AnnData object into cell populations.
@@ -40,6 +42,10 @@ def aggregate_and_filter(
  Whether to use bootstrap sampling to create replicas. Defaults to False.
  use_cells : dict[str, list[str]], optional
  If not None, only use the specified cells. Defaults to None.
+ make_stats : bool, optional
+ Whether to create expression statistics for each group. Defaults to True.
+ make_dummies : bool, optional
+ Whether to make categorical columns into dummies. Defaults to True.
 
  Returns
  -------
@@ -61,7 +67,12 @@ def aggregate_and_filter(
  _prepare_categorical_column(adata, cell_identity_key)
  cell_identity_dtype = adata.obs[cell_identity_key].dtype
 
- var_dataframe = _create_var_dataframe(adata, layer, grouping_keys, groups_to_drop)
+ if make_stats:
+ var_dataframe = _create_var_dataframe(
+ adata, layer, grouping_keys, groups_to_drop
+ )
+ else:
+ var_dataframe = pd.DataFrame(index=adata.var_names)
 
  data = {}
  meta = {}
@@ -100,6 +111,7 @@ def aggregate_and_filter(
  data = pd.DataFrame(data).T
  meta = pd.DataFrame(meta).T
  meta["replica"] = meta["replica"].astype("category")
+ meta["replica_size"] = meta["replica_size"].astype(int)
  meta[group_key] = meta[group_key].astype(group_dtype)
  if cell_identity_key is not None:
  meta[cell_identity_key] = meta[cell_identity_key].astype(cell_identity_dtype)
@@ -110,7 +122,8 @@ def aggregate_and_filter(
  var=var_dataframe,
  )
 
- _join_dummies(aggr_adata, group_key)
+ if make_dummies:
+ _join_dummies(aggr_adata, group_key)
 
  return aggr_adata
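
The new make_stats and make_dummies flags let callers skip the per-group expression statistics and the dummy-encoded group columns; the periodic_genes module added above uses exactly this lighter path. A minimal sketch with a placeholder grouping column:

    from sclab.tools.utils import aggregate_and_filter

    aggregated = aggregate_and_filter(
        adata,
        "timepoint",           # placeholder grouping column
        replicas_per_group=1,
        make_stats=False,      # skip the per-group var statistics
        make_dummies=False,    # skip joining dummy-encoded group columns
    )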