sclab 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of sclab might be problematic. Click here for more details.

Files changed (53) hide show
  1. sclab/__init__.py +1 -1
  2. sclab/_sclab.py +7 -3
  3. sclab/dataset/_dataset.py +1 -1
  4. sclab/dataset/processor/_processor.py +19 -4
  5. sclab/examples/processor_steps/__init__.py +2 -0
  6. sclab/examples/processor_steps/_doublet_detection.py +68 -0
  7. sclab/examples/processor_steps/_integration.py +47 -20
  8. sclab/examples/processor_steps/_neighbors.py +24 -4
  9. sclab/examples/processor_steps/_pca.py +11 -6
  10. sclab/examples/processor_steps/_preprocess.py +14 -1
  11. sclab/examples/processor_steps/_qc.py +22 -6
  12. sclab/gui/__init__.py +0 -0
  13. sclab/gui/components/__init__.py +7 -0
  14. sclab/gui/components/_guided_pseudotime.py +482 -0
  15. sclab/gui/components/_transfer_metadata.py +186 -0
  16. sclab/methods/__init__.py +16 -0
  17. sclab/preprocess/__init__.py +19 -0
  18. sclab/preprocess/_cca.py +154 -0
  19. sclab/preprocess/_cca_integrate.py +109 -0
  20. sclab/preprocess/_filter_obs.py +42 -0
  21. sclab/preprocess/_harmony.py +421 -0
  22. sclab/preprocess/_harmony_integrate.py +53 -0
  23. sclab/preprocess/_normalize_weighted.py +61 -0
  24. sclab/preprocess/_subset.py +208 -0
  25. sclab/preprocess/_transfer_metadata.py +137 -0
  26. sclab/preprocess/_transform.py +82 -0
  27. sclab/preprocess/_utils.py +96 -0
  28. sclab/tools/__init__.py +0 -0
  29. sclab/tools/cellflow/__init__.py +0 -0
  30. sclab/tools/cellflow/density_dynamics/__init__.py +0 -0
  31. sclab/tools/cellflow/density_dynamics/_density_dynamics.py +349 -0
  32. sclab/tools/cellflow/pseudotime/__init__.py +0 -0
  33. sclab/tools/cellflow/pseudotime/_pseudotime.py +332 -0
  34. sclab/tools/cellflow/pseudotime/timeseries.py +226 -0
  35. sclab/tools/cellflow/utils/__init__.py +0 -0
  36. sclab/tools/cellflow/utils/density_nd.py +215 -0
  37. sclab/tools/cellflow/utils/interpolate.py +334 -0
  38. sclab/tools/cellflow/utils/smoothen.py +124 -0
  39. sclab/tools/cellflow/utils/times.py +55 -0
  40. sclab/tools/differential_expression/__init__.py +5 -0
  41. sclab/tools/differential_expression/_pseudobulk_edger.py +304 -0
  42. sclab/tools/differential_expression/_pseudobulk_helpers.py +277 -0
  43. sclab/tools/doublet_detection/__init__.py +5 -0
  44. sclab/tools/doublet_detection/_scrublet.py +64 -0
  45. sclab/tools/labeling/__init__.py +6 -0
  46. sclab/tools/labeling/sctype.py +233 -0
  47. sclab/utils/__init__.py +5 -0
  48. sclab/utils/_write_excel.py +510 -0
  49. {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/METADATA +6 -2
  50. sclab-0.3.1.dist-info/RECORD +82 -0
  51. sclab-0.2.5.dist-info/RECORD +0 -45
  52. {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/WHEEL +0 -0
  53. {sclab-0.2.5.dist-info → sclab-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,226 @@
1
+ from typing import Callable, NamedTuple
2
+
3
+ import numpy as np
4
+ from numpy.lib.stride_tricks import sliding_window_view
5
+ from numpy.typing import NDArray
6
+ from scipy.signal import find_peaks
7
+ from scipy.sparse import csr_matrix, issparse
8
+ from tqdm.auto import tqdm
9
+
10
+ from ..utils.interpolate import NDBSpline
11
+
12
+
13
def periodic_sliding_window(
    data: NDArray, t: NDArray, window_size: int, fn: Callable[..., NDArray]
) -> NDArray:
    """Apply a reduction over a centred sliding window with periodic wrapping.

    Rows of ``data`` are sorted by ``t``, padded on both sides with rows taken
    from the opposite end (so the sequence behaves as a closed loop), reduced
    window by window with ``fn``, and finally returned in the original row
    order.

    Parameters
    ----------
    data
        Array whose first axis indexes observations.
    t
        One value per observation, used only to order the rows.
    window_size
        Requested window length. Even values are bumped to the next odd
        integer so that every window is centred on an observation.
    fn
        Reduction called as ``fn(windows, axis=-1)``, e.g. ``np.mean``.

    Returns
    -------
    NDArray
        ``fn`` applied to each window, with the same leading shape as
        ``data`` and rows in the original (unsorted) order.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1 or ``data`` has no rows.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # force an odd window so that it is symmetric around its centre
    ws = window_size + ((window_size - 1) % 2)
    half = ws // 2

    order = np.argsort(t)
    inverse = np.argsort(order)

    sorted_data = np.asarray(data)[order]
    n_obs = sorted_data.shape[0]
    if n_obs == 0:
        raise ValueError("data must contain at least one observation")

    # Modular indexing wraps around as often as needed, so it also covers
    # ws == 1 (no padding at all) and inputs shorter than half a window.
    wrapped = np.arange(-half, n_obs + half) % n_obs
    padded = sorted_data[wrapped]

    # Sliding only along axis 0 appends the window as the last axis and keeps
    # every other axis intact, including axes of length one.
    windows = sliding_window_view(padded, ws, axis=0)
    return fn(windows, axis=-1)[inverse]
27
+
28
+
29
def equalization(
    times: NDArray,
    t_range: tuple[float, float],
    max_bins: int = 200,
    iterations: int = 10_000,
    tolerance: float = 0.02,
) -> NDArray:
    """Iteratively warp ``times`` so that their histogram becomes flatter.

    For each bin count ``n_bins`` in ``25, 50, ..., max_bins`` the times are
    histogrammed on slightly jittered bin edges. Every bin is then stretched
    by a factor between 1 and 1.1 proportional to its relative count, and the
    result is rescaled back onto ``t_range``. This repeats until the RMS
    deviation of the relative counts drops below ``tolerance`` or
    ``iterations`` passes have been made. In the latter case a message is
    printed and the next ``n_bins`` is tried.

    The bin-edge jitter uses an unseeded random generator, so results are not
    reproducible run to run.

    Parameters
    ----------
    times
        1D array of time values.
    t_range
        ``(t_min, t_max)`` interval spanned by the histogram.
    max_bins
        Largest number of bins used; values below 25 leave ``times`` as is.
    iterations
        Maximum number of warping passes per bin count.
    tolerance
        RMS threshold on the relative bin counts that counts as converged.

    Returns
    -------
    NDArray
        Warped times, in the same order as the input.

    Raises
    ------
    TypeError
        If ``times`` is not a numpy array.
    ValueError
        If ``times`` is not 1D, ``iterations`` is below 1, ``t_range`` is not
        increasing, or no time falls inside ``t_range``.
    """
    if not isinstance(times, np.ndarray):
        raise TypeError("times must be a numpy array")

    if times.ndim != 1:
        raise ValueError("times must be a 1D array")

    n_iterations = int(iterations)
    if n_iterations < 1:
        raise ValueError("iterations must be at least 1")

    t_min, t_max = t_range
    t_span = t_max - t_min
    if not t_span > 0:
        raise ValueError("t_range must satisfy t_min < t_max")

    scaled_times = times.copy()
    if scaled_times.size == 0:
        return scaled_times

    alpha = 0.1
    scale_offset = 1

    rng = np.random.default_rng()

    for n_bins in tqdm(np.arange(25, max_bins + 1, 25)):
        for _ in range(n_iterations):
            bins = np.linspace(t_min, t_max, n_bins + 1)
            # jitter the inner edges so that samples sitting exactly on an
            # edge do not always land in the same bin
            bins[1:-1] += rng.normal(0, t_span / n_bins / 100, bins[1:-1].size)
            counts, _ = np.histogram(scaled_times, bins=bins)
            if counts.max() == 0:
                raise ValueError("no times fall inside t_range")

            relative = counts / counts.max()
            rms = np.sqrt(np.mean((relative - relative.mean()) ** 2))
            if rms < tolerance:
                break

            scales = relative * alpha + scale_offset

            # new width and new left edge of every bin after stretching
            new_ends = np.cumsum(np.diff(bins) * scales)
            new_starts = np.concatenate(([0.0], new_ends[:-1]))
            total = new_ends[-1]

            # Bin membership is [start, end). Samples below the first edge or
            # at/above the last edge are extrapolated with the outermost bin,
            # so t == t_max maps onto t_max instead of being dropped.
            idx = np.searchsorted(bins, scaled_times, side="right") - 1
            idx = np.clip(idx, 0, n_bins - 1)
            warped = (scaled_times - bins[idx]) * scales[idx] + new_starts[idx]

            scaled_times = warped / total * t_span + t_min

        else:
            cnts_mean, cnts_max, cnts_min = counts.mean(), counts.max(), counts.min()
            print(
                f"Failed to converge. RMS: {rms}. "
                + f"({cnts_mean=:.2f}, {cnts_max=:.2f}, {cnts_min=:.2f})"
            )

    return scaled_times
92
+
93
+
94
+ def fit_trends(
95
+ X: NDArray | csr_matrix,
96
+ times: NDArray,
97
+ t_range: tuple[float, float],
98
+ periodic: bool,
99
+ grid_size: int = 128,
100
+ roughness: float | None = None,
101
+ zero_weight: float = 0.5,
102
+ window_width: float | None = None,
103
+ n_timesteps: int | None = None,
104
+ timestep_delta: float | None = None,
105
+ progress: bool = True,
106
+ ) -> None:
107
+ if issparse(X):
108
+ X = np.ascontiguousarray(X.todense())
109
+
110
+ tmin, tmax = t_range
111
+
112
+ mask = ~np.isnan(times)
113
+ t = times[mask]
114
+ X = X[mask]
115
+
116
+ F = NDBSpline(
117
+ grid_size=grid_size,
118
+ t_range=t_range,
119
+ periodic=periodic,
120
+ zero_weight=zero_weight,
121
+ roughness=roughness,
122
+ window_width=window_width,
123
+ )
124
+ F.fit(t, X, progress=progress)
125
+
126
+ eps = np.finfo(float).eps
127
+ SNR: NDArray
128
+ SNR = F(t).var(axis=0) / (X.var(axis=0) + eps)
129
+ SNR = SNR / SNR.max()
130
+
131
+ # x = np.linspace(*t_range, 10001)[:-1]
132
+ # peak_time = x[np.argmax(F(x), axis=0)]
133
+
134
+ if n_timesteps is not None and timestep_delta is not None:
135
+ raise ValueError("Cannot specify both n_timesteps and timestep_delta")
136
+ elif n_timesteps is None and timestep_delta is None:
137
+ # default
138
+ x = np.linspace(*t_range, 101)
139
+ elif n_timesteps is not None:
140
+ x = np.linspace(*t_range, n_timesteps)
141
+ elif timestep_delta is not None:
142
+ x = np.arange(tmin, tmax + timestep_delta, timestep_delta)
143
+
144
+ Y = F(x)
145
+
146
+ return x, Y
147
+
148
+
149
class SinglePeakResult(NamedTuple):
    """Per-feature result of ``find_single_peaks``.

    Every field has one entry per feature (column of the input matrix).
    """

    # time of the single accepted peak, NaN where none was accepted
    times: NDArray
    # fitted trend value at the accepted peak, NaN where none was accepted
    heights: NDArray
    # log2 score of the accepted peak, NaN where none was accepted
    scores: NDArray
    # scipy ``find_peaks`` properties per feature, ``{}`` where none accepted
    info: list[dict[str, NDArray]]
154
+
155
+
156
def find_single_peaks(
    X: NDArray,
    t: NDArray,
    t_range: tuple[float, float] = (0, 1),
    grid_size: int = 512,
    periodic: bool = True,
    zero_weight: float = 0.2,
    roughness: float = 2,
    n_timesteps: int = 201,
    width_range: tuple[float, float] = (0, 100),
    score_threshold: float = 2.5,
    progress: bool = True,
) -> SinglePeakResult:
    """Locate features whose fitted trend has exactly one strong peak.

    Each column of ``X`` is divided by the 99th percentile of ``X + 1`` for
    that column, and a trend is fitted with ``fit_trends`` on ``n_timesteps``
    grid points. Candidate peaks are detected with ``scipy.signal.find_peaks``
    (``prominence=0.05``, ``height=0``, ``width=width_range``). A candidate's
    score is its height divided by the median of the trend. A feature is
    reported only when exactly one candidate scores above ``score_threshold``.

    Parameters
    ----------
    X
        Observations-by-features matrix.
    t
        One time value per row of ``X``.
    t_range, grid_size, periodic, zero_weight, roughness, n_timesteps
        Forwarded to ``fit_trends``.
    width_range
        ``width`` argument for ``find_peaks``, in grid samples.
    score_threshold
        Minimum height / median ratio for a candidate to be kept.
    progress
        Show progress bars while fitting and while scanning features.

    Returns
    -------
    SinglePeakResult
        Peak time, peak height and log2 score per feature (NaN where the
        feature was not accepted), plus the ``find_peaks`` properties of all
        candidate peaks for accepted features (``{}`` otherwise).
    """
    X = X / np.percentile(X + 1, 99, axis=0, keepdims=True)
    x, Y = fit_trends(
        X,
        t,
        t_range=t_range,
        periodic=periodic,
        grid_size=grid_size,
        zero_weight=zero_weight,
        roughness=roughness,
        n_timesteps=n_timesteps,
        progress=progress,
    )

    n_features = X.shape[1]
    peak_times = np.full(n_features, np.nan)
    peak_heights = np.full(n_features, np.nan)
    peak_scores = np.full(n_features, np.nan)
    # one independent dict per feature; ``[{}] * n`` would alias a single one
    peak_info_data = [{} for _ in range(n_features)]

    idx_sequence = range(n_features)
    if progress:
        idx_sequence = tqdm(idx_sequence)

    for i in idx_sequence:
        y = Y[:, i]
        k, info = find_peaks(y, prominence=0.05, width=width_range, height=0)
        m = np.median(y)
        s = y[k] / m
        accepted = np.flatnonzero(s > score_threshold)
        if accepted.size == 1:
            # index into the candidate list of the one peak that passed; the
            # score must come from this peak, not from the first candidate
            j = accepted[0]
            peak_times[i] = x[k[j]]
            peak_heights[i] = y[k[j]]
            peak_scores[i] = np.log2(s[j])
            peak_info_data[i] = info

    return SinglePeakResult(peak_times, peak_heights, peak_scores, peak_info_data)
204
+
205
+
206
def piecewise_scaling(
    times: NDArray,
    t_range: tuple[float, float],
    start: float,
    end: float,
    new_end: float,
) -> NDArray:
    """Linearly remap ``times`` so that ``[start, end)`` becomes ``[start, new_end)``.

    The axis is handled in three half-open pieces:

    * ``[tmin, start)`` is copied unchanged,
    * ``[start, end)`` is mapped linearly onto ``[start, new_end)``,
    * ``[end, tmax)`` is mapped linearly onto ``[new_end, tmax)``.

    Values that fall in none of the pieces (including values equal to
    ``tmax`` and NaN) are returned as NaN.

    Parameters
    ----------
    times
        Time values to remap.
    t_range
        ``(tmin, tmax)`` limits of the time axis.
    start, end
        Limits of the interval to be resized.
    new_end
        Position that ``end`` is moved to.

    Returns
    -------
    NDArray
        Float array with the shape of ``times``.
    """
    lower, upper = t_range
    remapped = np.full(times.shape, np.nan)

    # leading piece: identity
    untouched = (times >= lower) & (times < start)
    remapped[untouched] = times[untouched]

    # (old_lo, old_hi, new_lo, new_hi) for the two linearly remapped pieces
    pieces = (
        (start, end, start, new_end),
        (end, upper, new_end, upper),
    )
    for old_lo, old_hi, new_lo, new_hi in pieces:
        inside = (times >= old_lo) & (times < old_hi)
        remapped[inside] = (times[inside] - old_lo) / (old_hi - old_lo) * (
            new_hi - new_lo
        ) + new_lo

    return remapped
File without changes
@@ -0,0 +1,215 @@
1
+ from itertools import product
2
+ from typing import Literal, NamedTuple
3
+
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ from numpy.typing import NDArray
7
+ from scipy.integrate import trapezoid
8
+ from scipy.interpolate import BSpline
9
+ from sklearn.neighbors import KernelDensity
10
+
11
+ from .interpolate import fit_smoothing_spline
12
+
13
+
14
class DensityResult(NamedTuple):
    """Output of ``density_nd``: the fitted estimator and its sampled density."""

    # fitted kernel density estimator
    kde: KernelDensity
    # number of grid points along each dimension
    grid_size: tuple[int, ...]
    # (lower, upper) limits of the grid for each dimension
    bounds: tuple[tuple[float, float], ...]
    # flattened grid coordinates, one row per grid point
    grid: NDArray
    # density at each row of ``grid``, already divided by ``scale``
    density: NDArray
    # normalisation constant applied to the density (1 when not normalised)
    scale: float
    # whether the data were replicated periodically before fitting
    periodic: bool
22
+
23
+
24
def density_nd(
    data: NDArray,
    bandwidth: float | Literal["scott", "silverman"] | None = None,
    algorithm: Literal["kd_tree", "ball_tree", "auto"] = "auto",
    kernel: str = "gaussian",
    metric: str = "euclidean",
    grid_size: tuple | None = None,
    max_grid_size: int = 2**5 + 1,
    periodic: bool = False,
    bounds: tuple[tuple[float, float], ...] | None = None,
    normalize: bool = False,
) -> DensityResult:
    """Estimate a kernel density and sample it on a regular grid.

    Parameters
    ----------
    data
        Samples-by-dimensions array; a 1D array is treated as one dimension.
    bandwidth
        Bandwidth for ``KernelDensity``. Defaults to the largest bound span
        divided by 64.
    algorithm, kernel, metric
        Forwarded unchanged to ``KernelDensity``.
    grid_size
        Number of grid points per dimension. When omitted, the widest
        dimension gets ``max_grid_size`` points and the others a proportional
        share (at least 2).
    max_grid_size
        Grid resolution of the widest dimension when ``grid_size`` is omitted.
    periodic
        If true, the data are replicated with shifts of -1, 0 and +1 bound
        spans along every dimension before fitting.
    bounds
        ``(lower, upper)`` per dimension. When omitted, the data range padded
        by 10 % on each side is used. Required when ``periodic`` is true.
    normalize
        If true, divide the sampled density by its integral over the grid
        (trapezoid rule in 1D, left-corner Riemann sum otherwise).

    Returns
    -------
    DensityResult
        Fitted estimator, grid description, sampled density and the scale
        that the density was divided by.

    Raises
    ------
    ValueError
        If ``periodic`` is true without ``bounds``, or if ``bounds`` or
        ``grid_size`` do not have one entry per dimension.
    """
    if data.ndim == 1:
        data = data.reshape(-1, 1)

    nsamples, ndims = data.shape
    if bounds is None:
        if periodic:
            raise ValueError("bounds must be specified if periodic=True")
        lower, upper = data.min(axis=0), data.max(axis=0)
        margins = (upper - lower) / 10
        bounds = tuple(zip(lower - margins, upper + margins))
    if len(bounds) != ndims:
        raise ValueError("must provide bounds for each dimension")

    # width of every dimension, shape (ndims,)
    spans = np.diff(bounds).ravel()

    if periodic:
        # all combinations of shifting each dimension by -1, 0 or +1 span
        offsets = np.array(list(product([-1, 0, 1], repeat=ndims))) * spans
        dat = np.empty((nsamples * 3**ndims, ndims))
        for i, offset in enumerate(offsets):
            dat[i * nsamples : (i + 1) * nsamples] = data + offset[None, :]
    else:
        dat = data

    if bandwidth is None:
        bandwidth = spans.max() / 64

    kde = KernelDensity(
        bandwidth=bandwidth,
        algorithm=algorithm,
        kernel=kernel,
        metric=metric,
    )
    kde.fit(dat)

    if grid_size is None:
        rel_span = spans / spans.max()
        # at least two points per axis, so the grid is never empty and the
        # cell size used for normalisation is always defined
        grid_size = tuple(max(int(r * max_grid_size), 2) for r in rel_span)
    elif len(grid_size) != ndims:
        raise ValueError("must provide grid_size for each dimension")

    grid = np.meshgrid(
        *[np.linspace(*b, n) for b, n in zip(bounds, grid_size)], indexing="ij"
    )
    grid = np.vstack([x.ravel() for x in grid]).T
    d = np.exp(kde.score_samples(grid))

    if not normalize:
        scale = 1
    elif ndims == 1:
        scale = trapezoid(d, grid.reshape(-1))
    else:
        # simple Riemann sum for higher dimensions, using the left corner of
        # every cell (i.e. dropping the last grid point along each axis)
        deltas = spans / (np.array(grid_size) - 1)
        left_corners = tuple(slice(0, s - 1) for s in grid_size)
        scale = d.reshape(grid_size)[left_corners].sum() * np.prod(deltas)

    d /= scale

    return DensityResult(kde, grid_size, bounds, grid, d, scale, periodic)
95
+
96
+
97
def fit_density_1d(
    times: NDArray[np.floating],
    t_range: tuple[float, float],
    periodic: bool,
    bandwidth: float | None = None,
    algorithm: str = "auto",
    kernel: str = "gaussian",
    metric: str = "euclidean",
    max_grid_size: int = 2**8 + 1,
    lam: float = 1e-5,
) -> tuple[DensityResult, BSpline]:
    """Estimate a normalised 1D density of ``times`` and fit a spline to it.

    Only times inside the closed interval ``t_range`` are used. The density
    is computed with ``density_nd`` on ``t_range`` and then passed to
    ``fit_smoothing_spline``.

    Parameters
    ----------
    times
        Time values.
    t_range
        ``(tmin, tmax)`` interval used as the density bounds.
    periodic
        Forwarded to both ``density_nd`` and ``fit_smoothing_spline``.
    bandwidth
        Kernel bandwidth; defaults to 1/64 of the span of ``t_range``.
    algorithm, kernel, metric, max_grid_size
        Forwarded unchanged to ``density_nd``.
    lam
        Forwarded to ``fit_smoothing_spline``.

    Returns
    -------
    tuple[DensityResult, BSpline]
        The sampled density and the spline fitted to it.
    """
    lower, upper = t_range

    inside = (lower <= times) & (times <= upper)
    kept = times[inside]

    kde_bandwidth = (upper - lower) / 64 if bandwidth is None else bandwidth

    density = density_nd(
        kept.reshape(-1, 1),
        bandwidth=kde_bandwidth,
        algorithm=algorithm,
        kernel=kernel,
        metric=metric,
        max_grid_size=max_grid_size,
        periodic=periodic,
        bounds=(t_range,),
        normalize=True,
    )

    spline = fit_smoothing_spline(
        density.grid[:, 0],
        density.density,
        t_range,
        lam=lam,
        periodic=periodic,
    )

    return density, spline
138
+
139
+
140
def density_result_1d(
    rslt: DensityResult,
    data: NDArray | None = None,
    density_fit_lam: float = 1e-6,
    plot_density: bool = False,
    plot_density_fit: bool = True,
    plot_density_fit_derivative: bool = False,
    plot_histogram: bool = False,
    histogram_nbins: int = 50,
    ax: plt.Axes | None = None,
    show: bool = True,
) -> plt.Axes | None:
    """Plot a 1D density result, optionally with a spline fit and histogram.

    Parameters
    ----------
    rslt
        1D result from ``density_nd``.
    data
        Raw samples; required when ``plot_histogram`` is true.
    density_fit_lam
        ``lam`` passed to ``fit_smoothing_spline`` for the plotted fit.
    plot_density
        Draw the sampled density as a thin black line.
    plot_density_fit
        Draw the smoothing-spline fit in blue.
    plot_density_fit_derivative
        Draw the derivative of the fit in red, on a twin y axis when anything
        else is drawn.
    plot_histogram
        Draw a periodically wrapped histogram of ``data``.
    histogram_nbins
        Number of histogram bins over the plotted range.
    ax
        Axes to draw on; a new 10x3 inch figure is created when omitted.
    show
        Call ``plt.show()`` when true; otherwise return the current axes.

    Returns
    -------
    plt.Axes | None
        The current axes when ``show`` is false, otherwise ``None``.

    Raises
    ------
    ValueError
        If every plotting option is false, or if ``plot_histogram`` is true
        but ``data`` is missing.
    """
    if not (
        plot_density
        or plot_density_fit
        or plot_density_fit_derivative
        or plot_histogram
    ):
        raise ValueError("At least one of the plotting options must be True")
    if plot_histogram and data is None:
        raise ValueError("data must be provided if plot_histogram=True")

    tmin, tmax = rslt.grid.min(), rslt.grid.max()
    tspan = tmax - tmin
    bspl = fit_smoothing_spline(
        rslt.grid[:, 0],
        rslt.density,
        t_range=(tmin, tmax),
        lam=density_fit_lam,
        periodic=rslt.periodic,
    )
    if ax is None:
        plt.figure(figsize=(10, 3))
    else:
        plt.sca(ax)

    ax = plt.gca()
    if plot_density:
        ax.plot(rslt.grid.flatten(), rslt.density, color="black", linewidth=0.5)

    if plot_histogram:
        # The samples are replicated one span to the left and right so that
        # the first and last bin are not cut by the boundary and the two ends
        # agree (should be periodic). Shifting by the span rather than by
        # tmax keeps this correct when tmin != 0.
        tt = np.concatenate([data - tspan, data, data + tspan])
        bins = np.linspace(tmin - tspan, tmax + tspan, histogram_nbins * 3 + 1)
        dd = np.histogram(tt, bins=bins, density=True)[0]
        # we take the middle points of the bins
        xx = bins[:-1] + np.diff(bins) / 2
        # we recover the original time range and corresponding density
        x = xx[histogram_nbins : 2 * histogram_nbins]
        d = dd[histogram_nbins : 2 * histogram_nbins] * 3  # correct the density
        ax.bar(x, d, width=tspan / histogram_nbins, fill=False, linewidth=0.5)

    x = np.linspace(tmin, tmax, 2**10 + 1)
    if plot_density_fit:
        plt.plot(x, bspl(x), color="blue")

    ax.set_ylabel("Density", color="blue")
    ax.set_yticks([])

    ymin, ymax = plt.ylim()
    # add a bit of padding in the y axis (about 10% of current range)
    plt.ylim(ymin, ymax + 0.10 * (ymax - ymin))

    if plot_density_fit_derivative:
        # only open a second y axis when something else shares the plot
        if plot_density or plot_histogram or plot_density_fit:
            plt.twinx()
        plt.plot(x, bspl.derivative()(x), color="red")
        plt.hlines(0, tmin, tmax, linestyles="dashed", linewidth=0.5, color="black")
        plt.ylabel("Derivative", color="red")
        plt.gca().set_yticks([])

        # add padding in the y axis to make zero be in the middle
        ymax = np.abs(plt.ylim()).max() * 1.05
        plt.ylim(-ymax, ymax)

    if show:
        plt.show()
    else:
        return plt.gca()