M3Drop 0.4.42.tar.gz → 0.4.44.tar.gz

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: M3Drop
-Version: 0.4.42
+Version: 0.4.44
 Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
 Home-page: https://github.com/PragalvhaSharma/m3DropNew
 Author: Tallulah Andrews
@@ -21,12 +21,14 @@ Requires-Dist: matplotlib-venn>=0.11
 Requires-Dist: memory_profiler>=0.60.0
 Requires-Dist: numpy>=1.21.0
 Requires-Dist: pandas>=1.5.0
-Requires-Dist: py-cpuinfo
 Requires-Dist: scanpy>=1.9.0
 Requires-Dist: scikit-learn>=1.0.0
 Requires-Dist: scipy>=1.8.0
 Requires-Dist: seaborn>=0.11.0
 Requires-Dist: statsmodels>=0.13.0
+Requires-Dist: numba>=0.57.0
+Requires-Dist: psutil>=5.9.0
+Requires-Dist: py-cpuinfo
 Provides-Extra: gpu
 Requires-Dist: cupy-cuda12x; extra == "gpu"
 Dynamic: author
@@ -7,10 +7,10 @@ M3Drop.egg-info/SOURCES.txt
 M3Drop.egg-info/dependency_links.txt
 M3Drop.egg-info/requires.txt
 M3Drop.egg-info/top_level.txt
-m3Drop/__init__.py
-m3Drop/coreCPU.py
-m3Drop/coreGPU.py
-m3Drop/diagnosticsCPU.py
-m3Drop/diagnosticsGPU.py
-m3Drop/normalizationCPU.py
-m3Drop/normalizationGPU.py
+m3Drop/CoreCPU.py
+m3Drop/CoreGPU.py
+m3Drop/DiagnosticsCPU.py
+m3Drop/DiagnosticsGPU.py
+m3Drop/NormalizationCPU.py
+m3Drop/NormalizationGPU.py
+m3Drop/__init__.py
@@ -5,12 +5,14 @@ matplotlib-venn>=0.11
 memory_profiler>=0.60.0
 numpy>=1.21.0
 pandas>=1.5.0
-py-cpuinfo
 scanpy>=1.9.0
 scikit-learn>=1.0.0
 scipy>=1.8.0
 seaborn>=0.11.0
 statsmodels>=0.13.0
+numba>=0.57.0
+psutil>=5.9.0
+py-cpuinfo
 
 [gpu]
 cupy-cuda12x
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: M3Drop
-Version: 0.4.42
+Version: 0.4.44
 Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
 Home-page: https://github.com/PragalvhaSharma/m3DropNew
 Author: Tallulah Andrews
@@ -21,12 +21,14 @@ Requires-Dist: matplotlib-venn>=0.11
 Requires-Dist: memory_profiler>=0.60.0
 Requires-Dist: numpy>=1.21.0
 Requires-Dist: pandas>=1.5.0
-Requires-Dist: py-cpuinfo
 Requires-Dist: scanpy>=1.9.0
 Requires-Dist: scikit-learn>=1.0.0
 Requires-Dist: scipy>=1.8.0
 Requires-Dist: seaborn>=0.11.0
 Requires-Dist: statsmodels>=0.13.0
+Requires-Dist: numba>=0.57.0
+Requires-Dist: psutil>=5.9.0
+Requires-Dist: py-cpuinfo
 Provides-Extra: gpu
 Requires-Dist: cupy-cuda12x; extra == "gpu"
 Dynamic: author
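
With 0.4.44, numba and psutil become new hard runtime dependencies (py-cpuinfo is merely reordered, not newly added), while CUDA support stays behind the optional gpu extra via cupy-cuda12x. A minimal sketch of how downstream code might probe for that extra before choosing a backend; the HAS_GPU flag is illustrative, and only the dependency names come from the metadata above:

```python
# Sketch (not part of the package): detect whether the optional "gpu" extra
# is installed before selecting a code path. HAS_GPU is a hypothetical name.
try:
    import cupy  # provided by: pip install "M3Drop[gpu]" (cupy-cuda12x)
    HAS_GPU = True
except ImportError:
    HAS_GPU = False  # fall back to the Numba-accelerated CPU modules
```
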
@@ -0,0 +1,510 @@
+import time
+import psutil
+import h5py
+import numpy as np
+import anndata
+import pandas as pd
+import os
+import sys
+import pickle
+
+# [OPTIMIZATION] Use Numba for near-C++ speed on CPU
+try:
+    import numba
+    from numba import jit, prange
+except ImportError:
+    print("CRITICAL ERROR: 'numba' not found. Please install it (pip install numba) for CPU optimization.")
+    sys.exit(1)
+
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+from scipy.stats import norm
+from scipy import sparse
+from statsmodels.stats.multitest import multipletests
+
+# [REFACTOR] Relative Import
+try:
+    from .ControlDeviceCPU import ControlDevice
+except ImportError:
+    # Fallback for running script directly
+    from ControlDeviceCPU import ControlDevice
+
+# ==========================================
+# NUMBA KERNELS (CPU OPTIMIZED)
+# ==========================================
+
+@jit(nopython=True, cache=True)
+def nan_replace_cpu(x):
+    """Replaces NaNs with 0 and Infs with 0 or 1."""
+    flat = x.ravel()
+    for i in range(flat.size):
+        val = flat[i]
+        if np.isnan(val):
+            flat[i] = 0.0
+        elif np.isinf(val):
+            flat[i] = 1.0 if val > 0 else 0.0
+    return x.reshape(x.shape)
+
+@jit(nopython=True, parallel=True, fastmath=True)
+def dropout_prob_kernel_cpu(tj, ti, total, exp_size, out_matrix):
+    """
+    Calculates dropout probabilities using Negative Binomial logic.
+    Parallelized across CPU cores.
+    """
+    rows = out_matrix.shape[0]
+    cols = out_matrix.shape[1]
+
+    # Numba handles the broadcasting loops explicitly for max speed
+    for r in prange(rows):
+        ti_val = ti[r]
+        for c in range(cols):
+            mu = (tj[c] * ti_val) / total
+            size_val = exp_size[c]
+
+            base = (mu / size_val) + 1.0
+            if base < 1e-12:
+                base = 1e-12
+
+            # pow(base, -size_val)
+            val = base ** (-size_val)
+
+            if np.isnan(val):
+                out_matrix[r, c] = 0.0
+            elif np.isinf(val):
+                out_matrix[r, c] = 1.0 if val > 0 else 0.0
+            else:
+                out_matrix[r, c] = val
+
+@jit(nopython=True, cache=True)
+def dropout_variance_inplace_cpu(p):
+    """Calculates variance p * (1 - p) in-place."""
+    flat = p.ravel()
+    for i in range(flat.size):
+        val = flat[i]
+        flat[i] = val - (val * val)
+
+# ==========================================
+# STAGE 1: MASK GENERATION
+# ==========================================
+
+def ConvertDataSparseCPU(input_filename: str, output_mask_filename: str, mode: str = "auto", manual_target: int = 3000):
+    start_time = time.perf_counter()
+    print(f"FUNCTION: ConvertDataSparseCPU() | FILE: {input_filename}")
+
+    device = ControlDevice.from_h5ad(input_filename, mode=mode, manual_target=manual_target)
+    n_cells = device.total_rows
+    n_genes = device.n_genes
+
+    with h5py.File(input_filename, 'r') as f_in:
+        x_group_in = f_in['X']
+        print(f"Phase [1/1]: identifying expressed genes...")
+        genes_to_keep_mask = np.zeros(n_genes, dtype=bool)
+
+        h5_indptr = x_group_in['indptr']
+        h5_indices = x_group_in['indices']
+
+        current_row = 0
+        while current_row < n_cells:
+            # Overhead 1.0 is fine for sparse scan on CPU
+            end_row = device.get_next_chunk(current_row, mode='sparse', overhead_multiplier=1.0)
+            if end_row is None or end_row <= current_row: break
+
+            chunk_size = end_row - current_row
+            print(f"Phase [1/1]: Scanning rows {end_row} of {n_cells} | Chunk: {chunk_size}", end='\r')
+
+            start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+            if start_idx == end_idx:
+                current_row = end_row
+                continue
+
+            indices = h5_indices[start_idx:end_idx]
+            unique_indices = np.unique(indices)
+            genes_to_keep_mask[unique_indices] = True
+
+            current_row = end_row
+
+    n_genes_to_keep = int(np.sum(genes_to_keep_mask))
+    print(f"\nPhase [1/1]: COMPLETE | Result: {n_genes_to_keep} / {n_genes} genes retained.")
+
+    print(f"Saving mask to {output_mask_filename}...")
+    with open(output_mask_filename, 'wb') as f:
+        pickle.dump(genes_to_keep_mask, f)
+
+    end_time = time.perf_counter()
+    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+# ==========================================
+# STAGE 2: STATISTICS
+# ==========================================
+
+def hidden_calc_valsCPU(filename: str, mask_filename: str, mode: str = "auto", manual_target: int = 3000) -> dict:
+    start_time = time.perf_counter()
+    print(f"FUNCTION: hidden_calc_valsCPU() | FILE: {filename}")
+
+    # 1. Load Mask
+    with open(mask_filename, 'rb') as f: mask = pickle.load(f)
+    ng_filtered = int(np.sum(mask))
+
+    # 2. Init Device
+    with h5py.File(filename, 'r') as f:
+        indptr_cpu = f['X']['indptr'][:]
+        total_rows = len(indptr_cpu) - 1
+
+    device = ControlDevice(
+        indptr=indptr_cpu,
+        total_rows=total_rows,
+        n_genes=ng_filtered,
+        mode=mode,
+        manual_target=manual_target
+    )
+    nc = device.total_rows
+
+    adata_meta = anndata.read_h5ad(filename, backed='r')
+    tis = np.zeros(nc, dtype='float64')
+    cell_non_zeros = np.zeros(nc, dtype='int64')
+    tjs = np.zeros(ng_filtered, dtype=np.float64)
+    gene_non_zeros = np.zeros(ng_filtered, dtype=np.int32)
+
+    print("Phase [1/2]: Calculating statistics...")
+    with h5py.File(filename, 'r') as f_in:
+        x_group = f_in['X']
+        h5_indptr = x_group['indptr']
+        h5_data = x_group['data']
+        h5_indices = x_group['indices']
+
+        current_row = 0
+        while current_row < nc:
+            end_row = device.get_next_chunk(current_row, mode='sparse', overhead_multiplier=1.1)
+            if end_row is None or end_row <= current_row: break
+
+            chunk_size = end_row - current_row
+            print(f"Phase [1/2]: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+            start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+            data = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
+            indices = np.array(h5_indices[start_idx:end_idx])
+            indptr = np.array(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
+
+            # Use Scipy CSR for CPU operations
+            chunk_csr = sparse.csr_matrix((data, indices, indptr), shape=(chunk_size, len(mask)))
+
+            # --- VIRTUAL FILTER + CEIL ---
+            chunk_csr = chunk_csr[:, mask]
+            chunk_csr.data = np.ceil(chunk_csr.data)
+            # -----------------------------
+
+            tis[current_row:end_row] = np.array(chunk_csr.sum(axis=1)).flatten()
+            cell_non_zeros[current_row:end_row] = np.diff(chunk_csr.indptr)
+
+            # Numpy 'add.at' equivalent for sparse accumulation
+            np.add.at(tjs, chunk_csr.indices, chunk_csr.data)
+
+            unique_indices, counts = np.unique(chunk_csr.indices, return_counts=True)
+            np.add.at(gene_non_zeros, unique_indices, counts)
+
+            current_row = end_row
+
+    print(f"\nPhase [1/2]: COMPLETE{' ' * 50}")
+
+    print("Phase [2/2]: Finalizing stats...")
+    dis = ng_filtered - cell_non_zeros
+    djs = nc - gene_non_zeros
+    total = tjs.sum()
+    print("Phase [2/2]: COMPLETE")
+
+    end_time = time.perf_counter()
+    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+    filtered_var_index = adata_meta.var.index[mask]
+
+    return {
+        "tis": pd.Series(tis, index=adata_meta.obs.index),
+        "tjs": pd.Series(tjs, index=filtered_var_index),
+        "dis": pd.Series(dis, index=adata_meta.obs.index),
+        "djs": pd.Series(djs, index=filtered_var_index),
+        "total": total,
+        "nc": nc,
+        "ng": ng_filtered
+    }
+
+def NBumiFitModelCPU(raw_filename: str, mask_filename: str, stats: dict, mode: str = "auto", manual_target: int = 3000) -> dict:
+    start_time = time.perf_counter()
+    print(f"FUNCTION: NBumiFitModelCPU() | FILE: {raw_filename}")
+
+    with open(mask_filename, 'rb') as f: mask = pickle.load(f)
+    ng_filtered = stats['ng']
+
+    with h5py.File(raw_filename, 'r') as f:
+        indptr_cpu = f['X']['indptr'][:]
+        total_rows = len(indptr_cpu) - 1
+    device = ControlDevice(indptr=indptr_cpu, total_rows=total_rows, n_genes=ng_filtered, mode=mode, manual_target=manual_target)
+    nc = device.total_rows
+
+    tjs = stats['tjs'].values
+    tis = stats['tis'].values
+    total = stats['total']
+
+    # Numpy arrays
+    sum_x_sq = np.zeros(ng_filtered, dtype=np.float64)
+    sum_2xmu = np.zeros(ng_filtered, dtype=np.float64)
+
+    print("Phase [1/3]: Pre-calculating sum of squared expectations...")
+    sum_tis_sq = np.sum(tis**2)
+    sum_mu_sq = (tjs**2 / total**2) * sum_tis_sq
+    print("Phase [1/3]: COMPLETE")
+
+    print("Phase [2/3]: Calculating variance components...")
+    with h5py.File(raw_filename, 'r') as f_in:
+        x_group = f_in['X']
+        h5_indptr = x_group['indptr']
+        h5_data = x_group['data']
+        h5_indices = x_group['indices']
+
+        current_row = 0
+        while current_row < nc:
+            # L3 optimization is critical here for CPU performance
+            end_row = device.get_next_chunk(current_row, mode='sparse', overhead_multiplier=1.1)
+            if end_row is None or end_row <= current_row: break
+
+            chunk_size = end_row - current_row
+            print(f"Phase [2/3]: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+            start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+            data = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
+            indices = np.array(h5_indices[start_idx:end_idx])
+            indptr = np.array(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
+
+            chunk_csr = sparse.csr_matrix((data, indices, indptr), shape=(chunk_size, len(mask)))
+            chunk_csr = chunk_csr[:, mask]
+            chunk_csr.data = np.ceil(chunk_csr.data)
+
+            # Accumulate X^2
+            np.add.at(sum_x_sq, chunk_csr.indices, chunk_csr.data**2)
+
+            # Vectorized term calculation for 2 * x * mu
+            # To avoid expanding dense matrices, we iterate over CSR structure manually or use broadcasting
+            # For CPU, iterating over the non-zeros is efficient enough
+
+            # Map row indices to global cell indices
+            row_indices = np.repeat(np.arange(chunk_size), np.diff(chunk_csr.indptr)) + current_row
+            global_tis = tis[row_indices]
+
+            term_vals = 2 * chunk_csr.data * tjs[chunk_csr.indices] * global_tis / total
+            np.add.at(sum_2xmu, chunk_csr.indices, term_vals)
+
+            current_row = end_row
+
+    print(f"\nPhase [2/3]: COMPLETE {' ' * 50}")
+
+    print("Phase [3/3]: Finalizing dispersion...")
+    sum_sq_dev = sum_x_sq - sum_2xmu + sum_mu_sq
+    var_obs = sum_sq_dev / (nc - 1)
+
+    sizes = np.full(ng_filtered, 10000.0)
+    numerator = (tjs**2 / total**2) * sum_tis_sq
+    denominator = sum_sq_dev - tjs
+
+    stable_mask = denominator > 1e-6
+    sizes[stable_mask] = numerator[stable_mask] / denominator[stable_mask]
+    sizes[sizes <= 0] = 10000.0
+
+    print("Phase [3/3]: COMPLETE")
+
+    end_time = time.perf_counter()
+    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+    return {
+        'var_obs': pd.Series(var_obs, index=stats['tjs'].index),
+        'sizes': pd.Series(sizes, index=stats['tjs'].index),
+        'vals': stats
+    }
+
+def NBumiFitDispVsMeanCPU(fit: dict, suppress_plot=True):
+    vals = fit['vals']
+    size_g = fit['sizes'].values
+    tjs = vals['tjs'].values
+    mean_expression = tjs / vals['nc']
+
+    forfit = (np.isfinite(size_g)) & (size_g < 1e6) & (mean_expression > 1e-3) & (size_g > 0)
+    log2_mean_expr = np.log2(mean_expression, where=(mean_expression > 0))
+
+    higher = log2_mean_expr > 4
+    if np.sum(higher & forfit) > 2000:
+        forfit = higher & forfit
+
+    y = np.log(size_g[forfit])
+    x = np.log(mean_expression[forfit])
+
+    X = sm.add_constant(x)
+    model = sm.OLS(y, X).fit()
+
+    if not suppress_plot:
+        plt.figure(figsize=(7, 6))
+        plt.scatter(x, y, alpha=0.5, s=1)
+        plt.plot(x, model.fittedvalues, color='red')
+        plt.show()
+
+    return model.params
+
+def NBumiFeatureSelectionHighVarCPU(fit: dict) -> pd.DataFrame:
+    start_time = time.perf_counter()
+    print(f"FUNCTION: NBumiFeatureSelectionHighVarCPU()")
+
+    vals = fit['vals']
+    coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
+    mean_expression = vals['tjs'].values / vals['nc']
+
+    with np.errstate(divide='ignore', invalid='ignore'):
+        log_mean_expression = np.log(mean_expression)
+        log_mean_expression[np.isneginf(log_mean_expression)] = 0
+        exp_size = np.exp(coeffs[0] + coeffs[1] * log_mean_expression)
+        res = np.log(fit['sizes'].values) - np.log(exp_size)
+
+    results_df = pd.DataFrame({'Gene': fit['sizes'].index, 'Residual': res})
+    final_table = results_df.sort_values(by='Residual', ascending=True)
+
+    end_time = time.perf_counter()
+    print(f"Total time: {end_time - start_time:.4f} seconds.\n")
+    return final_table
+
+def NBumiFeatureSelectionCombinedDropCPU(
+    fit: dict,
+    raw_filename: str,
+    method="fdr_bh",
+    qval_thresh=0.05,
+    mode: str = "auto",
+    manual_target: int = 3000
+) -> pd.DataFrame:
+
+    start_time = time.perf_counter()
+    print(f"FUNCTION: NBumiFeatureSelectionCombinedDropCPU() | FILE: {raw_filename}")
+
+    ng_filtered = fit['vals']['ng']
+
+    with h5py.File(raw_filename, 'r') as f:
+        indptr_cpu = f['X']['indptr'][:]
+        total_rows = len(indptr_cpu) - 1
+    device = ControlDevice(indptr=indptr_cpu, total_rows=total_rows, n_genes=ng_filtered, mode=mode, manual_target=manual_target)
+    nc = device.total_rows
+
+    print("Phase [1/3]: Initializing arrays...")
+    vals = fit['vals']
+    coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
+
+    tjs = vals['tjs'].values
+    tis = vals['tis'].values
+    total = vals['total']
+
+    mean_expression = vals['tjs'].values / nc
+    with np.errstate(divide='ignore'):
+        exp_size = np.exp(coeffs[0] + coeffs[1] * np.log(mean_expression))
+
+    # Pre-allocate accumulators
+    p_sum = np.zeros(ng_filtered, dtype=np.float64)
+    p_var_sum = np.zeros(ng_filtered, dtype=np.float64)
+    print("Phase [1/3]: COMPLETE")
+
+    print("Phase [2/3]: Calculating dropout stats (Virtual)...")
+
+    current_row = 0
+    while current_row < nc:
+        # Dense mode allows Numba to rip through the data
+        end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=1.1)
+        if end_row is None or end_row <= current_row: break
+
+        chunk_size = end_row - current_row
+        print(f"Phase [2/3]: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+        tis_chunk = tis[current_row:end_row]
+        work_matrix = np.empty((chunk_size, ng_filtered), dtype=np.float64)
+
+        # CALL NUMBA KERNEL
+        dropout_prob_kernel_cpu(
+            tjs,
+            tis_chunk,
+            total,
+            exp_size,
+            work_matrix
+        )
+
+        p_sum += work_matrix.sum(axis=0)
+
+        # In-place variance calc
+        dropout_variance_inplace_cpu(work_matrix)
+        p_var_sum += work_matrix.sum(axis=0)
+
+        current_row = end_row
+
+    print(f"\nPhase [2/3]: COMPLETE {' ' * 50}")
+
+    print("Phase [3/3]: Statistical testing...")
+
+    droprate_exp = p_sum / nc
+    droprate_exp_err = np.sqrt(p_var_sum / (nc**2))
+    droprate_obs = vals['djs'].values / nc
+
+    diff = droprate_obs - droprate_exp
+    combined_err = np.sqrt(droprate_exp_err**2 + (droprate_obs * (1 - droprate_obs) / nc))
+
+    with np.errstate(divide='ignore', invalid='ignore'):
+        Zed = diff / combined_err
+
+    pvalue = norm.sf(Zed)
+
+    results_df = pd.DataFrame({'Gene': vals['tjs'].index, 'p.value': pvalue, 'effect_size': diff})
+    results_df = results_df.sort_values(by='p.value')
+
+    qval = multipletests(results_df['p.value'].fillna(1), method=method)[1]
+    results_df['q.value'] = qval
+    final_table = results_df[results_df['q.value'] < qval_thresh]
+
+    print("Phase [3/3]: COMPLETE")
+    end_time = time.perf_counter()
+    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+    return final_table[['Gene', 'effect_size', 'p.value', 'q.value']]
+
+def NBumiCombinedDropVolcanoCPU(results_df: pd.DataFrame, qval_thresh=0.05, effect_size_thresh=0.25, top_n_genes=10, suppress_plot=False, plot_filename=None):
+    start_time = time.perf_counter()
+    print(f"FUNCTION: NBumiCombinedDropVolcanoCPU()")
+
+    # Standard Matplotlib code - safe for CPU
+    df = results_df.copy()
+    if (df['q.value'] == 0).any():
+        non_zero_min = df[df['q.value'] > 0]['q.value'].min()
+        df['q.value'] = df['q.value'].replace(0, non_zero_min)
+
+    df['-log10_qval'] = -np.log10(df['q.value'])
+    df['color'] = 'grey'
+    df.loc[(df['q.value'] < qval_thresh) & (df['effect_size'] > effect_size_thresh), 'color'] = 'red'
+    df.loc[(df['q.value'] < qval_thresh) & (df['effect_size'] < -effect_size_thresh), 'color'] = 'blue'
+
+    plt.figure(figsize=(10, 8))
+    plt.scatter(df['effect_size'], df['-log10_qval'], c=df['color'], s=10, alpha=0.6, edgecolors='none')
+
+    plt.axvline(x=effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
+    plt.axvline(x=-effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
+    plt.axhline(y=-np.log10(qval_thresh), linestyle='--', color='grey', linewidth=0.8)
+
+    top_genes = df.nsmallest(top_n_genes, 'q.value')
+    for i, row in top_genes.iterrows():
+        plt.text(row['effect_size'], row['-log10_qval'], row['Gene'], fontsize=9, fontweight='bold')
+
+    plt.title('Volcano Plot: Dropout Rate vs Significance (CPU)')
+    plt.xlabel('Effect Size (Observed - Expected Dropout Rate)')
+    plt.ylabel('-log10 (FDR Adjusted p-value)')
+    plt.grid(True, linestyle='--', alpha=0.3)
+    ax = plt.gca()
+
+    if plot_filename:
+        print(f"Saving plot to: {plot_filename}")
+        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+
+    if not suppress_plot:
+        plt.show()
+
+    plt.close()
+
+    end_time = time.perf_counter()
+    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+    return ax
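
The file added above reads as a staged CPU pipeline: gene-mask generation, summary statistics, a negative-binomial dispersion fit, and dropout-based feature selection with an optional volcano plot. A minimal usage sketch, assuming the new module is the m3Drop/CoreCPU.py listed in SOURCES.txt; the input and output paths are hypothetical:

```python
# Usage sketch only: the import path assumes the new file is m3Drop/CoreCPU.py
# (per SOURCES.txt above); "counts.h5ad" and the output names are placeholders.
from m3Drop.CoreCPU import (
    ConvertDataSparseCPU,
    hidden_calc_valsCPU,
    NBumiFitModelCPU,
    NBumiFeatureSelectionCombinedDropCPU,
    NBumiCombinedDropVolcanoCPU,
)

raw = "counts.h5ad"     # hypothetical CSR-encoded AnnData file on disk
mask = "gene_mask.pkl"  # boolean gene mask written by stage 1

ConvertDataSparseCPU(raw, mask)                        # stage 1: expressed-gene mask
stats = hidden_calc_valsCPU(raw, mask)                 # stage 2: per-cell/per-gene totals
fit = NBumiFitModelCPU(raw, mask, stats)               # negative-binomial dispersion fit
hits = NBumiFeatureSelectionCombinedDropCPU(fit, raw)  # genes with excess dropouts
NBumiCombinedDropVolcanoCPU(hits, suppress_plot=True, plot_filename="volcano.png")
```

Each step re-reads the .h5ad file in chunks sized by ControlDevice, so the sequence above matches the "Phase [...]" progress logging printed inside each function.
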