M3Drop 0.4.42-py3-none-any.whl → 0.4.45-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
m3Drop/CoreGPU.py ADDED
@@ -0,0 +1,506 @@
+ import time
+ import psutil
+ import h5py
+ import numpy as np
+ import anndata
+ import pandas as pd
+ import os
+ import sys
+ import pickle
+
+ try:
+     import cupy
+     import cupy.sparse as csp
+     from cupy.sparse import csr_matrix as cp_csr_matrix
+     import cupyx
+ except ImportError:
+     print("CRITICAL ERROR: CuPy not found. This script requires a GPU.")
+     sys.exit(1)
+
+ import statsmodels.api as sm
+ import matplotlib.pyplot as plt
+ from scipy.stats import norm
+ from statsmodels.stats.multitest import multipletests
+
+ # Package-compatible import
+ from .ControlDeviceGPU import ControlDevice
+
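+ # Pipeline overview, as implemented below: ConvertDataSparseGPU writes a boolean gene mask to disk,
+ # hidden_calc_valsGPU streams the masked matrix to accumulate per-cell/per-gene totals and dropout
+ # counts, NBumiFitModelGPU estimates per-gene variance and NB size, and the NBumiFeatureSelection* /
+ # NBumiCombinedDropVolcanoGPU functions rank and plot genes from those fits.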
+ # ==========================================
+ # FUSED KERNELS
+ # ==========================================
+
+ nan_replace_kernel = cupy.ElementwiseKernel(
+     'float64 x', 'float64 out',
+     '''if (isnan(x)) { out = 0.0; } else if (isinf(x)) { out = (x > 0) ? 1.0 : 0.0; } else { out = x; }''',
+     'nan_replace_kernel'
+ )
+
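+ # Per (cell, gene) entry, the kernel below evaluates the negative-binomial probability of a zero,
+ # p = (1 + mu/size)^(-size) with mu = tj*ti/total, clamping the base to keep pow() numerically stable.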
+ dropout_prob_kernel = cupy.ElementwiseKernel(
+     'float64 tj, float64 ti, float64 total, float64 exp_size', 'float64 out',
+     '''
+     double mu = (tj * ti) / total;
+     double base = (mu / exp_size) + 1.0;
+     if (base < 1e-12) base = 1e-12;
+     out = pow(base, -exp_size);
+     if (isnan(out)) out = 0.0;
+     else if (isinf(out)) out = (out > 0) ? 1.0 : 0.0;
+     ''',
+     'dropout_prob_kernel'
+ )
+
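+ # Replaces each dropout probability p with its Bernoulli variance p*(1 - p), in place.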
+ dropout_variance_inplace_kernel = cupy.ElementwiseKernel(
+     'float64 p', 'float64 out',
+     ''' out = p - (p * p); ''',
+     'dropout_variance_inplace_kernel'
+ )
+
+ # ==========================================
+ # STAGE 1: MASK GENERATION
+ # ==========================================
+
+ def ConvertDataSparseGPU(input_filename: str, output_mask_filename: str, mode: str = "auto", manual_target: int = 3000):
+     """
+     Scans RAW data to identify genes with non-zero counts.
+     Saves a boolean mask to disk instead of rewriting the dataset.
+     """
+     start_time = time.perf_counter()
+     print(f"FUNCTION: ConvertDataSparseGPU() | FILE: {input_filename}")
+
+     # Standard init is fine here (we don't know ng yet)
+     device = ControlDevice.from_h5ad(input_filename, mode=mode, manual_target=manual_target)
+     n_cells = device.total_rows
+     n_genes = device.n_genes
+
+     with h5py.File(input_filename, 'r') as f_in:
+         x_group_in = f_in['X']
+         print(f"Phase [1/1]: identifying expressed genes...")
+         genes_to_keep_mask_gpu = cupy.zeros(n_genes, dtype=bool)
+
+         h5_indptr = x_group_in['indptr']
+         h5_indices = x_group_in['indices']
+
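+         # Only the CSR column indices are streamed per chunk: a gene counts as expressed as soon as
+         # its index appears in any row, so the stored data values are never read in this stage.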
+         current_row = 0
+         while current_row < n_cells:
+             end_row = device.get_next_chunk(current_row, mode='sparse', overhead_multiplier=1.1)
+             if end_row is None or end_row <= current_row: break
+
+             chunk_size = end_row - current_row
+             print(f"Phase [1/1]: Scanning rows {end_row} of {n_cells} | Chunk: {chunk_size}", end='\r')
+
+             start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+             if start_idx == end_idx:
+                 current_row = end_row
+                 continue
+
+             indices_gpu = cupy.asarray(h5_indices[start_idx:end_idx])
+             unique_gpu = cupy.unique(indices_gpu)
+             genes_to_keep_mask_gpu[unique_gpu] = True
+
+             del indices_gpu, unique_gpu
+             current_row = end_row
+
+     n_genes_to_keep = int(cupy.sum(genes_to_keep_mask_gpu))
+     print(f"\nPhase [1/1]: COMPLETE | Result: {n_genes_to_keep} / {n_genes} genes retained.")
+
+     print(f"Saving mask to {output_mask_filename}...")
+     mask_cpu = cupy.asnumpy(genes_to_keep_mask_gpu)
+     with open(output_mask_filename, 'wb') as f:
+         pickle.dump(mask_cpu, f)
+
+     end_time = time.perf_counter()
+     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+ # ==========================================
+ # STAGE 2: STATISTICS
+ # ==========================================
+
+ def hidden_calc_valsGPU(filename: str, mask_filename: str, mode: str = "auto", manual_target: int = 3000) -> dict:
+     start_time = time.perf_counter()
+     print(f"FUNCTION: hidden_calc_vals() | FILE: {filename}")
+
+     # 1. Load Mask
+     with open(mask_filename, 'rb') as f: mask_cpu = pickle.load(f)
+     mask_gpu = cupy.asarray(mask_cpu)
+     ng_filtered = int(cupy.sum(mask_gpu))
+
+     # 2. Manual Device Init (Crucial for VRAM logic)
+     with h5py.File(filename, 'r') as f:
+         indptr_cpu = f['X']['indptr'][:]
+         total_rows = len(indptr_cpu) - 1
+
+     device = ControlDevice(
+         indptr=indptr_cpu,
+         total_rows=total_rows,
+         n_genes=ng_filtered,  # Force device to see real data size
+         mode=mode,
+         manual_target=manual_target
+     )
+     nc = device.total_rows
+
+     adata_meta = anndata.read_h5ad(filename, backed='r')
+     tis = np.zeros(nc, dtype='int64')
+     cell_non_zeros = np.zeros(nc, dtype='int64')
+     tjs_gpu = cupy.zeros(ng_filtered, dtype=cupy.float64)
+     gene_non_zeros_gpu = cupy.zeros(ng_filtered, dtype=cupy.int32)
+
+     print("Phase [1/2]: Calculating statistics...")
+     with h5py.File(filename, 'r') as f_in:
+         x_group = f_in['X']
+         h5_indptr = x_group['indptr']
+         h5_data = x_group['data']
+         h5_indices = x_group['indices']
+
+         current_row = 0
+         while current_row < nc:
+             end_row = device.get_next_chunk(current_row, mode='sparse', overhead_multiplier=1.1)
+             if end_row is None or end_row <= current_row: break
+
+             chunk_size = end_row - current_row
+             print(f"Phase [1/2]: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+             start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+             data_gpu = cupy.asarray(h5_data[start_idx:end_idx], dtype=cupy.float64)
+             indices_gpu = cupy.asarray(h5_indices[start_idx:end_idx])
+             indptr_gpu = cupy.asarray(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
+
+             chunk_gpu = cp_csr_matrix((data_gpu, indices_gpu, indptr_gpu), shape=(chunk_size, len(mask_cpu)))
+
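+             # The boolean mask column-slices the chunk on the GPU (the Stage 1 mask stands in for a
+             # rewritten dataset), and ceil rounds the retained values up to integer counts.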
+             # --- VIRTUAL FILTER + CEIL ---
+             chunk_gpu = chunk_gpu[:, mask_gpu]
+             chunk_gpu.data = cupy.ceil(chunk_gpu.data)
+             # -----------------------------
+
+             tis[current_row:end_row] = chunk_gpu.sum(axis=1).get().flatten()
+             cell_non_zeros_chunk = cupy.diff(chunk_gpu.indptr)
+             cell_non_zeros[current_row:end_row] = cell_non_zeros_chunk.get()
+
+             cupy.add.at(tjs_gpu, chunk_gpu.indices, chunk_gpu.data)
+             unique_indices_gpu, counts_gpu = cupy.unique(chunk_gpu.indices, return_counts=True)
+             cupy.add.at(gene_non_zeros_gpu, unique_indices_gpu, counts_gpu)
+
+             del data_gpu, indices_gpu, indptr_gpu, chunk_gpu
+             cupy.get_default_memory_pool().free_all_blocks()
+             current_row = end_row
+
+     tjs = cupy.asnumpy(tjs_gpu)
+     gene_non_zeros = cupy.asnumpy(gene_non_zeros_gpu)
+     print(f"\nPhase [1/2]: COMPLETE{' ' * 50}")
+
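+     # dis/djs are dropout (zero) counts per cell and per gene; "total" is the grand sum of counts,
+     # which downstream code uses to form expected means mu_ij = tj*ti/total.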
+     print("Phase [2/2]: Finalizing stats...")
+     dis = ng_filtered - cell_non_zeros
+     djs = nc - gene_non_zeros
+     total = tjs.sum()
+     print("Phase [2/2]: COMPLETE")
+
+     end_time = time.perf_counter()
+     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+     filtered_var_index = adata_meta.var.index[mask_cpu]
+
+     return {
+         "tis": pd.Series(tis, index=adata_meta.obs.index),
+         "tjs": pd.Series(tjs, index=filtered_var_index),
+         "dis": pd.Series(dis, index=adata_meta.obs.index),
+         "djs": pd.Series(djs, index=filtered_var_index),
+         "total": total,
+         "nc": nc,
+         "ng": ng_filtered
+     }
+
+
+ def NBumiFitModelGPU(raw_filename: str, mask_filename: str, stats: dict, mode: str = "auto", manual_target: int = 3000) -> dict:
+     start_time = time.perf_counter()
+     print(f"FUNCTION: NBumiFitModelGPU() | FILE: {raw_filename}")
+
+     with open(mask_filename, 'rb') as f: mask_cpu = pickle.load(f)
+     mask_gpu = cupy.asarray(mask_cpu)
+     ng_filtered = stats['ng']
+
+     # MANUAL INIT
+     with h5py.File(raw_filename, 'r') as f:
+         indptr_cpu = f['X']['indptr'][:]
+         total_rows = len(indptr_cpu) - 1
+     device = ControlDevice(indptr=indptr_cpu, total_rows=total_rows, n_genes=ng_filtered, mode=mode, manual_target=manual_target)
+     nc = device.total_rows
+
+     tjs = stats['tjs'].values
+     tis = stats['tis'].values
+     total = stats['total']
+
+     tjs_gpu = cupy.asarray(tjs, dtype=cupy.float64)
+     tis_gpu = cupy.asarray(tis, dtype=cupy.float64)
+     sum_x_sq_gpu = cupy.zeros(ng_filtered, dtype=cupy.float64)
+     sum_2xmu_gpu = cupy.zeros(ng_filtered, dtype=cupy.float64)
+
+     print("Phase [1/3]: Pre-calculating sum of squared expectations...")
+     sum_tis_sq_gpu = cupy.sum(tis_gpu**2)
+     sum_mu_sq_gpu = (tjs_gpu**2 / total**2) * sum_tis_sq_gpu
+     print("Phase [1/3]: COMPLETE")
+
+     print("Phase [2/3]: Calculating variance components...")
+     with h5py.File(raw_filename, 'r') as f_in:
+         x_group = f_in['X']
+         h5_indptr = x_group['indptr']
+         h5_data = x_group['data']
+         h5_indices = x_group['indices']
+
+         current_row = 0
+         while current_row < nc:
+             end_row = device.get_next_chunk(current_row, mode='sparse', overhead_multiplier=1.1)
+             if end_row is None or end_row <= current_row: break
+
+             chunk_size = end_row - current_row
+             print(f"Phase [2/3]: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+             start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+             data_gpu_raw = cupy.asarray(h5_data[start_idx:end_idx], dtype=cupy.float64)
+             indices_gpu_raw = cupy.asarray(h5_indices[start_idx:end_idx])
+             indptr_gpu_raw = cupy.asarray(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
+
+             chunk_gpu = cp_csr_matrix((data_gpu_raw, indices_gpu_raw, indptr_gpu_raw), shape=(chunk_size, len(mask_cpu)))
+
+             # --- VIRTUAL FILTER + CEIL ---
+             chunk_gpu = chunk_gpu[:, mask_gpu]
+             chunk_gpu.data = cupy.ceil(chunk_gpu.data)
+             # -----------------------------
+
+             cupy.add.at(sum_x_sq_gpu, chunk_gpu.indices, chunk_gpu.data**2)
+
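+             # Recover, for every stored nonzero, the row (cell) it belongs to: mark the first nonzero
+             # of each row, then a running sum gives the row index, so tis can be gathered per entry.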
+             nnz_in_chunk = chunk_gpu.indptr[-1].item()
+             cell_boundary_markers = cupy.zeros(nnz_in_chunk, dtype=cupy.int32)
+             if len(chunk_gpu.indptr) > 1:
+                 cell_boundary_markers[chunk_gpu.indptr[:-1]] = 1
+             cell_indices_gpu = (cupy.cumsum(cell_boundary_markers, axis=0) - 1) + current_row
+
+             term_vals = 2 * chunk_gpu.data * tjs_gpu[chunk_gpu.indices] * tis_gpu[cell_indices_gpu] / total
+             cupy.add.at(sum_2xmu_gpu, chunk_gpu.indices, term_vals)
+
+             del chunk_gpu, data_gpu_raw, indices_gpu_raw, indptr_gpu_raw, cell_indices_gpu, term_vals
+             cupy.get_default_memory_pool().free_all_blocks()
+
+             current_row = end_row
+
+     print(f"\nPhase [2/3]: COMPLETE {' ' * 50}")
+
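+     # The per-gene sum of squared deviations is assembled from the streamed pieces:
+     #   sum((x - mu)^2) = sum(x^2) - 2*sum(x*mu) + sum(mu^2), with mu_ij = tj*ti/total.
+     # The NB size is then a method-of-moments estimate, sum(mu^2) / (sum((x - mu)^2) - tj)
+     # (since sum_i mu_ij = tj), with unstable or non-positive estimates pinned to 10000.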
+     print("Phase [3/3]: Finalizing dispersion...")
+     sum_sq_dev_gpu = sum_x_sq_gpu - sum_2xmu_gpu + sum_mu_sq_gpu
+     var_obs_gpu = sum_sq_dev_gpu / (nc - 1)
+
+     sizes_gpu = cupy.full(ng_filtered, 10000.0)
+     numerator_gpu = (tjs_gpu**2 / total**2) * sum_tis_sq_gpu
+     denominator_gpu = sum_sq_dev_gpu - tjs_gpu
+     stable_mask = denominator_gpu > 1e-6
+     sizes_gpu[stable_mask] = numerator_gpu[stable_mask] / denominator_gpu[stable_mask]
+     sizes_gpu[sizes_gpu <= 0] = 10000.0
+
+     var_obs_cpu = var_obs_gpu.get()
+     sizes_cpu = sizes_gpu.get()
+     print("Phase [3/3]: COMPLETE")
+
+     end_time = time.perf_counter()
+     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+     return {
+         'var_obs': pd.Series(var_obs_cpu, index=stats['tjs'].index),
+         'sizes': pd.Series(sizes_cpu, index=stats['tjs'].index),
+         'vals': stats
+     }
+
+
+ def NBumiFitDispVsMeanGPU(fit: dict, suppress_plot=True):
+     vals = fit['vals']
+     size_g = fit['sizes'].values
+     tjs = vals['tjs'].values
+     mean_expression = tjs / vals['nc']
+
+     forfit = (np.isfinite(size_g)) & (size_g < 1e6) & (mean_expression > 1e-3) & (size_g > 0)
+     log2_mean_expr = np.log2(mean_expression, where=(mean_expression > 0))
+
+     higher = log2_mean_expr > 4
+     if np.sum(higher & forfit) > 2000:
+         forfit = higher & forfit
+
+     y = np.log(size_g[forfit])
+     x = np.log(mean_expression[forfit])
+
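+     # OLS of log(size) on log(mean expression); the intercept and slope are reused by the
+     # feature-selection functions to predict an expected size for each gene.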
+     X = sm.add_constant(x)
+     model = sm.OLS(y, X).fit()
+
+     if not suppress_plot:
+         plt.figure(figsize=(7, 6))
+         plt.scatter(x, y, alpha=0.5, s=1)
+         plt.plot(x, model.fittedvalues, color='red')
+         plt.show()
+
+     return model.params
+
+
+ def NBumiFeatureSelectionHighVarGPU(fit: dict) -> pd.DataFrame:
+     start_time = time.perf_counter()
+     print(f"FUNCTION: NBumiFeatureSelectionHighVar()")
+
+     vals = fit['vals']
+     coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
+     mean_expression = vals['tjs'].values / vals['nc']
+
+     with np.errstate(divide='ignore', invalid='ignore'):
+         log_mean_expression = np.log(mean_expression)
+         log_mean_expression[np.isneginf(log_mean_expression)] = 0
+         exp_size = np.exp(coeffs[0] + coeffs[1] * log_mean_expression)
+         res = np.log(fit['sizes'].values) - np.log(exp_size)
+
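+     # Residual = log(fitted size) - log(size expected from the mean trend). Since NB variance is
+     # mu + mu^2/size, the most negative residuals mark the most over-dispersed genes, which the
+     # ascending sort places at the top of the table.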
+     results_df = pd.DataFrame({'Gene': fit['sizes'].index, 'Residual': res})
+     final_table = results_df.sort_values(by='Residual', ascending=True)
+
+     end_time = time.perf_counter()
+     print(f"Total time: {end_time - start_time:.4f} seconds.\n")
+     return final_table
+
+
+ def NBumiFeatureSelectionCombinedDropGPU(
+     fit: dict,
+     raw_filename: str,
+     # Mask not strictly needed for calc (uses vectors),
+     # but needed if we want consistent API.
+     # However, we DO need ng_filtered for ControlDevice.
+     method="fdr_bh",
+     qval_thresh=0.05,
+     mode: str = "auto",
+     manual_target: int = 3000
+ ) -> pd.DataFrame:
+
+     start_time = time.perf_counter()
+     print(f"FUNCTION: NBumiFeatureSelectionCombinedDrop() | FILE: {raw_filename}")
+
+     ng_filtered = fit['vals']['ng']
+
+     # MANUAL INIT
+     with h5py.File(raw_filename, 'r') as f:
+         indptr_cpu = f['X']['indptr'][:]
+         total_rows = len(indptr_cpu) - 1
+     device = ControlDevice(indptr=indptr_cpu, total_rows=total_rows, n_genes=ng_filtered, mode=mode, manual_target=manual_target)
+     nc = device.total_rows
+
+     print("Phase [1/3]: Initializing arrays...")
+     vals = fit['vals']
+     coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
+
+     tjs_gpu = cupy.asarray(vals['tjs'].values, dtype=cupy.float64)
+     tis_gpu = cupy.asarray(vals['tis'].values, dtype=cupy.float64)
+     total = vals['total']
+
+     mean_expression_cpu = vals['tjs'].values / nc
+     with np.errstate(divide='ignore'):
+         exp_size_cpu = np.exp(coeffs[0] + coeffs[1] * np.log(mean_expression_cpu))
+     exp_size_gpu = cupy.asarray(exp_size_cpu, dtype=cupy.float64)
+
+     p_sum_gpu = cupy.zeros(ng_filtered, dtype=cupy.float64)
+     p_var_sum_gpu = cupy.zeros(ng_filtered, dtype=cupy.float64)
+     print("Phase [1/3]: COMPLETE")
+
+     print("Phase [2/3]: Calculating dropout stats (Virtual)...")
+
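+     # mode='dense' here: each chunk allocates a full (chunk_size x ng_filtered) float64 work matrix
+     # below, which is presumably why the device was initialized with ng_filtered and asked for
+     # dense (rather than sparse) chunk sizing.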
+     current_row = 0
+     while current_row < nc:
+         # Dense mode check is safe here because device sees ng_filtered
+         end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=1.1)
+         if end_row is None or end_row <= current_row: break
+
+         chunk_size = end_row - current_row
+         print(f"Phase [2/3]: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+         tis_chunk_gpu = tis_gpu[current_row:end_row]
+         work_matrix = cupy.empty((chunk_size, ng_filtered), dtype=cupy.float64)
+
+         dropout_prob_kernel(
+             tjs_gpu,
+             tis_chunk_gpu[:, cupy.newaxis],
+             total,
+             exp_size_gpu,
+             work_matrix
+         )
+
+         p_sum_gpu += work_matrix.sum(axis=0)
+         dropout_variance_inplace_kernel(work_matrix, work_matrix)
+         p_var_sum_gpu += work_matrix.sum(axis=0)
+
+         del work_matrix, tis_chunk_gpu
+         cupy.get_default_memory_pool().free_all_blocks()
+
+         current_row = end_row
+
+     print(f"\nPhase [2/3]: COMPLETE {' ' * 50}")
+
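+     # droprate_exp is each gene's mean NB zero probability across cells; its standard error comes
+     # from the accumulated p*(1-p) terms. Observed minus expected dropout is tested with a one-sided
+     # Z-test (norm.sf), then corrected for multiple testing before filtering on q.value.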
+     print("Phase [3/3]: Statistical testing...")
+     p_sum_cpu = p_sum_gpu.get()
+     p_var_sum_cpu = p_var_sum_gpu.get()
+
+     droprate_exp = p_sum_cpu / nc
+     droprate_exp_err = np.sqrt(p_var_sum_cpu / (nc**2))
+     droprate_obs = vals['djs'].values / nc
+
+     diff = droprate_obs - droprate_exp
+     combined_err = np.sqrt(droprate_exp_err**2 + (droprate_obs * (1 - droprate_obs) / nc))
+
+     with np.errstate(divide='ignore', invalid='ignore'):
+         Zed = diff / combined_err
+
+     pvalue = norm.sf(Zed)
+
+     results_df = pd.DataFrame({'Gene': vals['tjs'].index, 'p.value': pvalue, 'effect_size': diff})
+     results_df = results_df.sort_values(by='p.value')
+
+     qval = multipletests(results_df['p.value'].fillna(1), method=method)[1]
+     results_df['q.value'] = qval
+     final_table = results_df[results_df['q.value'] < qval_thresh]
+
+     print("Phase [3/3]: COMPLETE")
+     end_time = time.perf_counter()
+     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+     return final_table[['Gene', 'effect_size', 'p.value', 'q.value']]
+
+ def NBumiCombinedDropVolcanoGPU(results_df: pd.DataFrame, qval_thresh=0.05, effect_size_thresh=0.25, top_n_genes=10, suppress_plot=False, plot_filename=None):
+     start_time = time.perf_counter()
+     print(f"FUNCTION: NBumiCombinedDropVolcano()")
+
+     df = results_df.copy()
+     if (df['q.value'] == 0).any():
+         non_zero_min = df[df['q.value'] > 0]['q.value'].min()
+         df['q.value'] = df['q.value'].replace(0, non_zero_min)
+
+     df['-log10_qval'] = -np.log10(df['q.value'])
+     df['color'] = 'grey'
+     df.loc[(df['q.value'] < qval_thresh) & (df['effect_size'] > effect_size_thresh), 'color'] = 'red'
+     df.loc[(df['q.value'] < qval_thresh) & (df['effect_size'] < -effect_size_thresh), 'color'] = 'blue'
+
+     plt.figure(figsize=(10, 8))
+     plt.scatter(df['effect_size'], df['-log10_qval'], c=df['color'], s=10, alpha=0.6, edgecolors='none')
+
+     plt.axvline(x=effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
+     plt.axvline(x=-effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
+     plt.axhline(y=-np.log10(qval_thresh), linestyle='--', color='grey', linewidth=0.8)
+
+     top_genes = df.nsmallest(top_n_genes, 'q.value')
+     for i, row in top_genes.iterrows():
+         plt.text(row['effect_size'], row['-log10_qval'], row['Gene'], fontsize=9, fontweight='bold')
+
+     plt.title('Volcano Plot: Dropout Rate vs Significance')
+     plt.xlabel('Effect Size (Observed - Expected Dropout Rate)')
+     plt.ylabel('-log10 (FDR Adjusted p-value)')
+     plt.grid(True, linestyle='--', alpha=0.3)
+
+     ax = plt.gca()
+
+     if plot_filename:
+         print(f"Saving plot to: {plot_filename}")
+         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+
+     if not suppress_plot:
+         plt.show()
+
+     plt.close()
+
+     end_time = time.perf_counter()
+     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+     return ax