M3Drop 0.4.42-py3-none-any.whl → 0.4.45-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
m3Drop/coreGPU.py DELETED
@@ -1,591 +0,0 @@
- import time
- import psutil
- import h5py
- import numpy as np
- import anndata
- import pandas as pd
- import os
- import scipy.sparse as sp
- from scipy.sparse import csr_matrix as sp_csr_matrix
-
- import statsmodels.api as sm
- import matplotlib.pyplot as plt
- from scipy.stats import norm
- from statsmodels.stats.multitest import multipletests
-
- # Safe Import for Local vs Supercomputer
- try:
-     import cupy
-     import cupy.sparse as csp
-     from cupy.sparse import csr_matrix as cp_csr_matrix
-     HAS_GPU = True
- except ImportError:
-     cupy = None
-     HAS_GPU = False
-     print(" [WARNING] CuPy not found. GPU acceleration disabled.")
-
- # --- (PING & GOVERNOR PROTOCOL) ---
- def get_optimal_chunk_size(filename: str, multiplier: float, is_dense: bool = False, override_cap: int = 50000) -> int:
-     """
-     AUTO-TUNER ENGINE (PING & GOVERNOR).
-
-     Sensors:
-         1. Data Weight (Exact bytes per row)
-         2. RAM Pressure (psutil)
-         3. VRAM Pressure (cupy)
-         4. Context (SLURM Check)
-
-     Governor:
-         - Cluster: Maximize Throughput (3k-row anti-stall floor, Ignore CPU Cache)
-         - Local: Maximize Responsiveness (Target 10MB Chunk, Protect CPU Cache)
-     """
-
-     # --- SENSOR A: DATA WEIGHT ---
-     with h5py.File(filename, 'r') as f:
-         x_group = f['X']
-         shape = x_group.attrs['shape']
-         n_cells, n_genes = shape[0], shape[1]
-
-         # Detect exact byte size (4 for float32, 8 for float64)
-         if 'data' in x_group:
-             dtype_size = x_group['data'].dtype.itemsize
-         else:
-             dtype_size = 4  # Default safety
-
-         # Calculate Load
-         if is_dense:
-             # Dense: Width * Bytes * Overhead
-             bytes_per_row = n_genes * dtype_size * multiplier
-         else:
-             # Sparse: (Val + Col + Ptr) * Density
-             if 'indptr' in x_group:
-                 nnz = x_group['indptr'][-1]
-                 density = nnz / (n_cells * n_genes)
-             else:
-                 density = 0.1  # Safety default
-
-             # Sparse Row = (Bytes_Data + 4_Index) * density * n_genes
-             bytes_per_row = (n_genes * density * (dtype_size + 4)) * multiplier
-
-     if bytes_per_row < 1: bytes_per_row = 1
-
-     # --- SENSOR B: RAM CAPACITY ---
-     avail_ram = psutil.virtual_memory().available
-     limit_ram = int((avail_ram * 0.30) / bytes_per_row)  # Cap at 30% RAM
-
-     # --- SENSOR C: VRAM CAPACITY ---
-     limit_vram = float('inf')
-     if HAS_GPU:
-         try:
-             mempool = cupy.get_default_memory_pool()
-             mempool.free_all_blocks()
-             free_vram = cupy.cuda.Device(0).mem_info[0]
-             limit_vram = int((free_vram * 0.60) / bytes_per_row)  # Cap at 60% VRAM
-         except Exception:
-             pass
-
-     # --- SENSOR D: CONTEXT CHECK (SLURM) ---
-     # This is the Ticket Stub check.
-     is_cluster = "SLURM_JOB_ID" in os.environ
-
-     # --- THE GOVERNOR ---
-
-     if is_cluster and HAS_GPU:
-         # SCENARIO 1: CLUSTER (Beast Mode)
-         # Goal: Throughput. Ignore CPU Cache.
-         optimal = min(limit_ram, limit_vram)
-
-         # ANTI-STALL FLOOR: Force 3,000 rows minimum to overcome latency
-         # (Lowered to 3,000 to prevent OOM on massive dense files)
-         if optimal < 3000 and optimal > 100: optimal = 3000
-
-         mode_msg = "CLUSTER (SLURM Detected)"
-
-     else:
-         # SCENARIO 2: LOCAL (Safe Harbor)
-         # Goal: Responsiveness. Protect L3 Cache.
-
-         # Sensor 4: CPU Cache Target (10MB)
-         # 10MB fits in almost all L3 caches (preventing thrashing)
-         target_10mb_rows = int(10_000_000 / bytes_per_row)
-
-         optimal = min(limit_ram, limit_vram, target_10mb_rows)
-
-         # ANTI-FREEZE FLOOR: Force 500 rows minimum
-         if optimal < 500: optimal = 500
-
-         mode_msg = "LOCAL (Safe Harbor)"
-
-     # GLOBAL CAP (Transport Safety & Function Specific Override)
-     if optimal > override_cap: optimal = override_cap
-
-     # Cap at total file size
-     if optimal > n_cells: optimal = n_cells
-
-     # --- TELEMETRY OUTPUT ---
-     print(f"\n------------------------------------------------------------")
-     print(f" CHUNK SIZE OPTIMIZER (PING & GOVERNOR) [{time.strftime('%H:%M:%S')}]")
-     print(f"------------------------------------------------------------")
-     print(f" CONTEXT      : {mode_msg}")
-     print(f" DATA LOAD    : {int(bytes_per_row):,} bytes/row (dtype={dtype_size})")
-     print(f" MULTIPLIER   : {multiplier}x")
-     print(f" OVERRIDE CAP : {override_cap:,} rows")
-     print(f" RAM LIMIT    : {limit_ram:,} rows")
-     if HAS_GPU and limit_vram != float('inf'):
-         print(f" VRAM LIMIT   : {limit_vram:,} rows")
-     else:
-         print(f" VRAM LIMIT   : N/A")
-     print(f"------------------------------------------------------------")
-     print(f" >> CHUNK SIZE : {int(optimal):,} rows")
-     print(f"------------------------------------------------------------\n")
-
-     return int(optimal)
-
-
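For concreteness, the sparse branch of the estimator above reduces to two lines of arithmetic. A minimal standalone sketch, with illustrative numbers (20,000 genes, 5% density, float32 — assumptions for the example, not values from any shipped file):

    # Sparse bytes-per-row estimate, as computed in SENSOR A above
    n_genes, density, dtype_size, multiplier = 20_000, 0.05, 4, 2.5
    # (data bytes + 4-byte column index) per non-zero, scaled by the workload multiplier
    bytes_per_row = n_genes * density * (dtype_size + 4) * multiplier   # 20,000 bytes
    # Local mode then targets a 10 MB chunk before floors and caps apply
    target_rows = int(10_000_000 / bytes_per_row)                       # 500 rows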
- def ConvertDataSparseGPU(
-     input_filename: str,
-     output_filename: str
- ):
-     """
-     GPU-ACCELERATED CLEANING.
-     Now properly shifts gears between Phase 1 (Fast Read) and Phase 2 (Slow Write).
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: ConvertDataSparseGPU() | FILE: {input_filename}")
-
-     with h5py.File(input_filename, 'r') as f_in:
-         x_group_in = f_in['X']
-         n_cells, n_genes = x_group_in.attrs['shape']
-
-         # --- GEAR 2: FAST READ (Phase 1) ---
-         # We are only reading indices. No writing. Let it fly.
-         # Max cap 50k to saturate PCIe bus without timeout.
-         read_chunk_size = get_optimal_chunk_size(input_filename, multiplier=2.5, is_dense=False, override_cap=50000)
-
-         print(f"Phase [1/2]: Identifying genes with non-zero counts... (Chunk: {read_chunk_size})")
-
-         if HAS_GPU:
-             genes_to_keep_mask = cupy.zeros(n_genes, dtype=bool)
-         else:
-             genes_to_keep_mask = np.zeros(n_genes, dtype=bool)
-
-         h5_indptr = x_group_in['indptr']
-         h5_indices = x_group_in['indices']
-
-         for i in range(0, n_cells, read_chunk_size):
-             end_row = min(i + read_chunk_size, n_cells)
-             print(f"Phase [1/2]: Processing: {end_row} of {n_cells} cells.", end='\r')
-
-             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-             if start_idx == end_idx:
-                 continue
-
-             indices_cpu = h5_indices[start_idx:end_idx]
-
-             if HAS_GPU:
-                 indices_gpu = cupy.asarray(indices_cpu)
-                 unique_gpu = cupy.unique(indices_gpu)
-                 genes_to_keep_mask[unique_gpu] = True
-                 del indices_gpu, unique_gpu
-                 cupy.get_default_memory_pool().free_all_blocks()
-             else:
-                 unique_cpu = np.unique(indices_cpu)
-                 genes_to_keep_mask[unique_cpu] = True
-
-         if HAS_GPU:
-             genes_to_keep_mask_cpu = cupy.asnumpy(genes_to_keep_mask)
-         else:
-             genes_to_keep_mask_cpu = genes_to_keep_mask
-
-         n_genes_to_keep = np.sum(genes_to_keep_mask_cpu)
-         print(f"\nPhase [1/2]: COMPLETE | Result: {n_genes_to_keep} / {n_genes} genes retained.")
-
-         # --- GEAR 1: FORKLIFT WRITE (Phase 2) ---
-         # We are writing to disk. We MUST slow down to 5,000 to save the hard drive.
-         write_chunk_size = get_optimal_chunk_size(input_filename, multiplier=2.5, is_dense=False, override_cap=5000)
-
-         print(f"Phase [2/2]: Rounding up decimals and saving filtered output to disk... (Chunk: {write_chunk_size})")
-         adata_meta = anndata.read_h5ad(input_filename, backed='r')
-         filtered_var_df = adata_meta.var[genes_to_keep_mask_cpu]
-
-         adata_out_template = anndata.AnnData(obs=adata_meta.obs, var=filtered_var_df, uns=adata_meta.uns)
-         adata_out_template.write_h5ad(output_filename, compression="gzip")
-
-         with h5py.File(output_filename, 'a') as f_out:
-             if 'X' in f_out: del f_out['X']
-             x_group_out = f_out.create_group('X')
-
-             out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
-             out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
-             out_indptr = x_group_out.create_dataset('indptr', shape=(n_cells + 1,), dtype='int64')
-             out_indptr[0] = 0
-             current_nnz = 0
-
-             h5_data = x_group_in['data']
-
-             for i in range(0, n_cells, write_chunk_size):
-                 end_row = min(i + write_chunk_size, n_cells)
-                 print(f"Phase [2/2]: Processing: {end_row} of {n_cells} cells.", end='\r')
-
-                 start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-
-                 data_slice = h5_data[start_idx:end_idx]
-                 indices_slice = h5_indices[start_idx:end_idx]
-                 indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
-
-                 chunk = sp_csr_matrix((data_slice, indices_slice, indptr_slice), shape=(end_row-i, n_genes))
-                 filtered_chunk = chunk[:, genes_to_keep_mask_cpu]
-                 filtered_chunk.data = np.ceil(filtered_chunk.data).astype('float32')
-
-                 out_data.resize(current_nnz + filtered_chunk.nnz, axis=0)
-                 out_data[current_nnz:] = filtered_chunk.data
-
-                 out_indices.resize(current_nnz + filtered_chunk.nnz, axis=0)
-                 out_indices[current_nnz:] = filtered_chunk.indices
-
-                 new_indptr_list = filtered_chunk.indptr[1:].astype(np.int64) + current_nnz
-                 out_indptr[i + 1 : end_row + 1] = new_indptr_list
-
-                 current_nnz += filtered_chunk.nnz
-
-             x_group_out.attrs['encoding-type'] = 'csr_matrix'
-             x_group_out.attrs['encoding-version'] = '0.1.0'
-             x_group_out.attrs['shape'] = np.array([n_cells, n_genes_to_keep], dtype='int64')
-             print(f"\nPhase [2/2]: COMPLETE | Output: {output_filename} {' ' * 50}")
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
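The cleaning pass is driven entirely by filenames. A hypothetical invocation against a release that still shipped this module (both paths are placeholders, not package defaults):

    # Hypothetical usage: drops all-zero genes and ceils fractional counts
    from m3Drop.coreGPU import ConvertDataSparseGPU

    ConvertDataSparseGPU(
        input_filename="raw_counts.h5ad",
        output_filename="cleaned_counts.h5ad",
    )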
- def hidden_calc_valsGPU(filename: str) -> dict:
-     """ Calculates key statistics using a memory-safe, GPU-accelerated algorithm. """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: hidden_calc_valsGPU() | FILE: {filename}")
-
-     # GEAR 3: CRUISER MODE (Transport Bound)
-     # Simple math. Maximize throughput with 50k cap.
-     chunk_size = get_optimal_chunk_size(filename, multiplier=3.0, is_dense=False, override_cap=50000)
-
-     adata_meta = anndata.read_h5ad(filename, backed='r')
-     print("Phase [1/3]: Finding nc and ng...")
-     nc, ng = adata_meta.shape
-     print(f"Phase [1/3]: COMPLETE")
-
-     tis = np.zeros(nc, dtype='int64')
-     cell_non_zeros = np.zeros(nc, dtype='int64')
-     tjs_gpu = cupy.zeros(ng, dtype=cupy.float32)
-     gene_non_zeros_gpu = cupy.zeros(ng, dtype=cupy.int32)
-
-     print("Phase [2/3]: Calculating tis and tjs...")
-     with h5py.File(filename, 'r') as f_in:
-         x_group = f_in['X']
-         h5_indptr = x_group['indptr']
-         h5_data = x_group['data']
-         h5_indices = x_group['indices']
-
-         for i in range(0, nc, chunk_size):
-             end_row = min(i + chunk_size, nc)
-             print(f"Phase [2/3]: Processing: {end_row} of {nc} cells.", end='\r')
-
-             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-             data_slice = h5_data[start_idx:end_idx]
-             indices_slice = h5_indices[start_idx:end_idx]
-             indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
-
-             data_gpu = cupy.asarray(data_slice.copy(), dtype=cupy.float32)
-             indices_gpu = cupy.asarray(indices_slice.copy())
-             indptr_gpu = cupy.asarray(indptr_slice.copy())
-
-             chunk_gpu = cp_csr_matrix((data_gpu, indices_gpu, indptr_gpu), shape=(end_row-i, ng))
-
-             tis[i:end_row] = chunk_gpu.sum(axis=1).get().flatten()
-             cell_non_zeros_chunk = cupy.diff(indptr_gpu)
-             cell_non_zeros[i:end_row] = cell_non_zeros_chunk.get()
-
-             cupy.add.at(tjs_gpu, indices_gpu, data_gpu)
-             unique_indices_gpu, counts_gpu = cupy.unique(indices_gpu, return_counts=True)
-             cupy.add.at(gene_non_zeros_gpu, unique_indices_gpu, counts_gpu)
-
-             del data_gpu, indices_gpu, indptr_gpu, chunk_gpu
-             cupy.get_default_memory_pool().free_all_blocks()
-
-     tjs = cupy.asnumpy(tjs_gpu)
-     gene_non_zeros = cupy.asnumpy(gene_non_zeros_gpu)
-     print(f"Phase [2/3]: COMPLETE{' ' * 50}")
-
-     print("Phase [3/3]: Calculating dis, djs, and total...")
-     dis = ng - cell_non_zeros
-     djs = nc - gene_non_zeros
-     total = tjs.sum()
-     print("Phase [3/3]: COMPLETE")
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-     return {
-         "tis": pd.Series(tis, index=adata_meta.obs.index),
-         "tjs": pd.Series(tjs, index=adata_meta.var.index),
-         "dis": pd.Series(dis, index=adata_meta.obs.index),
-         "djs": pd.Series(djs, index=adata_meta.var.index),
-         "total": total,
-         "nc": nc,
-         "ng": ng
-     }
-
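Summarizing what the loop above computes (this is a restatement of the code, not an excerpt from package docs): with x_ij the count for cell i and gene j, over n_c cells and n_g genes,

    t_i = \sum_j x_{ij} \;(\texttt{tis}), \qquad
    t_j = \sum_i x_{ij} \;(\texttt{tjs}), \qquad
    T = \sum_j t_j \;(\texttt{total}),

    d_i = n_g - \#\{j : x_{ij} > 0\} \;(\texttt{dis}), \qquad
    d_j = n_c - \#\{i : x_{ij} > 0\} \;(\texttt{djs})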
- def NBumiFitModelGPU(cleaned_filename: str, stats: dict) -> dict:
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiFitModelGPU() | FILE: {cleaned_filename}")
-
-     # GEAR 2: HEAVY LIFT MODE (Memory Bound)
-     # High Multiplier (12.0) to account for heavy intermediate matrices (x, x^2, mean).
-     # Capped at 50k - below that, the chunk scales with available VRAM.
-     # If 12GB VRAM -> ~8k rows. If 80GB VRAM -> ~50k rows.
-     chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=12.0, is_dense=False, override_cap=50000)
-
-     tjs = stats['tjs'].values
-     tis = stats['tis'].values
-     nc, ng = stats['nc'], stats['ng']
-     total = stats['total']
-
-     tjs_gpu = cupy.asarray(tjs, dtype=cupy.float64)
-     tis_gpu = cupy.asarray(tis, dtype=cupy.float64)
-
-     sum_x_sq_gpu = cupy.zeros(ng, dtype=cupy.float64)
-     sum_2xmu_gpu = cupy.zeros(ng, dtype=cupy.float64)
-
-     print("Phase [1/3]: Pre-calculating sum of squared expectations...")
-     sum_tis_sq_gpu = cupy.sum(tis_gpu**2)
-     sum_mu_sq_gpu = (tjs_gpu**2 / total**2) * sum_tis_sq_gpu
-     print("Phase [1/3]: COMPLETE")
-
-     print("Phase [2/3]: Calculating variance components from data chunks...")
-     with h5py.File(cleaned_filename, 'r') as f_in:
-         x_group = f_in['X']
-         h5_indptr = x_group['indptr']
-         h5_data = x_group['data']
-         h5_indices = x_group['indices']
-
-         for i in range(0, nc, chunk_size):
-             end_row = min(i + chunk_size, nc)
-             print(f"Phase [2/3]: Processing: {end_row} of {nc} cells.", end='\r')
-
-             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-             if start_idx == end_idx: continue
-
-             data_gpu = cupy.asarray(h5_data[start_idx:end_idx], dtype=cupy.float64)
-             indices_gpu = cupy.asarray(h5_indices[start_idx:end_idx])
-             indptr_gpu = cupy.asarray(h5_indptr[i:end_row+1] - h5_indptr[i])
-
-             cupy.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
-
-             nnz_in_chunk = indptr_gpu[-1].item()
-             cell_boundary_markers = cupy.zeros(nnz_in_chunk, dtype=cupy.int32)
-             if len(indptr_gpu) > 1:
-                 cell_boundary_markers[indptr_gpu[:-1]] = 1
-             cell_indices_gpu = (cupy.cumsum(cell_boundary_markers, axis=0) - 1) + i
-
-             tis_per_nz = tis_gpu[cell_indices_gpu]
-             tjs_per_nz = tjs_gpu[indices_gpu]
-             term_vals = 2 * data_gpu * tjs_per_nz * tis_per_nz / total
-             cupy.add.at(sum_2xmu_gpu, indices_gpu, term_vals)
-
-             del data_gpu, indices_gpu, indptr_gpu, cell_indices_gpu
-             del tis_per_nz, tjs_per_nz, term_vals
-             cupy.get_default_memory_pool().free_all_blocks()
-
-     print(f"Phase [2/3]: COMPLETE {' ' * 50}")
-
-     print("Phase [3/3]: Finalizing dispersion and variance calculations...")
-     sum_sq_dev_gpu = sum_x_sq_gpu - sum_2xmu_gpu + sum_mu_sq_gpu
-     var_obs_gpu = sum_sq_dev_gpu / (nc - 1)
-
-     sizes_gpu = cupy.full(ng, 10000.0)
-     numerator_gpu = (tjs_gpu**2 / total**2) * sum_tis_sq_gpu
-     denominator_gpu = sum_sq_dev_gpu - tjs_gpu
-     stable_mask = denominator_gpu > 1e-6
-     sizes_gpu[stable_mask] = numerator_gpu[stable_mask] / denominator_gpu[stable_mask]
-     sizes_gpu[sizes_gpu <= 0] = 10000.0
-
-     var_obs_cpu = var_obs_gpu.get()
-     sizes_cpu = sizes_gpu.get()
-     print("Phase [3/3]: COMPLETE")
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-     return {
-         'var_obs': pd.Series(var_obs_cpu, index=stats['tjs'].index),
-         'sizes': pd.Series(sizes_cpu, index=stats['tjs'].index),
-         'vals': stats
-     }
-
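The Phase [3/3] algebra is a method-of-moments fit of the per-gene negative-binomial size under depth-adjusted means. Writing \mu_{ij} = t_i t_j / T, the code's numerator and denominator correspond to (a restatement of the code, in LaTeX):

    \mathrm{var}(x_{ij}) = \mu_{ij} + \frac{\mu_{ij}^2}{\mathrm{size}_j}
    \;\Longrightarrow\;
    \mathrm{size}_j
    = \frac{\sum_i \mu_{ij}^2}{\sum_i (x_{ij} - \mu_{ij})^2 - \sum_i \mu_{ij}}
    = \frac{(t_j^2 / T^2) \sum_i t_i^2}{\texttt{sum\_sq\_dev}_j - t_j}

since \sum_i \mu_{ij} = t_j.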
- def NBumiFitDispVsMeanGPU(fit, suppress_plot=True):
-     vals = fit['vals']
-     size_g = fit['sizes'].values
-     tjs = vals['tjs'].values
-     mean_expression = tjs / vals['nc']
-
-     forfit = (np.isfinite(size_g)) & (size_g < 1e6) & (mean_expression > 1e-3) & (size_g > 0)
-     log2_mean_expr = np.log2(mean_expression, where=(mean_expression > 0))
-     higher = log2_mean_expr > 4
-     if np.sum(higher & forfit) > 2000:
-         forfit = higher & forfit
-
-     y = np.log(size_g[forfit])
-     x = np.log(mean_expression[forfit])
-
-     X = sm.add_constant(x)
-     model = sm.OLS(y, X).fit()
-
-     if not suppress_plot:
-         plt.figure(figsize=(7, 6))
-         plt.scatter(x, y, alpha=0.5)
-         plt.plot(x, model.fittedvalues, color='red')
-         plt.show()
-
-     return model.params
-
- def NBumiFeatureSelectionHighVarGPU(fit: dict) -> pd.DataFrame:
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiFeatureSelectionHighVarGPU()")
-
-     print("Phase [1/1]: Calculating residuals...")
-     vals = fit['vals']
-     coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
-     mean_expression = vals['tjs'].values / vals['nc']
-
-     with np.errstate(divide='ignore', invalid='ignore'):
-         log_mean_expression = np.log(mean_expression)
-         log_mean_expression[np.isneginf(log_mean_expression)] = 0
-         exp_size = np.exp(coeffs[0] + coeffs[1] * log_mean_expression)
-         res = np.log(fit['sizes'].values) - np.log(exp_size)
-
-     results_df = pd.DataFrame({'Gene': fit['sizes'].index, 'Residual': res})
-     final_table = results_df.sort_values(by='Residual', ascending=True)
-     print("Phase [1/1]: COMPLETE")
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.4f} seconds.\n")
-
-     return final_table
-
- def NBumiFeatureSelectionCombinedDropGPU(fit: dict, cleaned_filename: str, method="fdr_bh", qval_thresh=0.05) -> pd.DataFrame:
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiFeatureSelectionCombinedDropGPU() | FILE: {cleaned_filename}")
-
-     # GEAR 4: DENSE MATH MODE (Memory Critical)
-     # Multiplier 20.0x:
-     #     1. We assume data promotes to float64 (double memory).
-     #     2. We broadcast dense matrices (ng * chunk).
-     #     3. We hold ~5 copies (mu, exp_size, p_is, p_var, temp).
-     chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=20.0, is_dense=True, override_cap=20000)
-
-     print("Phase [1/3]: Initializing arrays...")
-     vals = fit['vals']
-     coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
-
-     tjs_gpu = cupy.asarray(vals['tjs'].values)
-     tis_gpu = cupy.asarray(vals['tis'].values)
-     total = vals['total']
-     nc = vals['nc']
-     ng = vals['ng']
-
-     mean_expression_cpu = vals['tjs'].values / nc
-     with np.errstate(divide='ignore'):
-         exp_size_cpu = np.exp(coeffs[0] + coeffs[1] * np.log(mean_expression_cpu))
-     exp_size_gpu = cupy.asarray(exp_size_cpu)
-
-     p_sum_gpu = cupy.zeros(ng, dtype=cupy.float64)
-     p_var_sum_gpu = cupy.zeros(ng, dtype=cupy.float64)
-     print("Phase [1/3]: COMPLETE")
-
-     print("Phase [2/3]: Calculating expected dropout sums...")
-     for i in range(0, nc, chunk_size):
-         end_col = min(i + chunk_size, nc)
-         print(f"Phase [2/3]: Processing: {end_col} of {nc} cells.", end='\r')
-
-         tis_chunk_gpu = tis_gpu[i:end_col]
-         # Memory intense: creates dense (genes x chunk) float64 matrices
-         mu_chunk_gpu = tjs_gpu[:, cupy.newaxis] * tis_chunk_gpu[cupy.newaxis, :] / total
-
-         # Calculate p_is and p_var in steps to allow memory recycling if possible
-
-         # [PATCH START] Restored safety clamping from CPU version to prevent NaN/Inf crashes
-         base = 1 + mu_chunk_gpu / exp_size_gpu[:, cupy.newaxis]
-         base = cupy.maximum(base, 1e-12)
-
-         p_is_chunk_gpu = cupy.power(base, -exp_size_gpu[:, cupy.newaxis])
-         p_is_chunk_gpu = cupy.nan_to_num(p_is_chunk_gpu, nan=0.0, posinf=1.0, neginf=0.0)
-         # [PATCH END]
-
-         p_sum_gpu += p_is_chunk_gpu.sum(axis=1)
-
-         # Calculate Variance
-         p_var_is_chunk_gpu = p_is_chunk_gpu * (1 - p_is_chunk_gpu)
-         p_var_sum_gpu += p_var_is_chunk_gpu.sum(axis=1)
-
-         # Aggressive cleanup
-         del mu_chunk_gpu, p_is_chunk_gpu, p_var_is_chunk_gpu, tis_chunk_gpu
-         cupy.get_default_memory_pool().free_all_blocks()
-
-     print(f"Phase [2/3]: COMPLETE {' ' * 50}")
-
-     print("Phase [3/3]: Statistical testing...")
-     p_sum_cpu = p_sum_gpu.get()
-     p_var_sum_cpu = p_var_sum_gpu.get()
-
-     droprate_exp = p_sum_cpu / nc
-     droprate_exp_err = np.sqrt(p_var_sum_cpu / (nc**2))
-     droprate_obs = vals['djs'].values / nc
-
-     diff = droprate_obs - droprate_exp
-     combined_err = np.sqrt(droprate_exp_err**2 + (droprate_obs * (1 - droprate_obs) / nc))
-
-     with np.errstate(divide='ignore', invalid='ignore'):
-         Zed = diff / combined_err
-
-     pvalue = norm.sf(Zed)
-     results_df = pd.DataFrame({'Gene': vals['tjs'].index, 'p.value': pvalue, 'effect_size': diff})
-     results_df = results_df.sort_values(by='p.value')
-     qval = multipletests(results_df['p.value'].fillna(1), method=method)[1]
-     results_df['q.value'] = qval
-     final_table = results_df[results_df['q.value'] < qval_thresh]
-     print("Phase [3/3]: COMPLETE")
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-     return final_table[['Gene', 'effect_size', 'p.value', 'q.value']]
-
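The per-entry quantity clamped in the [PATCH] block is the negative-binomial probability of observing a zero, and the final test is a per-gene Z-test of observed versus expected dropout rates (again a restatement of the code, with d_j denoting dropout rates):

    P(x_{ij} = 0) = \Bigl(1 + \frac{\mu_{ij}}{\mathrm{size}_j}\Bigr)^{-\mathrm{size}_j},
    \qquad
    Z_j = \frac{d_j^{\mathrm{obs}} - d_j^{\mathrm{exp}}}
               {\sqrt{(\mathrm{err}_j^{\mathrm{exp}})^2 + d_j^{\mathrm{obs}} (1 - d_j^{\mathrm{obs}}) / n_c}}

with one-sided p-values (norm.sf) and Benjamini-Hochberg correction applied afterwards.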
- def NBumiCombinedDropVolcanoGPU(results_df, qval_thresh=0.05, effect_size_thresh=0.25, top_n_genes=10, suppress_plot=False, plot_filename=None):
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiCombinedDropVolcanoGPU()")
-
-     df = results_df.copy()
-     non_zero_min = df[df['q.value'] > 0]['q.value'].min()
-     df['q.value'] = df['q.value'].replace(0, non_zero_min)
-     df['-log10_qval'] = -np.log10(df['q.value'])
-     df['color'] = 'grey'
-     df.loc[(df['q.value'] < qval_thresh) & (df['effect_size'] > effect_size_thresh), 'color'] = 'red'
-     df.loc[(df['q.value'] < qval_thresh) & (df['effect_size'] < -effect_size_thresh), 'color'] = 'blue'
-
-     plt.figure(figsize=(10, 8))
-     plt.scatter(df['effect_size'], df['-log10_qval'], c=df['color'], s=10, alpha=0.6)
-     plt.axvline(x=effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
-     plt.axvline(x=-effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
-     plt.axhline(y=-np.log10(qval_thresh), linestyle='--', color='grey', linewidth=0.8)
-
-     top_genes = df.nsmallest(top_n_genes, 'q.value')
-     for i, row in top_genes.iterrows():
-         plt.text(row['effect_size'], row['-log10_qval'], row['Gene'], fontsize=9)
-
-     plt.title('Volcano Plot of Dropout Feature Selection')
-     plt.xlabel('Effect Size (Observed - Expected Dropout Rate)')
-     plt.ylabel('-log10 (Adjusted p-value)')
-     plt.grid(True, linestyle='--', alpha=0.3)
-     ax = plt.gca()
-
-     if plot_filename: plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-     if not suppress_plot: plt.show()
-     plt.close()
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-     return ax
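Taken together, the deleted functions formed a filename-driven pipeline: clean, compute summary statistics, fit the NB model, then select features. A hypothetical end-to-end run reconstructed from the signatures above (the import path matches the deleted file; filenames are placeholders, not documented API):

    from m3Drop.coreGPU import (
        ConvertDataSparseGPU, hidden_calc_valsGPU, NBumiFitModelGPU,
        NBumiFeatureSelectionCombinedDropGPU, NBumiCombinedDropVolcanoGPU,
    )

    ConvertDataSparseGPU("raw.h5ad", "cleaned.h5ad")    # drop empty genes, ceil counts
    stats = hidden_calc_valsGPU("cleaned.h5ad")         # tis/tjs/dis/djs/total
    fit = NBumiFitModelGPU("cleaned.h5ad", stats)       # NB dispersion fit
    hits = NBumiFeatureSelectionCombinedDropGPU(fit, "cleaned.h5ad", qval_thresh=0.05)
    NBumiCombinedDropVolcanoGPU(hits, plot_filename="volcano.png")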