M3Drop 0.4.42__py3-none-any.whl → 0.4.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- m3Drop/ControlDeviceCPU.py +218 -0
- m3Drop/ControlDeviceGPU.py +236 -0
- m3Drop/CoreCPU.py +508 -0
- m3Drop/CoreGPU.py +506 -0
- m3Drop/DiagnosticsCPU.py +401 -0
- m3Drop/DiagnosticsGPU.py +420 -0
- m3Drop/NormalizationCPU.py +199 -0
- m3Drop/{normalizationGPU.py → NormalizationGPU.py} +3 -5
- m3Drop/__init__.py +42 -51
- {m3drop-0.4.42.dist-info → m3drop-0.4.45.dist-info}/METADATA +4 -2
- m3drop-0.4.45.dist-info/RECORD +14 -0
- {m3drop-0.4.42.dist-info → m3drop-0.4.45.dist-info}/WHEEL +1 -1
- m3Drop/coreCPU.py +0 -477
- m3Drop/coreGPU.py +0 -591
- m3Drop/diagnosticsCPU.py +0 -391
- m3Drop/diagnosticsGPU.py +0 -481
- m3Drop/normalizationCPU.py +0 -146
- m3drop-0.4.42.dist-info/RECORD +0 -12
- {m3drop-0.4.42.dist-info → m3drop-0.4.45.dist-info}/licenses/LICENSE +0 -0
- {m3drop-0.4.42.dist-info → m3drop-0.4.45.dist-info}/top_level.txt +0 -0
m3Drop/coreGPU.py
DELETED
|
@@ -1,591 +0,0 @@
|
|
|
1
|
-
import time
|
|
2
|
-
import psutil
|
|
3
|
-
import h5py
|
|
4
|
-
import numpy as np
|
|
5
|
-
import anndata
|
|
6
|
-
import pandas as pd
|
|
7
|
-
import os
|
|
8
|
-
import scipy.sparse as sp
|
|
9
|
-
from scipy.sparse import csr_matrix as sp_csr_matrix
|
|
10
|
-
|
|
11
|
-
import statsmodels.api as sm
|
|
12
|
-
import matplotlib.pyplot as plt
|
|
13
|
-
from scipy.stats import norm
|
|
14
|
-
from statsmodels.stats.multitest import multipletests
|
|
15
|
-
|
|
16
|
-
# Safe Import for Local vs Supercomputer
|
|
17
|
-
try:
|
|
18
|
-
import cupy
|
|
19
|
-
import cupy.sparse as csp
|
|
20
|
-
from cupy.sparse import csr_matrix as cp_csr_matrix
|
|
21
|
-
HAS_GPU = True
|
|
22
|
-
except ImportError:
|
|
23
|
-
cupy = None
|
|
24
|
-
HAS_GPU = False
|
|
25
|
-
print(" [WARNING] CuPy not found. GPU acceleration disabled.")
|
|
26
|
-
|
|
27
|
-
# --- (PING & GOVERNOR PROTOCOL) ---
|
|
28
|
-
def get_optimal_chunk_size(filename: str, multiplier: float, is_dense: bool = False, override_cap: int = 50000) -> int:
    """
    AUTO-TUNER ENGINE (PING & GOVERNOR).

    Pick a per-chunk row count for streaming an .h5ad file.

    Sensors:
        1. Data weight   (exact bytes per row, read from the HDF5 'X' group)
        2. RAM pressure  (psutil available memory)
        3. VRAM pressure (cupy, when a GPU is present)
        4. Context       (SLURM job detection)

    Governor:
        - Cluster: maximize throughput (3k-row anti-stall floor, ignore CPU cache)
        - Local:   maximize responsiveness (target ~10MB chunks, protect L3 cache)

    Parameters
    ----------
    filename : str
        Path to an AnnData .h5ad file whose 'X' group holds the matrix.
    multiplier : float
        Safety factor for the intermediate copies the caller will make.
    is_dense : bool
        Treat rows as dense (n_genes * itemsize) instead of CSR-sparse.
    override_cap : int
        Hard upper bound on the returned chunk size, in rows.

    Returns
    -------
    int
        Number of rows to process per chunk (capped at n_cells).
    """

    # --- SENSOR A: DATA WEIGHT ---
    with h5py.File(filename, 'r') as f:
        x_group = f['X']
        shape = x_group.attrs['shape']
        n_cells, n_genes = shape[0], shape[1]

        # Detect exact byte size (4 for float32, 8 for float64)
        if 'data' in x_group:
            dtype_size = x_group['data'].dtype.itemsize
        else:
            dtype_size = 4  # Default safety

        # Calculate Load
        if is_dense:
            # Dense: Width * Bytes * Overhead
            bytes_per_row = n_genes * dtype_size * multiplier
        else:
            # Sparse: (Val + Col + Ptr) * Density
            if 'indptr' in x_group:
                nnz = x_group['indptr'][-1]
                density = nnz / (n_cells * n_genes)
            else:
                density = 0.1  # Safety default

            # Sparse Row = (Bytes_Data + 4_Index) * density * n_genes
            bytes_per_row = (n_genes * density * (dtype_size + 4)) * multiplier

    if bytes_per_row < 1:
        bytes_per_row = 1

    # --- SENSOR B: RAM CAPACITY ---
    avail_ram = psutil.virtual_memory().available
    limit_ram = int((avail_ram * 0.30) / bytes_per_row)  # Cap at 30% RAM

    # --- SENSOR C: VRAM CAPACITY ---
    limit_vram = float('inf')
    if HAS_GPU:
        try:
            mempool = cupy.get_default_memory_pool()
            mempool.free_all_blocks()
            free_vram = cupy.cuda.Device(0).mem_info[0]
            limit_vram = int((free_vram * 0.60) / bytes_per_row)  # Cap at 60% VRAM
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Failed probe -> limit stays inf.
            pass

    # --- SENSOR D: CONTEXT CHECK (SLURM) ---
    # This is the Ticket Stub check.
    is_cluster = "SLURM_JOB_ID" in os.environ

    # --- THE GOVERNOR ---

    if is_cluster and HAS_GPU:
        # SCENARIO 1: CLUSTER (Beast Mode)
        # Goal: Throughput. Ignore CPU Cache.
        optimal = min(limit_ram, limit_vram)

        # ANTI-STALL FLOOR: Force 3,000 rows minimum to overcome latency
        # (Lowered to 3,000 to prevent OOM on massive dense files)
        if optimal < 3000 and optimal > 100:
            optimal = 3000

        mode_msg = "CLUSTER (SLURM Detected)"

    else:
        # SCENARIO 2: LOCAL (Safe Harbor)
        # Goal: Responsiveness. Protect L3 Cache.

        # Sensor 4: CPU Cache Target (10MB)
        # 10MB fits in almost all L3 caches (preventing thrashing)
        target_10mb_rows = int(10_000_000 / bytes_per_row)

        optimal = min(limit_ram, limit_vram, target_10mb_rows)

        # ANTI-FREEZE FLOOR: Force 500 rows minimum
        if optimal < 500:
            optimal = 500

        mode_msg = "LOCAL (Safe Harbor)"

    # GLOBAL CAP (Transport Safety & Function Specific Override)
    if optimal > override_cap:
        optimal = override_cap

    # Cap at total file size
    if optimal > n_cells:
        optimal = n_cells

    # --- TELEMETRY OUTPUT ---
    print(f"\n------------------------------------------------------------")
    print(f" CHUNK SIZE OPTIMIZER (PING & GOVERNOR) [{time.strftime('%H:%M:%S')}]")
    print(f"------------------------------------------------------------")
    print(f" CONTEXT : {mode_msg}")
    print(f" DATA LOAD : {int(bytes_per_row):,} bytes/row (dtype={dtype_size})")
    print(f" MULTIPLIER : {multiplier}x")
    print(f" OVERRIDE CAP : {override_cap:,} rows")
    print(f" RAM LIMIT : {limit_ram:,} rows")
    if HAS_GPU:
        # BUGFIX: the old single f-string applied the ',' format spec to the
        # string 'N/A' (f"{'N/A':,}" raises ValueError), crashing the
        # telemetry whenever the VRAM probe above had failed.
        if limit_vram != float('inf'):
            print(f" VRAM LIMIT : {limit_vram:,} rows")
        else:
            print(f" VRAM LIMIT : N/A rows")
    else:
        print(f" VRAM LIMIT : N/A (No GPU)")
    print(f"------------------------------------------------------------")
    print(f" >> CHUNK SIZE : {int(optimal):,} rows")
    print(f"------------------------------------------------------------\n")

    return int(optimal)
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
def ConvertDataSparseGPU(
    input_filename: str,
    output_filename: str
):
    """
    GPU-ACCELERATED CLEANING.

    Streams a CSR .h5ad file, drops genes that are zero in every cell,
    ceils the surviving counts (stored as float32), and writes the filtered
    CSR matrix to ``output_filename``.

    Phase 1 (fast read) scans only the CSR index arrays in large chunks;
    Phase 2 (slow write) re-reads the data in small chunks and appends the
    filtered arrays to disk incrementally.
    """
    t0 = time.perf_counter()
    print(f"FUNCTION: ConvertDataSparseGPU() | FILE: {input_filename}")

    with h5py.File(input_filename, 'r') as f_in:
        x_in = f_in['X']
        n_cells, n_genes = x_in.attrs['shape']

        # --- GEAR 2: FAST READ (Phase 1) ---
        # Index-only scan, no writing: large (<=50k row) chunks are safe and
        # keep the PCIe bus saturated.
        scan_chunk = get_optimal_chunk_size(input_filename, multiplier=2.5, is_dense=False, override_cap=50000)

        print(f"Phase [1/2]: Identifying genes with non-zero counts... (Chunk: {scan_chunk})")

        # Per-gene "seen at least once" mask (GPU when available).
        if HAS_GPU:
            keep_mask = cupy.zeros(n_genes, dtype=bool)
        else:
            keep_mask = np.zeros(n_genes, dtype=bool)

        h5_indptr = x_in['indptr']
        h5_indices = x_in['indices']

        for row0 in range(0, n_cells, scan_chunk):
            row1 = min(row0 + scan_chunk, n_cells)
            print(f"Phase [1/2]: Processing: {row1} of {n_cells} cells.", end='\r')

            lo, hi = h5_indptr[row0], h5_indptr[row1]
            if lo == hi:
                continue  # all-empty slab of cells

            cols_cpu = h5_indices[lo:hi]

            if HAS_GPU:
                cols_dev = cupy.asarray(cols_cpu)
                seen_dev = cupy.unique(cols_dev)
                keep_mask[seen_dev] = True
                del cols_dev, seen_dev
                cupy.get_default_memory_pool().free_all_blocks()
            else:
                keep_mask[np.unique(cols_cpu)] = True

        keep_mask_cpu = cupy.asnumpy(keep_mask) if HAS_GPU else keep_mask

        n_kept = np.sum(keep_mask_cpu)
        print(f"\nPhase [1/2]: COMPLETE | Result: {n_kept} / {n_genes} genes retained.")

        # --- GEAR 1: FORKLIFT WRITE (Phase 2) ---
        # Disk-bound: throttle to 5,000-row chunks to be gentle on the drive.
        write_chunk = get_optimal_chunk_size(input_filename, multiplier=2.5, is_dense=False, override_cap=5000)

        print(f"Phase [2/2]: Rounding up decimals and saving filtered output to disk... (Chunk: {write_chunk})")
        adata_meta = anndata.read_h5ad(input_filename, backed='r')
        var_kept = adata_meta.var[keep_mask_cpu]

        # Write obs/var/uns first; the X group is rebuilt by hand below.
        template = anndata.AnnData(obs=adata_meta.obs, var=var_kept, uns=adata_meta.uns)
        template.write_h5ad(output_filename, compression="gzip")

        with h5py.File(output_filename, 'a') as f_out:
            if 'X' in f_out:
                del f_out['X']
            x_out = f_out.create_group('X')

            out_data = x_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
            out_indices = x_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
            out_indptr = x_out.create_dataset('indptr', shape=(n_cells + 1,), dtype='int64')
            out_indptr[0] = 0
            nnz_written = 0

            h5_data = x_in['data']

            for row0 in range(0, n_cells, write_chunk):
                row1 = min(row0 + write_chunk, n_cells)
                print(f"Phase [2/2]: Processing: {row1} of {n_cells} cells.", end='\r')

                lo, hi = h5_indptr[row0], h5_indptr[row1]

                vals = h5_data[lo:hi]
                cols = h5_indices[lo:hi]
                ptrs = h5_indptr[row0:row1 + 1] - h5_indptr[row0]

                block = sp_csr_matrix((vals, cols, ptrs), shape=(row1 - row0, n_genes))
                kept = block[:, keep_mask_cpu]
                kept.data = np.ceil(kept.data).astype('float32')

                # Append this slab's values/columns, then shift its indptr
                # by the running nnz offset.
                out_data.resize(nnz_written + kept.nnz, axis=0)
                out_data[nnz_written:] = kept.data

                out_indices.resize(nnz_written + kept.nnz, axis=0)
                out_indices[nnz_written:] = kept.indices

                shifted_ptrs = kept.indptr[1:].astype(np.int64) + nnz_written
                out_indptr[row0 + 1:row1 + 1] = shifted_ptrs

                nnz_written += kept.nnz

            x_out.attrs['encoding-type'] = 'csr_matrix'
            x_out.attrs['encoding-version'] = '0.1.0'
            x_out.attrs['shape'] = np.array([n_cells, n_kept], dtype='int64')
            print(f"\nPhase [2/2]: COMPLETE | Output: {output_filename} {' ' * 50}")

    t1 = time.perf_counter()
    print(f"Total time: {t1 - t0:.2f} seconds.\n")
|
|
258
|
-
|
|
259
|
-
def hidden_calc_valsGPU(filename: str) -> dict:
    """
    Calculate key statistics using a memory-safe, GPU-accelerated algorithm.

    Streams the CSR matrix in ``filename`` chunk by chunk and accumulates:
        tis   : per-cell total counts (int64, indexed by obs names)
        tjs   : per-gene total counts (float32 sums, indexed by var names)
        dis   : per-cell dropout counts (ng - non-zeros in the cell)
        djs   : per-gene dropout counts (nc - non-zeros in the gene)
        total : grand total of all counts
        nc/ng : number of cells / genes

    NOTE(review): unlike ConvertDataSparseGPU, this function uses cupy
    unconditionally (no HAS_GPU fallback), so it requires a working GPU
    stack — confirm before calling on a CPU-only host.
    """
    start_time = time.perf_counter()
    # BUGFIX: previously printed the literal "(unknown)" for the file,
    # unlike every sibling function's "FUNCTION ... | FILE: {...}" header.
    print(f"FUNCTION: hidden_calc_vals() | FILE: {filename}")

    # GEAR 3: CRUISER MODE (Transport Bound)
    # Simple math. Maximize throughput with 50k cap.
    chunk_size = get_optimal_chunk_size(filename, multiplier=3.0, is_dense=False, override_cap=50000)

    adata_meta = anndata.read_h5ad(filename, backed='r')
    print("Phase [1/3]: Finding nc and ng...")
    nc, ng = adata_meta.shape
    print(f"Phase [1/3]: COMPLETE")

    tis = np.zeros(nc, dtype='int64')
    cell_non_zeros = np.zeros(nc, dtype='int64')
    tjs_gpu = cupy.zeros(ng, dtype=cupy.float32)
    gene_non_zeros_gpu = cupy.zeros(ng, dtype=cupy.int32)

    print("Phase [2/3]: Calculating tis and tjs...")
    with h5py.File(filename, 'r') as f_in:
        x_group = f_in['X']
        h5_indptr = x_group['indptr']
        h5_data = x_group['data']
        h5_indices = x_group['indices']

        for i in range(0, nc, chunk_size):
            end_row = min(i + chunk_size, nc)
            print(f"Phase [2/3]: Processing: {end_row} of {nc} cells.", end='\r')

            start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
            data_slice = h5_data[start_idx:end_idx]
            indices_slice = h5_indices[start_idx:end_idx]
            indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]

            data_gpu = cupy.asarray(data_slice.copy(), dtype=cupy.float32)
            indices_gpu = cupy.asarray(indices_slice.copy())
            indptr_gpu = cupy.asarray(indptr_slice.copy())

            chunk_gpu = cp_csr_matrix((data_gpu, indices_gpu, indptr_gpu), shape=(end_row-i, ng))

            # Per-cell totals and non-zero counts for this chunk.
            tis[i:end_row] = chunk_gpu.sum(axis=1).get().flatten()
            cell_non_zeros_chunk = cupy.diff(indptr_gpu)
            cell_non_zeros[i:end_row] = cell_non_zeros_chunk.get()

            # Scatter-add per-gene totals and per-gene non-zero counts.
            cupy.add.at(tjs_gpu, indices_gpu, data_gpu)
            unique_indices_gpu, counts_gpu = cupy.unique(indices_gpu, return_counts=True)
            cupy.add.at(gene_non_zeros_gpu, unique_indices_gpu, counts_gpu)

            del data_gpu, indices_gpu, indptr_gpu, chunk_gpu
            cupy.get_default_memory_pool().free_all_blocks()

    tjs = cupy.asnumpy(tjs_gpu)
    gene_non_zeros = cupy.asnumpy(gene_non_zeros_gpu)
    print(f"Phase [2/3]: COMPLETE{' ' * 50}")

    print("Phase [3/3]: Calculating dis, djs, and total...")
    dis = ng - cell_non_zeros   # zeros per cell
    djs = nc - gene_non_zeros   # zeros per gene
    total = tjs.sum()
    print("Phase [3/3]: COMPLETE")

    end_time = time.perf_counter()
    print(f"Total time: {end_time - start_time:.2f} seconds.\n")

    return {
        "tis": pd.Series(tis, index=adata_meta.obs.index),
        "tjs": pd.Series(tjs, index=adata_meta.var.index),
        "dis": pd.Series(dis, index=adata_meta.obs.index),
        "djs": pd.Series(djs, index=adata_meta.var.index),
        "total": total,
        "nc": nc,
        "ng": ng
    }
|
|
333
|
-
|
|
334
|
-
def NBumiFitModelGPU(cleaned_filename: str, stats: dict) -> dict:
    """
    Fit the NBumi depth-adjusted negative binomial model on the GPU.

    Combines the precomputed statistics (``stats`` from hidden_calc_valsGPU)
    with one streaming pass over the data to estimate, per gene, the
    observed variance and the NB size (dispersion) parameter.

    Returns a dict with 'var_obs' and 'sizes' (pd.Series indexed like
    stats['tjs']) plus 'vals' (the input ``stats``, passed through).
    """
    t_start = time.perf_counter()
    print(f"FUNCTION: NBumiFitModel() | FILE: {cleaned_filename}")

    # GEAR 2: HEAVY LIFT MODE (Memory Bound)
    # High multiplier (12.0) accounts for heavy intermediates (x, x^2, mean);
    # the chunk size scales with available VRAM up to the 50k cap.
    chunk_rows = get_optimal_chunk_size(cleaned_filename, multiplier=12.0, is_dense=False, override_cap=50000)

    tjs_host = stats['tjs'].values
    tis_host = stats['tis'].values
    nc, ng = stats['nc'], stats['ng']
    total = stats['total']

    tjs_dev = cupy.asarray(tjs_host, dtype=cupy.float64)
    tis_dev = cupy.asarray(tis_host, dtype=cupy.float64)

    sum_x_sq_dev = cupy.zeros(ng, dtype=cupy.float64)
    sum_2xmu_dev = cupy.zeros(ng, dtype=cupy.float64)

    print("Phase [1/3]: Pre-calculating sum of squared expectations...")
    sum_tis_sq_dev = cupy.sum(tis_dev**2)
    sum_mu_sq_dev = (tjs_dev**2 / total**2) * sum_tis_sq_dev
    print("Phase [1/3]: COMPLETE")

    print("Phase [2/3]: Calculating variance components from data chunks...")
    with h5py.File(cleaned_filename, 'r') as handle:
        x_grp = handle['X']
        ptr_ds = x_grp['indptr']
        val_ds = x_grp['data']
        col_ds = x_grp['indices']

        for lo_row in range(0, nc, chunk_rows):
            hi_row = min(lo_row + chunk_rows, nc)
            print(f"Phase [2/3]: Processing: {hi_row} of {nc} cells.", end='\r')

            lo_ptr, hi_ptr = ptr_ds[lo_row], ptr_ds[hi_row]
            if lo_ptr == hi_ptr:
                continue  # empty slab

            vals_dev = cupy.asarray(val_ds[lo_ptr:hi_ptr], dtype=cupy.float64)
            cols_dev = cupy.asarray(col_ds[lo_ptr:hi_ptr])
            ptrs_dev = cupy.asarray(ptr_ds[lo_row:hi_row+1] - ptr_ds[lo_row])

            # Sum of x^2, accumulated per gene.
            cupy.add.at(sum_x_sq_dev, cols_dev, vals_dev**2)

            # Recover the owning cell for every stored value: mark each
            # row's first nnz slot, then cumulative-sum the markers.
            nnz_here = ptrs_dev[-1].item()
            row_markers = cupy.zeros(nnz_here, dtype=cupy.int32)
            if len(ptrs_dev) > 1:
                row_markers[ptrs_dev[:-1]] = 1
            row_of_nnz = (cupy.cumsum(row_markers, axis=0) - 1) + lo_row

            # Cross term 2*x*mu with mu_ij = tjs_j * tis_i / total.
            tis_at_nnz = tis_dev[row_of_nnz]
            tjs_at_nnz = tjs_dev[cols_dev]
            cross_terms = 2 * vals_dev * tjs_at_nnz * tis_at_nnz / total
            cupy.add.at(sum_2xmu_dev, cols_dev, cross_terms)

            del vals_dev, cols_dev, ptrs_dev, row_of_nnz
            del tis_at_nnz, tjs_at_nnz, cross_terms
            cupy.get_default_memory_pool().free_all_blocks()

    print(f"Phase [2/3]: COMPLETE {' ' * 50}")

    print("Phase [3/3]: Finalizing dispersion and variance calculations...")
    sum_sq_dev_dev = sum_x_sq_dev - sum_2xmu_dev + sum_mu_sq_dev
    var_obs_dev = sum_sq_dev_dev / (nc - 1)

    # Method-of-moments size estimate; genes with an unstable (tiny or
    # negative) denominator fall back to the effectively-Poisson 10000.0.
    sizes_dev = cupy.full(ng, 10000.0)
    numer_dev = (tjs_dev**2 / total**2) * sum_tis_sq_dev
    denom_dev = sum_sq_dev_dev - tjs_dev
    stable = denom_dev > 1e-6
    sizes_dev[stable] = numer_dev[stable] / denom_dev[stable]
    sizes_dev[sizes_dev <= 0] = 10000.0

    var_obs_host = var_obs_dev.get()
    sizes_host = sizes_dev.get()
    print("Phase [3/3]: COMPLETE")

    t_end = time.perf_counter()
    print(f"Total time: {t_end - t_start:.2f} seconds.\n")

    return {
        'var_obs': pd.Series(var_obs_host, index=stats['tjs'].index),
        'sizes': pd.Series(sizes_host, index=stats['tjs'].index),
        'vals': stats
    }
|
|
420
|
-
|
|
421
|
-
def NBumiFitDispVsMeanGPU(fit, suppress_plot=True):
    """
    Regress log(NB size) on log(mean expression) across genes.

    Fits an OLS line to the dispersion-vs-mean relationship of the NBumi
    fit, optionally plotting the scatter with the fitted line.

    Parameters
    ----------
    fit : dict
        Output of NBumiFitModelGPU() ('sizes' series plus 'vals' stats).
    suppress_plot : bool
        When False, show the scatter plot with the regression line.

    Returns
    -------
    The OLS parameter vector [intercept, slope].
    """
    vals = fit['vals']
    size_g = fit['sizes'].values
    tjs = vals['tjs'].values
    mean_expression = tjs / vals['nc']

    # Restrict the regression to well-behaved genes.
    forfit = (np.isfinite(size_g)) & (size_g < 1e6) & (mean_expression > 1e-3) & (size_g > 0)

    # BUGFIX: np.log2(..., where=...) without `out=` leaves the masked-out
    # entries uninitialized (arbitrary memory), and those garbage values
    # previously fed the `higher > 4` comparison below. Pre-fill with -inf
    # so zero-mean genes can never be classified as highly expressed.
    log2_mean_expr = np.full_like(mean_expression, -np.inf, dtype=float)
    np.log2(mean_expression, out=log2_mean_expr, where=(mean_expression > 0))

    # Prefer highly expressed genes when there are enough of them.
    higher = log2_mean_expr > 4
    if np.sum(higher & forfit) > 2000:
        forfit = higher & forfit

    y = np.log(size_g[forfit])
    x = np.log(mean_expression[forfit])

    X = sm.add_constant(x)
    model = sm.OLS(y, X).fit()

    if not suppress_plot:
        plt.figure(figsize=(7, 6))
        plt.scatter(x, y, alpha=0.5)
        plt.plot(x, model.fittedvalues, color='red')
        plt.show()

    return model.params
|
|
446
|
-
|
|
447
|
-
def NBumiFeatureSelectionHighVarGPU(fit: dict) -> pd.DataFrame:
    """
    Rank genes by their dispersion residual from the fitted mean trend.

    For each gene, computes log(observed size) minus log(size predicted by
    the dispersion-vs-mean regression) and returns a DataFrame with columns
    ['Gene', 'Residual'], sorted ascending (most over-dispersed first).
    """
    t0 = time.perf_counter()
    print(f"FUNCTION: NBumiFeatureSelectionHighVar()")

    print("Phase [1/1]: Calculating residuals...")
    vals = fit['vals']
    trend = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
    mean_expr = vals['tjs'].values / vals['nc']

    with np.errstate(divide='ignore', invalid='ignore'):
        log_mean = np.log(mean_expr)
        # Zero-mean genes produce -inf; clamp those to 0 before predicting.
        log_mean[np.isneginf(log_mean)] = 0
        predicted_size = np.exp(trend[0] + trend[1] * log_mean)
        residual = np.log(fit['sizes'].values) - np.log(predicted_size)

    table = pd.DataFrame({'Gene': fit['sizes'].index, 'Residual': residual})
    table = table.sort_values(by='Residual', ascending=True)
    print("Phase [1/1]: COMPLETE")
    t1 = time.perf_counter()
    print(f"Total time: {t1 - t0:.4f} seconds.\n")

    return table
|
|
469
|
-
|
|
470
|
-
def NBumiFeatureSelectionCombinedDropGPU(fit: dict, cleaned_filename: str, method="fdr_bh", qval_thresh=0.05) -> pd.DataFrame:
    """
    Dropout-based feature selection (GPU).

    Compares each gene's observed dropout rate with the rate expected under
    the fitted NB model, runs a one-sided Z test per gene, adjusts p-values
    with ``method``, and returns the significant genes (q < ``qval_thresh``)
    as a DataFrame ['Gene', 'effect_size', 'p.value', 'q.value'].
    """
    t0 = time.perf_counter()
    print(f"FUNCTION: NBumiFeatureSelectionCombinedDrop() | FILE: {cleaned_filename}")

    # GEAR 4: DENSE MATH MODE (Memory Critical)
    # 20x multiplier: float64 promotion, dense (genes x chunk) broadcasts,
    # and ~5 simultaneous copies (mu, exp_size, p_is, p_var, temp).
    chunk_cells = get_optimal_chunk_size(cleaned_filename, multiplier=20.0, is_dense=True, override_cap=20000)

    print("Phase [1/3]: Initializing arrays...")
    vals = fit['vals']
    coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)

    tjs_dev = cupy.asarray(vals['tjs'].values)
    tis_dev = cupy.asarray(vals['tis'].values)
    total = vals['total']
    nc = vals['nc']
    ng = vals['ng']

    mean_expr_host = vals['tjs'].values / nc
    with np.errstate(divide='ignore'):
        size_pred_host = np.exp(coeffs[0] + coeffs[1] * np.log(mean_expr_host))
    size_pred_dev = cupy.asarray(size_pred_host)

    drop_sum_dev = cupy.zeros(ng, dtype=cupy.float64)
    drop_var_sum_dev = cupy.zeros(ng, dtype=cupy.float64)
    print("Phase [1/3]: COMPLETE")

    print("Phase [2/3]: Calculating expected dropout sums...")
    for lo in range(0, nc, chunk_cells):
        hi = min(lo + chunk_cells, nc)
        print(f"Phase [2/3]: Processing: {hi} of {nc} cells.", end='\r')

        tis_block = tis_dev[lo:hi]
        # Memory intense: dense (genes x cells-in-chunk) float64 matrix.
        mu_block = tjs_dev[:, cupy.newaxis] * tis_block[cupy.newaxis, :] / total

        # Safety clamping (ported from the CPU version) so the NB dropout
        # probability never produces NaN/Inf.
        base = 1 + mu_block / size_pred_dev[:, cupy.newaxis]
        base = cupy.maximum(base, 1e-12)

        p_block = cupy.power(base, -size_pred_dev[:, cupy.newaxis])
        p_block = cupy.nan_to_num(p_block, nan=0.0, posinf=1.0, neginf=0.0)

        drop_sum_dev += p_block.sum(axis=1)

        # Bernoulli variance of each dropout probability.
        var_block = p_block * (1 - p_block)
        drop_var_sum_dev += var_block.sum(axis=1)

        # Aggressive cleanup before the next dense allocation.
        del mu_block, p_block, var_block, tis_block
        cupy.get_default_memory_pool().free_all_blocks()

    print(f"Phase [2/3]: COMPLETE {' ' * 50}")

    print("Phase [3/3]: Statistical testing...")
    p_sum = drop_sum_dev.get()
    p_var_sum = drop_var_sum_dev.get()

    droprate_exp = p_sum / nc
    droprate_exp_err = np.sqrt(p_var_sum / (nc**2))
    droprate_obs = vals['djs'].values / nc

    diff = droprate_obs - droprate_exp
    combined_err = np.sqrt(droprate_exp_err**2 + (droprate_obs * (1 - droprate_obs) / nc))

    with np.errstate(divide='ignore', invalid='ignore'):
        Zed = diff / combined_err

    pvalue = norm.sf(Zed)
    out = pd.DataFrame({'Gene': vals['tjs'].index, 'p.value': pvalue, 'effect_size': diff})
    out = out.sort_values(by='p.value')
    out['q.value'] = multipletests(out['p.value'].fillna(1), method=method)[1]
    sig = out[out['q.value'] < qval_thresh]
    print("Phase [3/3]: COMPLETE")

    t1 = time.perf_counter()
    print(f"Total time: {t1 - t0:.2f} seconds.\n")
    return sig[['Gene', 'effect_size', 'p.value', 'q.value']]
|
|
556
|
-
|
|
557
|
-
def NBumiCombinedDropVolcanoGPU(results_df, qval_thresh=0.05, effect_size_thresh=0.25, top_n_genes=10, suppress_plot=False, plot_filename=None):
    """
    Volcano plot of dropout-based feature-selection results.

    Colors significant genes red (effect above +threshold) or blue (below
    -threshold), labels the ``top_n_genes`` smallest q-values, optionally
    saves the figure to ``plot_filename``, and returns the matplotlib Axes.
    """
    t0 = time.perf_counter()
    print(f"FUNCTION: NBumiCombinedDropVolcano()")

    plot_df = results_df.copy()

    # Replace exact-zero q-values with the smallest non-zero one so that
    # -log10 stays finite.
    floor_q = plot_df[plot_df['q.value'] > 0]['q.value'].min()
    plot_df['q.value'] = plot_df['q.value'].replace(0, floor_q)
    plot_df['-log10_qval'] = -np.log10(plot_df['q.value'])

    plot_df['color'] = 'grey'
    is_sig = plot_df['q.value'] < qval_thresh
    plot_df.loc[is_sig & (plot_df['effect_size'] > effect_size_thresh), 'color'] = 'red'
    plot_df.loc[is_sig & (plot_df['effect_size'] < -effect_size_thresh), 'color'] = 'blue'

    plt.figure(figsize=(10, 8))
    plt.scatter(plot_df['effect_size'], plot_df['-log10_qval'], c=plot_df['color'], s=10, alpha=0.6)
    plt.axvline(x=effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
    plt.axvline(x=-effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
    plt.axhline(y=-np.log10(qval_thresh), linestyle='--', color='grey', linewidth=0.8)

    # Annotate the strongest hits.
    for _, hit in plot_df.nsmallest(top_n_genes, 'q.value').iterrows():
        plt.text(hit['effect_size'], hit['-log10_qval'], hit['Gene'], fontsize=9)

    plt.title('Volcano Plot of Dropout Feature Selection')
    plt.xlabel('Effect Size (Observed - Expected Dropout Rate)')
    plt.ylabel('-log10 (Adjusted p-value)')
    plt.grid(True, linestyle='--', alpha=0.3)
    ax = plt.gca()

    if plot_filename:
        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
    if not suppress_plot:
        plt.show()
    plt.close()

    t1 = time.perf_counter()
    print(f"Total time: {t1 - t0:.2f} seconds.\n")
    return ax
|