M3Drop 0.4.42-py3-none-any.whl → 0.4.45-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,420 @@
+ import numpy as np
+ import pandas as pd
+ import cupy as cp
+ import cupyx.scipy.sparse as csp
+ import matplotlib.pyplot as plt
+ import h5py
+ import os
+ import time
+ import pickle
+ import psutil
+ import gc
+ from scipy import sparse
+ from scipy import stats
+ import anndata
+
+ from .ControlDeviceGPU import ControlDevice
+ from .CoreGPU import (
+     hidden_calc_valsGPU,
+     NBumiFitModelGPU,
+     NBumiFitDispVsMeanGPU,
+     dropout_prob_kernel
+ )
+
+ # cupy.sparse is a deprecated alias (removed in recent CuPy); import from cupyx instead
+ from cupyx.scipy.sparse import csr_matrix as cp_csr_matrix
+ import scipy.sparse as sp
+ from scipy.sparse import csr_matrix as sp_csr_matrix
+
+ import statsmodels.api as sm
+ from scipy.stats import norm
+ from statsmodels.stats.multitest import multipletests
+
+ # ==========================================
+ # DIAGNOSTICS & COMPARISON
+ # ==========================================
+
+ def NBumiFitBasicModelGPU(
+     filename: str,
+     stats: dict,
+     mask_filename: str = None,
+     mode: str = "auto",
+     manual_target: int = 3000,
+     phase_label: str = "Phase [1/1]",
+     desc_label: str = None  # [UI FIX] Added for delayed printing
+ ) -> dict:
+     """
+     Fits the Basic Model by calculating the normalized variance on the fly.
+     Strictly enforces float64 throughout.
+     """
+     # 1. Get raw dimensions & set up the ControlDevice
+     with h5py.File(filename, 'r') as f:
+         indptr_cpu = f['X']['indptr'][:]
+         total_rows = len(indptr_cpu) - 1
+         raw_ng = f['X'].attrs['shape'][1]
+
+     device = ControlDevice(
+         indptr=indptr_cpu,
+         total_rows=total_rows,
+         n_genes=raw_ng,
+         mode=mode,
+         manual_target=manual_target
+     )
+     nc = device.total_rows
+
+     # [UI FIX] Print description AFTER the ControlDevice box
+     if desc_label:
+         print(f"{phase_label}: {desc_label}")
+
+     # 2. Load the gene mask
+     if mask_filename and os.path.exists(mask_filename):
+         with open(mask_filename, 'rb') as f:
+             mask_cpu = pickle.load(f)
+     else:
+         mask_cpu = np.ones(raw_ng, dtype=bool)
+
+     filtered_ng = int(np.sum(mask_cpu))
+
+     # 3. Pre-calculate size factors
+     cell_sums = stats['tis'].values
+     median_sum = np.median(cell_sums[cell_sums > 0])
+     # [FLOAT64] Explicitly use float64 for size factors
+     size_factors = np.ones_like(cell_sums, dtype=np.float64)
+     non_zero_mask = cell_sums > 0
+     size_factors[non_zero_mask] = cell_sums[non_zero_mask] / median_sum
+
+     # 4. Initialize GPU accumulators
+     sum_norm_x_gpu = cp.zeros(filtered_ng, dtype=cp.float64)
+     sum_norm_sq_gpu = cp.zeros(filtered_ng, dtype=cp.float64)
+
+     with h5py.File(filename, 'r') as f_in:
+         h5_indptr = f_in['X']['indptr']
+         h5_data = f_in['X']['data']
+         h5_indices = f_in['X']['indices']
+
+         current_row = 0
+         while current_row < nc:
+             end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=1.5)
+             if end_row is None or end_row <= current_row:
+                 break
+
+             chunk_size = end_row - current_row
+             # [UI] Phase-aware progress bar
+             print(f"{phase_label}: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+             start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+             if start_idx == end_idx:
+                 current_row = end_row
+                 continue
+
+             # [FLOAT64] Load the raw chunk as float64
+             data_gpu = cp.asarray(h5_data[start_idx:end_idx], dtype=cp.float64)
+             indices_gpu = cp.asarray(h5_indices[start_idx:end_idx])
+             indptr_gpu = cp.asarray(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
+
+             # Reconstruct CSR & filter columns by the mask
+             raw_chunk = cp_csr_matrix((data_gpu, indices_gpu, indptr_gpu), shape=(chunk_size, raw_ng))
+             mask_gpu = cp.asarray(mask_cpu)
+             filtered_chunk = raw_chunk[:, mask_gpu]
+
+             # Fused normalization
+             # [FLOAT64] Size factors are already float64
+             sf_chunk = cp.asarray(size_factors[current_row:end_row], dtype=cp.float64)
+             recip_sf = 1.0 / sf_chunk
+             D = csp.diags(recip_sf)
+             norm_chunk = D.dot(filtered_chunk)
+             norm_chunk.data = cp.round(norm_chunk.data)
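+             # diags(1/sf) @ X rescales row i by 1/sf[i] without densifying.
+             # The same trick on a tiny SciPy matrix (illustrative only):
+             #
+             #     X = sp_csr_matrix(np.array([[2.0, 0.0], [0.0, 4.0]]))
+             #     sparse.diags([0.5, 0.25]).dot(X).toarray()
+             #     # -> [[1., 0.], [0., 1.]]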
+
+             # Accumulate per-gene sums and sums of squares
+             sum_norm_x_gpu += norm_chunk.sum(axis=0).ravel()
+             norm_chunk.data **= 2
+             sum_norm_sq_gpu += norm_chunk.sum(axis=0).ravel()
+
+             del data_gpu, indices_gpu, raw_chunk, filtered_chunk, norm_chunk, D, sf_chunk, mask_gpu
+             cp.get_default_memory_pool().free_all_blocks()
+             current_row = end_row
+
+     # Final calculations
+     mean_norm_gpu = sum_norm_x_gpu / nc
+     mean_sq_norm_gpu = sum_norm_sq_gpu / nc
+     var_norm_gpu = mean_sq_norm_gpu - (mean_norm_gpu ** 2)
+
+     denom_gpu = var_norm_gpu - mean_norm_gpu
+     size_gpu = cp.full(filtered_ng, 1000.0, dtype=cp.float64)
+     valid_mask = denom_gpu > 1e-6
+     size_gpu[valid_mask] = mean_norm_gpu[valid_mask]**2 / denom_gpu[valid_mask]
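+     # Method-of-moments: for a negative binomial, Var = mu + mu^2/size,
+     # so size = mu^2 / (Var - mu). Genes whose variance does not exceed
+     # their mean (denom <= 1e-6) keep the 1000.0 fallback and are cleaned
+     # up by the clamping below.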
+
+     max_size_val = cp.nanmax(size_gpu[size_gpu < 1e6]) * 10
+     if cp.isnan(max_size_val) or max_size_val == 0:
+         max_size_val = 1000.0
+     size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
+     size_gpu[size_gpu < 1e-10] = 1e-10
+
+     # [UI] Clean completion - force a newline
+     print("")
+     print(f"{phase_label}: COMPLETE")
+
+     return {
+         'var_obs': pd.Series(var_norm_gpu.get(), index=stats['tjs'].index),
+         'sizes': pd.Series(size_gpu.get(), index=stats['tjs'].index),
+         'vals': stats
+     }
+
+ def NBumiCheckFitFSGPU(
+     filename: str,
+     fit: dict,
+     mode: str = "auto",
+     manual_target: int = 3000,
+     suppress_plot=False,
+     plot_filename=None,
+     phase_label="Phase [1/1]",
+     desc_label: str = None  # [UI FIX] Added for delayed printing
+ ) -> dict:
+     """
+     Calculates expected dropouts. Handles real and virtual populations.
+     Uses a fused kernel to prevent OOM on large chunks.
+     """
+     vals = fit['vals']
+     ng = vals['ng']
+
+     with h5py.File(filename, 'r') as f:
+         indptr_cpu = f['X']['indptr'][:]
+         total_rows = len(indptr_cpu) - 1
+
+     device = ControlDevice(
+         indptr=indptr_cpu,
+         total_rows=total_rows,
+         n_genes=ng,
+         mode=mode,
+         manual_target=manual_target
+     )
+     nc = device.total_rows
+
+     # [UI FIX] Print description AFTER the ControlDevice box
+     if desc_label:
+         print(f"{phase_label}: {desc_label}")
+
+     size_coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
+
+     tjs_gpu = cp.asarray(vals['tjs'].values, dtype=cp.float64)
+     tis_gpu = cp.asarray(vals['tis'].values, dtype=cp.float64)
+     total = vals['total']
+
+     mean_expression_gpu = tjs_gpu / nc
+     log_mean_expression_gpu = cp.zeros_like(mean_expression_gpu)
+     valid_means = mean_expression_gpu > 0
+     log_mean_expression_gpu[valid_means] = cp.log(mean_expression_gpu[valid_means])
+     smoothed_size_gpu = cp.exp(size_coeffs[0] + size_coeffs[1] * log_mean_expression_gpu)
+
+     row_ps_gpu = cp.zeros(ng, dtype=cp.float64)
+     col_ps_gpu = cp.zeros(nc, dtype=cp.float64)
+
+     current_row = 0
+     while current_row < nc:
+         # [FIX] Keep overhead low (1.1) because we are using the fused kernel
+         end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=1.1)
+         if end_row is None or end_row <= current_row:
+             break
+
+         chunk_size = end_row - current_row
+
+         # [UI] Phase-aware progress bar
+         print(f"{phase_label}: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+         tis_chunk_gpu = tis_gpu[current_row:end_row]
+
+         # [CRITICAL] Fused kernel preserved (supercomputer fix)
+         # Explicit float64 for the output buffer
+         p_is_chunk_gpu = cp.empty((chunk_size, ng), dtype=cp.float64)
+
+         dropout_prob_kernel(
+             tjs_gpu,                 # Gene totals
+             tis_chunk_gpu[:, None],  # Cell totals
+             total,                   # Grand total
+             smoothed_size_gpu,       # Expected size
+             p_is_chunk_gpu           # Output
+         )
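+         # Shapes: tjs_gpu is (ng,) and tis_chunk_gpu[:, None] is (chunk_size, 1),
+         # so broadcasting expands the computation to the (chunk_size, ng)
+         # output buffer without allocating mu separately.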
+
+         p_is_chunk_gpu = cp.nan_to_num(p_is_chunk_gpu, nan=0.0, posinf=1.0, neginf=0.0)
+
+         row_ps_gpu += p_is_chunk_gpu.sum(axis=0)
+         col_ps_gpu[current_row:end_row] = p_is_chunk_gpu.sum(axis=1)
+
+         del p_is_chunk_gpu, tis_chunk_gpu
+         cp.get_default_memory_pool().free_all_blocks()
+         current_row = end_row
+
+     # [UI] Clean completion - force a newline
+     print("")
+     print(f"{phase_label}: COMPLETE")
+
+     row_ps_cpu = row_ps_gpu.get()
+     col_ps_cpu = col_ps_gpu.get()
+
+     return {
+         'rowPs': pd.Series(row_ps_cpu, index=fit['vals']['tjs'].index),
+         'colPs': pd.Series(col_ps_cpu, index=fit['vals']['tis'].index)
+     }
+
+ def NBumiCompareModelsGPU(
+     raw_filename: str,
+     stats: dict,
+     fit_adjust: dict,
+     mask_filename: str = None,
+     mode: str = "auto",
+     manual_target: int = 3000,
+     suppress_plot=False,
+     plot_filename=None
+ ) -> dict:
+     """
+     Orchestrates the comparison pipeline with a standardized UI.
+     """
+     print("FUNCTION: NBumiCompareModelsGPU()")
+     pipeline_start_time = time.time()
+
+     # STEP 1: Fit the Basic Model
+     # [UI FIX] Removed the early print; passed as desc_label instead
+     fit_basic = NBumiFitBasicModelGPU(
+         raw_filename,
+         stats,
+         mask_filename=mask_filename,
+         mode=mode,
+         manual_target=manual_target,
+         phase_label="Phase [1/3]",
+         desc_label="Fitting Basic Model (Virtual)..."
+     )
+
+     # STEP 2: Depth-adjusted dropout
+     # [UI FIX] Removed the early print; passed as desc_label instead
+     check_adjust = NBumiCheckFitFSGPU(
+         raw_filename,
+         fit_adjust,
+         mode=mode,
+         manual_target=manual_target,
+         suppress_plot=True,
+         phase_label="Phase [2/3]",
+         desc_label="Calculating Depth-Adjusted Dropouts..."
+     )
+
+     # STEP 3: Basic dropout on a virtual population
+     # [UI FIX] Removed the early print; passed as desc_label instead
+     stats_virtual = stats.copy()
+     mean_depth = stats['total'] / stats['nc']
+     stats_virtual['tis'] = pd.Series(
+         np.full(stats['nc'], mean_depth),
+         index=stats['tis'].index
+     )
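+     # The "virtual" population gives every cell the same total (the mean
+     # depth), so the Basic model is evaluated with no per-cell depth effect.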
+
+     fit_basic_for_eval = {
+         'sizes': fit_basic['sizes'],
+         'vals': stats_virtual,
+         'var_obs': fit_basic['var_obs']
+     }
+
+     check_basic = NBumiCheckFitFSGPU(
+         raw_filename,
+         fit_basic_for_eval,
+         mode=mode,
+         manual_target=manual_target,
+         suppress_plot=True,
+         phase_label="Phase [3/3]",
+         desc_label="Calculating Basic Dropouts..."
+     )
+
+     # Calculation & plotting
+     nc_data = stats['nc']
+     mean_expr = stats['tjs'] / nc_data
+     observed_dropout = stats['djs'] / nc_data
+
+     adj_dropout_fit = check_adjust['rowPs'] / nc_data
+     bas_dropout_fit = check_basic['rowPs'] / nc_data
+
+     err_adj = np.sum(np.abs(adj_dropout_fit - observed_dropout))
+     err_bas = np.sum(np.abs(bas_dropout_fit - observed_dropout))
+
+     comparison_df = pd.DataFrame({
+         'mean_expr': mean_expr,
+         'observed': observed_dropout,
+         'adj_fit': adj_dropout_fit,
+         'bas_fit': bas_dropout_fit
+     })
+
+     # Plotting logic
+     plt.figure(figsize=(10, 6))
+     sorted_idx = np.argsort(mean_expr.values)
+     plot_idx = sorted_idx[::2] if len(mean_expr) > 20000 else sorted_idx
+
+     plt.scatter(mean_expr.iloc[plot_idx], observed_dropout.iloc[plot_idx],
+                 c='black', s=3, alpha=0.5, label='Observed')
+
+     plt.scatter(mean_expr.iloc[plot_idx], bas_dropout_fit.iloc[plot_idx],
+                 c='purple', s=3, alpha=0.6, label=f'Basic Fit (Error: {err_bas:.2f})')
+
+     plt.scatter(mean_expr.iloc[plot_idx], adj_dropout_fit.iloc[plot_idx],
+                 c='goldenrod', s=3, alpha=0.7, label=f'Depth-Adjusted Fit (Error: {err_adj:.2f})')
+
+     plt.xscale('log')
+     plt.xlabel("Mean Expression")
+     plt.ylabel("Dropout Rate")
+     plt.title("M3Drop Model Comparison")
+     plt.legend()
+     plt.grid(True, linestyle='--', alpha=0.3)
+
+     if plot_filename:
+         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+         print(f"Saving plot to: {plot_filename}")
+
+     if not suppress_plot:
+         plt.show()
+
+     plt.close()
+
+     pipeline_end_time = time.time()
+     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
+
+     return {
+         "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
+         "comparison_df": comparison_df
+     }
+
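+ # Minimal usage sketch (hypothetical file names; `stats` is assumed to come
+ # from hidden_calc_valsGPU and `fit_adjust` from NBumiFitModelGPU):
+ #
+ #     result = NBumiCompareModelsGPU(
+ #         "counts.h5ad", stats, fit_adjust,
+ #         mask_filename="gene_mask.pkl",
+ #         plot_filename="model_comparison.png",
+ #     )
+ #     print(result["errors"])
+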
+ def NBumiPlotDispVsMeanGPU(
+     fit: dict,
+     suppress_plot: bool = False,
+     plot_filename: str = None
+ ):
+     print("FUNCTION: NBumiPlotDispVsMeanGPU()")
+     start_time = time.time()
+
+     mean_expression = fit['vals']['tjs'].values / fit['vals']['nc']
+     sizes = fit['sizes'].values
+
+     coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
+     intercept, slope = coeffs[0], coeffs[1]
+
+     log_mean_expr_range = np.linspace(
+         np.log(mean_expression[mean_expression > 0].min()),
+         np.log(mean_expression.max()),
+         100
+     )
+     log_fitted_sizes = intercept + slope * log_mean_expr_range
+     fitted_sizes = np.exp(log_fitted_sizes)
+
+     plt.figure(figsize=(8, 6))
+     plt.scatter(mean_expression, sizes, label='Observed Dispersion', alpha=0.5, s=8)
+     plt.plot(np.exp(log_mean_expr_range), fitted_sizes, color='red', label='Regression Fit', linewidth=2)
+
+     plt.xscale('log')
+     plt.yscale('log')
+     plt.xlabel('Mean Expression')
+     plt.ylabel('Dispersion Parameter (Sizes)')
+     plt.title('Dispersion vs. Mean Expression')
+     plt.legend()
+     plt.grid(True, which="both", linestyle='--', alpha=0.6)
+
+     if plot_filename:
+         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+         print(f"Saving plot to: {plot_filename}")
+
+     if not suppress_plot:
+         plt.show()
+
+     plt.close()
+
+     end_time = time.time()
+     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
@@ -0,0 +1,199 @@
+ import pickle
+ import time
+ import sys
+ import numpy as np
+ import h5py
+ import anndata
+ import pandas as pd
+ import os
+ from scipy import sparse
+
+ try:
+     from numba import jit, prange
+ except ImportError:
+     print("CRITICAL ERROR: 'numba' not found. Please install it (pip install numba).")
+     sys.exit(1)
+
+ # [FIX] Strict relative import
+ from .ControlDeviceCPU import ControlDevice
+
+ # ==========================================
+ # NUMBA KERNELS (CPU)
+ # ==========================================
+
+ @jit(nopython=True, parallel=True, fastmath=True)
+ def pearson_residual_kernel_cpu(counts, tj, ti, theta, total, out_matrix):
+     """
+     Calculates Pearson residuals using negative binomial logic.
+     Parallelized across CPU cores.
+     """
+     rows = counts.shape[0]
+     cols = counts.shape[1]
+
+     for r in prange(rows):
+         ti_val = ti[r]
+         for c in range(cols):
+             count_val = counts[r, c]
+             mu = (tj[c] * ti_val) / total
+
+             # theta is a vector of size cols (genes)
+             theta_val = theta[c]
+
+             denom_sq = mu + ((mu * mu) / theta_val)
+             denom = np.sqrt(denom_sq)
+
+             if denom < 1e-12:
+                 out_matrix[r, c] = 0.0
+             else:
+                 out_matrix[r, c] = (count_val - mu) / denom
+
+ @jit(nopython=True, parallel=True, fastmath=True)
+ def pearson_approx_kernel_cpu(counts, tj, ti, total, out_matrix):
+     """
+     Calculates approximate Pearson residuals (Poisson limit).
+     """
+     rows = counts.shape[0]
+     cols = counts.shape[1]
+
+     for r in prange(rows):
+         ti_val = ti[r]
+         for c in range(cols):
+             count_val = counts[r, c]
+             mu = (tj[c] * ti_val) / total
+
+             denom = np.sqrt(mu)
+
+             if denom < 1e-12:
+                 out_matrix[r, c] = 0.0
+             else:
+                 out_matrix[r, c] = (count_val - mu) / denom
+
+ # ==========================================
+ # NORMALIZATION FUNCTION
+ # ==========================================
+
+ def NBumiPearsonResidualsCombinedCPU(
+     raw_filename: str,
+     mask_filename: str,
+     fit_filename: str,
+     stats_filename: str,
+     output_filename_full: str,
+     output_filename_approx: str,
+     mode: str = "auto",
+     manual_target: int = 3000
+ ):
+     """
+     CPU-optimized: calculates full and approximate residuals in a single pass.
+     Uses Numba for acceleration on L3-sized dense chunks.
+     """
+     start_time = time.perf_counter()
+     print(f"FUNCTION: NBumiPearsonResidualsCombinedCPU() | FILE: {raw_filename}")
+
+     # 1. Load the gene mask
+     with open(mask_filename, 'rb') as f:
+         mask = pickle.load(f)
+     ng_filtered = int(np.sum(mask))
+
+     # 2. Initialize the device
+     with h5py.File(raw_filename, 'r') as f:
+         indptr_cpu = f['X']['indptr'][:]
+         total_rows = len(indptr_cpu) - 1
+     device = ControlDevice(indptr=indptr_cpu, total_rows=total_rows, n_genes=ng_filtered, mode=mode, manual_target=manual_target)
+     nc = device.total_rows
+
+     print("Phase [1/2]: Initializing parameters...")
+     # Load parameters
+     with open(fit_filename, 'rb') as f:
+         fit = pickle.load(f)
+     with open(stats_filename, 'rb') as f:
+         stats = pickle.load(f)
+
+     # Common params (NumPy arrays)
+     total = fit['vals']['total']
+     tjs = fit['vals']['tjs'].values.astype(np.float64)
+     tis = fit['vals']['tis'].values.astype(np.float64)
+
+     # Specific params
+     sizes = fit['sizes'].values.astype(np.float64)  # For the full residuals
+
+     # Set up output files
+     adata_in = anndata.read_h5ad(raw_filename, backed='r')
+     filtered_var = adata_in.var[mask]
+
+     # Create skeletons
+     adata_out_full = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
+     adata_out_full.write_h5ad(output_filename_full, compression=None)
+
+     adata_out_approx = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
+     adata_out_approx.write_h5ad(output_filename_approx, compression=None)
+
+     # Calculate appropriate H5 storage chunks (~1 GB of float64 rows each)
+     storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
+     if storage_chunk_rows < 1:
+         storage_chunk_rows = 1
+
+     # Open both files for writing simultaneously
+     with h5py.File(output_filename_full, 'a') as f_full, h5py.File(output_filename_approx, 'a') as f_approx:
+         if 'X' in f_full:
+             del f_full['X']
+         if 'X' in f_approx:
+             del f_approx['X']
+
+         # Float64 output
+         out_x_full = f_full.create_dataset(
+             'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
+         )
+         out_x_approx = f_approx.create_dataset(
+             'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
+         )
+
+         with h5py.File(raw_filename, 'r') as f_in:
+             h5_indptr = f_in['X']['indptr']
+             h5_data = f_in['X']['data']
+             h5_indices = f_in['X']['indices']
+
+             current_row = 0
+             while current_row < nc:
+                 # Dense mode is faster for Numba
+                 end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=3.0)
+                 if end_row is None or end_row <= current_row:
+                     break
+
+                 chunk_size = end_row - current_row
+                 print(f"Phase [2/2]: Processing rows {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+                 start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+
+                 # Load & filter
+                 data = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
+                 indices = np.array(h5_indices[start_idx:end_idx])
+                 indptr = np.array(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
+
+                 chunk_csr = sparse.csr_matrix((data, indices, indptr), shape=(chunk_size, len(mask)))
+                 chunk_csr = chunk_csr[:, mask]
+                 chunk_csr.data = np.ceil(chunk_csr.data)
+
+                 # Convert to dense for Numba (faster than sparse iteration for dense ops)
+                 counts_dense = chunk_csr.toarray()
+
+                 # --- CALC 1: APPROX ---
+                 approx_out = np.empty_like(counts_dense)
+                 pearson_approx_kernel_cpu(
+                     counts_dense,
+                     tjs,
+                     tis[current_row:end_row],
+                     total,
+                     approx_out
+                 )
+                 out_x_approx[current_row:end_row, :] = approx_out
+                 del approx_out
+
+                 # --- CALC 2: FULL (in place on counts_dense) ---
+                 # Reuse the counts_dense buffer for the output to save RAM;
+                 # each output element depends only on its own input element,
+                 # so the overwrite is safe.
+                 pearson_residual_kernel_cpu(
+                     counts_dense,
+                     tjs,
+                     tis[current_row:end_row],
+                     sizes,
+                     total,
+                     counts_dense  # Overwrite input
+                 )
+                 out_x_full[current_row:end_row, :] = counts_dense
+
+                 current_row = end_row
+
+     print(f"\nPhase [2/2]: COMPLETE{' '*50}")
+
+     if hasattr(adata_in, "file") and adata_in.file is not None:
+         adata_in.file.close()
+     print(f"Total time: {time.perf_counter() - start_time:.2f} seconds.\n")
@@ -1,8 +1,3 @@
- try:
-     from .coreGPU import get_optimal_chunk_size
- except ImportError:
-     from coreGPU import get_optimal_chunk_size
-
  import pickle
  import time
  import cupy
@@ -13,6 +8,8 @@ import pandas as pd
  from cupy.sparse import csr_matrix as cp_csr_matrix
  import os
 
+ from .ControlDeviceGPU import ControlDevice
+
  def NBumiPearsonResidualsGPU(
      cleaned_filename: str,
      fit_filename: str,
@@ -211,3 +208,4 @@ def NBumiPearsonResidualsApproxGPU(
 
      end_time = time.perf_counter()
      print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+