M3Drop 0.4.41-py3-none-any.whl → 0.4.44-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
m3Drop/diagnosticsGPU.py DELETED
@@ -1,481 +0,0 @@
- import numpy as np
- import pandas as pd
- import cupy as cp
- import cupyx.scipy.sparse as csp
- import matplotlib.pyplot as plt
- import h5py
- import os
- import time
- import psutil
- import gc
- from scipy import sparse
- from scipy import stats
- import anndata
-
- # [GOVERNOR INTEGRATION]
- from .coreGPU import hidden_calc_valsGPU, NBumiFitModelGPU, NBumiFitDispVsMeanGPU, get_optimal_chunk_size
- from cupy.sparse import csr_matrix as cp_csr_matrix
- import scipy.sparse as sp
- from scipy.sparse import csr_matrix as sp_csr_matrix
-
- import statsmodels.api as sm
- from scipy.stats import norm
- from statsmodels.stats.multitest import multipletests
-
- def NBumiFitBasicModelGPU(
-     cleaned_filename: str,
-     stats: dict,
-     is_logged=False,
-     chunk_size: int = None
- ) -> dict:
-     """
-     Fits a simpler, unadjusted NB model out-of-core using a GPU-accelerated
-     algorithm. Designed to work with a standard (cell, gene) sparse matrix.
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiFitBasicModel() | FILE: {cleaned_filename}")
-
-     # [GOVERNOR INTEGRATION] Calculate optimal chunk size if not provided
-     if chunk_size is None:
-         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=3.0, is_dense=True)
-
-     # --- Phase 1: Initialization ---
-     print("Phase [1/2]: Initializing parameters and arrays on GPU...")
-     tjs = stats['tjs'].values
-     nc, ng = stats['nc'], stats['ng']
-
-     tjs_gpu = cp.asarray(tjs, dtype=cp.float64)
-     sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
-     print("Phase [1/2]: COMPLETE")
-
-     # --- Phase 2: Calculate Variance from Data Chunks ---
-     print("Phase [2/2]: Calculating variance from data chunks...")
-     with h5py.File(cleaned_filename, 'r') as f_in:
-         x_group = f_in['X']
-         h5_indptr = x_group['indptr']
-         h5_data = x_group['data']
-         h5_indices = x_group['indices']
-
-         for i in range(0, nc, chunk_size):
-             end_row = min(i + chunk_size, nc)
-             print(f"Phase [2/2]: Processing: {end_row} of {nc} cells.", end='\r')
-
-             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-             if start_idx == end_idx:
-                 continue
-
-             # Original processing for smaller chunks
-             data_slice = h5_data[start_idx:end_idx]
-             indices_slice = h5_indices[start_idx:end_idx]
-
-             data_gpu = cp.asarray(data_slice, dtype=cp.float64)
-             indices_gpu = cp.asarray(indices_slice)
-
-             # Accumulate the sum of squares for each gene
-             cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
-
-             # Clean up
-             del data_gpu, indices_gpu
-             cp.get_default_memory_pool().free_all_blocks()
-
-     print(f"Phase [2/2]: COMPLETE ")
-
-     # --- Final calculations on GPU ---
-     if is_logged:
-         raise NotImplementedError("Logged data variance calculation is not implemented for out-of-core.")
-     else:
-         # Variance of raw data: Var(X) = E[X^2] - E[X]^2
-         mean_x_sq_gpu = sum_x_sq_gpu / nc
-         mean_mu_gpu = tjs_gpu / nc
-         my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
-
-         # Calculate dispersion ('size')
-         size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
-
-         max_size_val = cp.nanmax(size_gpu) * 10
-         if cp.isnan(max_size_val):
-             max_size_val = 1000
-         size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
-         size_gpu[size_gpu < 1e-10] = 1e-10
-
-     # Move results to CPU
-     my_rowvar_cpu = my_rowvar_gpu.get()
-     sizes_cpu = size_gpu.get()
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-     return {
-         'var_obs': pd.Series(my_rowvar_cpu, index=stats['tjs'].index),
-         'sizes': pd.Series(sizes_cpu, index=stats['tjs'].index),
-         'vals': stats
-     }
-
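The Phase 2 pass above only accumulates per-gene sums of squares; the final block then applies a method-of-moments fit: Var(X) = E[X²] − E[X]² gives the observed variance, and the NB relation Var = μ + μ²/size is inverted to size = μ²/(Var − μ). A minimal NumPy sketch of that arithmetic (toy dense data; the package streams sparse chunks, and the function above additionally clamps NaN and negative sizes):

```python
import numpy as np

# Toy counts: 4 cells x 3 genes (dense for clarity)
counts = np.array([[0., 2., 5.],
                   [1., 0., 8.],
                   [0., 3., 6.],
                   [2., 1., 7.]])
nc = counts.shape[0]

mean_x_sq = (counts ** 2).sum(axis=0) / nc   # E[X^2] per gene (sum_x_sq / nc)
mean_mu = counts.sum(axis=0) / nc            # E[X] per gene (tjs / nc)
var_obs = mean_x_sq - mean_mu ** 2           # Var(X) = E[X^2] - E[X]^2

# NB method of moments: Var = mu + mu^2/size  =>  size = mu^2 / (Var - mu)
size = mean_mu ** 2 / (var_obs - mean_mu)    # negative/NaN entries get clamped above
```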
- def NBumiCheckFitFSGPU(
-     cleaned_filename: str,
-     fit: dict,
-     chunk_size: int = None,
-     suppress_plot=False,
-     plot_filename=None
- ) -> dict:
-     """
-     FIXED VERSION - No cupy.errstate, proper GPU computation.
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiCheckFitFS() | FILE: {cleaned_filename}")
-
-     # [GOVERNOR INTEGRATION] Adaptive chunk sizing
-     # [CRITICAL FIX] Increased multiplier to 20.0 to prevent VRAM overflow during dense expansion
-     if chunk_size is None:
-         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=20.0, is_dense=True)
-
-     # --- Phase 1: Initialization ---
-     print("Phase [1/2]: Initializing parameters and arrays on GPU...")
-     vals = fit['vals']
-     size_coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
-
-     # Must use float64 for precision
-     tjs_gpu = cp.asarray(vals['tjs'].values, dtype=cp.float64)
-     tis_gpu = cp.asarray(vals['tis'].values, dtype=cp.float64)
-     total = vals['total']
-     nc, ng = vals['nc'], vals['ng']
-
-     # Calculate smoothed size
-     mean_expression_gpu = tjs_gpu / nc
-     log_mean_expression_gpu = cp.log(mean_expression_gpu)
-     smoothed_size_gpu = cp.exp(size_coeffs[0] + size_coeffs[1] * log_mean_expression_gpu)
-
-     # Initialize result arrays
-     row_ps_gpu = cp.zeros(ng, dtype=cp.float64)
-     col_ps_gpu = cp.zeros(nc, dtype=cp.float64)
-     print("Phase [1/2]: COMPLETE")
-
-     # --- Phase 2: Calculate Expected Dropouts ---
-     print("Phase [2/2]: Calculating expected dropouts from data chunks...")
-
-     # [GOVERNOR INTEGRATION] Removed naive calculation, utilizing Governor's chunk_size
-     optimal_chunk = chunk_size
-     print(f" Using governor chunk size: {optimal_chunk}")
-
-     for i in range(0, nc, optimal_chunk):
-         end_col = min(i + optimal_chunk, nc)
-         print(f"Phase [2/2]: Processing: {end_col} of {nc} cells.", end='\r')
-
-         tis_chunk_gpu = tis_gpu[i:end_col]
-
-         # Standard calculation without errstate
-         mu_chunk_gpu = tjs_gpu[:, cp.newaxis] * tis_chunk_gpu[cp.newaxis, :] / total
-
-         # Calculate p_is directly - CuPy handles overflow internally
-         base = 1 + mu_chunk_gpu / smoothed_size_gpu[:, cp.newaxis]
-         p_is_chunk_gpu = cp.power(base, -smoothed_size_gpu[:, cp.newaxis])
-
-         # Handle any inf/nan values that might have occurred
-         p_is_chunk_gpu = cp.nan_to_num(p_is_chunk_gpu, nan=0.0, posinf=1.0, neginf=0.0)
-
-         # Sum results
-         row_ps_gpu += p_is_chunk_gpu.sum(axis=1)
-         col_ps_gpu[i:end_col] = p_is_chunk_gpu.sum(axis=0)
-
-         # Clean up
-         del mu_chunk_gpu, p_is_chunk_gpu, base, tis_chunk_gpu
-
-         # Periodic memory cleanup
-         mempool = cp.get_default_memory_pool()
-         if (i // optimal_chunk) % 10 == 0:
-             mempool.free_all_blocks()
-
-     print(f"Phase [2/2]: COMPLETE{' ' * 50}")
-
-     # Move results to CPU
-     row_ps_cpu = row_ps_gpu.get()
-     col_ps_cpu = col_ps_gpu.get()
-     djs_cpu = vals['djs'].values
-     dis_cpu = vals['dis'].values
-
-     # Plotting
-     if not suppress_plot:
-         plt.figure(figsize=(12, 5))
-         plt.subplot(1, 2, 1)
-         plt.scatter(djs_cpu, row_ps_cpu, alpha=0.5, s=10)
-         plt.title("Gene-specific Dropouts (Smoothed)")
-         plt.xlabel("Observed")
-         plt.ylabel("Fit")
-         lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
-         plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
-         plt.grid(True); plt.legend()
-
-         plt.subplot(1, 2, 2)
-         plt.scatter(dis_cpu, col_ps_cpu, alpha=0.5, s=10)
-         plt.title("Cell-specific Dropouts (Smoothed)")
-         plt.xlabel("Observed")
-         plt.ylabel("Expected")
-         lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
-         plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
-         plt.grid(True); plt.legend()
-
-         plt.tight_layout()
-         if plot_filename:
-             plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-             print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
-         plt.show()
-         plt.close()
-
-     # Calculate errors
-     gene_error = np.sum((djs_cpu - row_ps_cpu)**2)
-     cell_error = np.sum((dis_cpu - col_ps_cpu)**2)
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-     return {
-         'gene_error': gene_error,
-         'cell_error': cell_error,
-         'rowPs': pd.Series(row_ps_cpu, index=fit['vals']['tjs'].index),
-         'colPs': pd.Series(col_ps_cpu, index=fit['vals']['tis'].index)
-     }
-
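The quantity accumulated in the chunk loop above is the dropout probability of a negative binomial model, P(X = 0) = (1 + μ/size)^(−size), with μ_ji = tjs_j · tis_i / total and a dispersion smoothed against mean expression. A minimal NumPy sketch of one such chunk (toy arrays; the helper name expected_dropouts is illustrative, not part of the package):

```python
import numpy as np

def expected_dropouts(tjs, tis, total, size):
    """P(count == 0) under NB(mu, size) with mu = outer(tjs, tis) / total."""
    mu = np.outer(tjs, tis) / total                        # (genes, cells) expected counts
    p0 = (1.0 + mu / size[:, None]) ** (-size[:, None])    # NB zero probability
    return np.nan_to_num(p0, nan=0.0, posinf=1.0, neginf=0.0)

# Toy example: 3 genes x 4 cells
tjs = np.array([10., 40., 100.])                 # per-gene totals
tis = np.array([30., 40., 35., 45.])             # per-cell totals
p0 = expected_dropouts(tjs, tis, tjs.sum(), size=np.array([0.5, 1.0, 2.0]))
row_ps = p0.sum(axis=1)                          # expected dropouts per gene (rowPs)
col_ps = p0.sum(axis=0)                          # expected dropouts per cell (colPs)
```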
- def NBumiCompareModelsGPU(
-     raw_filename: str,
-     cleaned_filename: str,
-     stats: dict,
-     fit_adjust: dict,
-     chunk_size: int = None,
-     suppress_plot=False,
-     plot_filename=None
- ) -> dict:
-     """
-     OPTIMIZED VERSION - Faster normalization and sparse matrix writing.
-     """
-     pipeline_start_time = time.time()
-     print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")
-
-     # [GOVERNOR INTEGRATION] Calculate chunk size for normalization phase (heavy IO)
-     if chunk_size is None:
-         # Multiplier 10.0 for safety during normalization of massive dense expansion
-         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=10.0, is_dense=True)
-
-     # --- Phase 1: OPTIMIZED Normalization ---
-     print("Phase [1/4]: Creating temporary 'basic' normalized data file...")
-     basic_norm_filename = cleaned_filename.replace('.h5ad', '_basic_norm.h5ad')
-
-     # Read metadata. In 'backed' mode, this keeps a file handle open.
-     adata_meta = anndata.read_h5ad(cleaned_filename, backed='r')
-     nc, ng = adata_meta.shape
-     obs_df = adata_meta.obs.copy()
-     var_df = adata_meta.var.copy()
-
-     cell_sums = stats['tis'].values
-     median_sum = np.median(cell_sums[cell_sums > 0])
-
-     # Avoid division by zero for cells with zero counts
-     size_factors = np.ones_like(cell_sums, dtype=np.float32)
-     non_zero_mask = cell_sums > 0
-     size_factors[non_zero_mask] = cell_sums[non_zero_mask] / median_sum
-
-     adata_out = anndata.AnnData(obs=obs_df, var=var_df)
-     # [OPTION 2 CHANGE] Removed compression="gzip" to speed up I/O
-     adata_out.write_h5ad(basic_norm_filename)
-
-     with h5py.File(basic_norm_filename, 'a') as f_out:
-         if 'X' in f_out:
-             del f_out['X']
-         x_group_out = f_out.create_group('X')
-         x_group_out.attrs['encoding-type'] = 'csr_matrix'
-         x_group_out.attrs['encoding-version'] = '0.1.0'
-         x_group_out.attrs['shape'] = np.array([nc, ng], dtype='int64')
-
-         out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
-         out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
-         out_indptr = x_group_out.create_dataset('indptr', shape=(nc + 1,), dtype='int64')
-         out_indptr[0] = 0
-         current_nnz = 0
-
-         with h5py.File(cleaned_filename, 'r') as f_in:
-             h5_indptr = f_in['X']['indptr']
-             h5_data = f_in['X']['data']
-             h5_indices = f_in['X']['indices']
-
-             for i in range(0, nc, chunk_size):
-                 end_row = min(i + chunk_size, nc)
-                 print(f"Phase [1/4]: Normalizing: {end_row} of {nc} cells.", end='\r')
-
-                 start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-                 if start_idx == end_idx:
-                     out_indptr[i + 1 : end_row + 1] = current_nnz
-                     continue
-
-                 # Read data for the chunk
-                 data_slice = h5_data[start_idx:end_idx]
-                 indices_slice = h5_indices[start_idx:end_idx]
-                 indptr_slice = h5_indptr[i:end_row + 1] - start_idx
-
-                 # Move to GPU for fast normalization
-                 data_gpu = cp.asarray(data_slice.copy(), dtype=cp.float32)
-
-                 indptr_gpu = cp.asarray(indptr_slice.copy())
-                 nnz_in_chunk = indptr_gpu[-1].item()
-                 cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
-                 if len(indptr_gpu) > 1:
-                     cell_boundary_markers[indptr_gpu[:-1]] = 1
-                 row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
-
-                 size_factors_for_chunk = cp.asarray(size_factors[i:end_row])
-
-                 data_gpu /= size_factors_for_chunk[row_indices]
-
-                 # [RESTORED LEGACY LOGIC] Rounding matches original file.
-                 data_cpu = np.round(data_gpu.get())
-
-                 num_cells_in_chunk = end_row - i
-                 chunk_sp = sp_csr_matrix((data_cpu, indices_slice, indptr_slice),
-                                          shape=(num_cells_in_chunk, ng))
-
-                 nnz_chunk = chunk_sp.nnz
-                 out_data.resize(current_nnz + nnz_chunk, axis=0)
-                 out_data[current_nnz:] = chunk_sp.data
-
-                 out_indices.resize(current_nnz + nnz_chunk, axis=0)
-                 out_indices[current_nnz:] = chunk_sp.indices
-
-                 new_indptr_list = chunk_sp.indptr[1:].astype(np.int64) + current_nnz
-                 out_indptr[i + 1 : end_row + 1] = new_indptr_list
-
-                 current_nnz += nnz_chunk
-
-                 del data_gpu, row_indices, size_factors_for_chunk, indptr_gpu
-                 cp.get_default_memory_pool().free_all_blocks()
-
-     print(f"Phase [1/4]: COMPLETE{' '*50}")
-
-     print("Phase [2/4]: Fitting Basic Model on normalized data...")
-
-     # [GOVERNOR INTEGRATION] Calculate chunk size for basic fit on the heavy normalized file
-     chunk_size_basic = get_optimal_chunk_size(basic_norm_filename, multiplier=10.0, is_dense=True)
-
-     stats_basic = hidden_calc_valsGPU(basic_norm_filename) # hidden_calc uses its own governor internally
-     fit_basic = NBumiFitBasicModelGPU(basic_norm_filename, stats_basic, chunk_size=chunk_size_basic)
-     print("Phase [2/4]: COMPLETE")
-
-     print("Phase [3/4]: Evaluating fits of both models on ORIGINAL data...")
-     # [GOVERNOR INTEGRATION] Chunk size for check fit
-     # [CRITICAL FIX] Multiplier 20.0 prevents VRAM overflow
-     chunk_size_check = get_optimal_chunk_size(cleaned_filename, multiplier=20.0, is_dense=True)
-
-     check_adjust = NBumiCheckFitFSGPU(cleaned_filename, fit_adjust, suppress_plot=True, chunk_size=chunk_size_check)
-
-     fit_basic_for_eval = {
-         'sizes': fit_basic['sizes'],
-         'vals': stats,
-         'var_obs': fit_basic['var_obs']
-     }
-     check_basic = NBumiCheckFitFSGPU(cleaned_filename, fit_basic_for_eval, suppress_plot=True, chunk_size=chunk_size_check)
-     print("Phase [3/4]: COMPLETE")
-
-     print("Phase [4/4]: Generating final comparison...")
-     nc_data = stats['nc']
-     mean_expr = stats['tjs'] / nc_data
-     observed_dropout = stats['djs'] / nc_data
-
-     adj_dropout_fit = check_adjust['rowPs'] / nc_data
-     bas_dropout_fit = check_basic['rowPs'] / nc_data
-
-     err_adj = np.sum(np.abs(adj_dropout_fit - observed_dropout))
-     err_bas = np.sum(np.abs(bas_dropout_fit - observed_dropout))
-
-     comparison_df = pd.DataFrame({
-         'mean_expr': mean_expr,
-         'observed': observed_dropout,
-         'adj_fit': adj_dropout_fit,
-         'bas_fit': bas_dropout_fit
-     })
-
-     plt.figure(figsize=(10, 6))
-     sorted_idx = np.argsort(mean_expr.values)
-
-     plt.scatter(mean_expr.iloc[sorted_idx], observed_dropout.iloc[sorted_idx],
-                 c='black', s=3, alpha=0.5, label='Observed')
-     plt.scatter(mean_expr.iloc[sorted_idx], bas_dropout_fit.iloc[sorted_idx],
-                 c='purple', s=3, alpha=0.6, label=f'Basic Fit (Error: {err_bas:.2f})')
-     plt.scatter(mean_expr.iloc[sorted_idx], adj_dropout_fit.iloc[sorted_idx],
-                 c='goldenrod', s=3, alpha=0.7, label=f'Depth-Adjusted Fit (Error: {err_adj:.2f})')
-
-     plt.xscale('log')
-     plt.xlabel("Mean Expression")
-     plt.ylabel("Dropout Rate")
-     plt.title("M3Drop Model Comparison")
-     plt.legend()
-     plt.grid(True, linestyle='--', alpha=0.3)
-
-     if plot_filename:
-         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-         print(f"STATUS: Model comparison plot saved to '{plot_filename}'")
-
-     if not suppress_plot:
-         plt.show()
-
-     plt.close()
-     print("Phase [4/4]: COMPLETE")
-
-     pipeline_end_time = time.time()
-
-     # --- ADD THIS LINE TO FIX THE ERROR ---
-     adata_meta.file.close() # Explicitly close the file handle
-
-     os.remove(basic_norm_filename)
-     print(f"STATUS: Temporary file '{basic_norm_filename}' removed.")
-     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
-
-     return {
-         "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
-         "comparison_df": comparison_df
-     }
-
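Phase 1 above rescales every cell by a median-based size factor, size_factor_i = t_i / median(t over non-empty cells), then rounds the rescaled values back to count-like integers. A compact NumPy sketch of that per-cell scaling (illustrative values; the real code applies it on the GPU across CSR chunks):

```python
import numpy as np

cell_sums = np.array([100., 250., 0., 175.])      # tis: total counts per cell
median_sum = np.median(cell_sums[cell_sums > 0])  # median over non-empty cells

size_factors = np.ones_like(cell_sums)            # empty cells keep factor 1
nz = cell_sums > 0
size_factors[nz] = cell_sums[nz] / median_sum

one_cell = np.array([4., 0., 9.])                 # one cell's gene counts
normalized = np.round(one_cell / size_factors[0]) # matches the legacy rounding step
```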
- def NBumiPlotDispVsMeanGPU(
-     fit: dict,
-     suppress_plot: bool = False,
-     plot_filename: str = None
- ):
-     """
-     Generates a diagnostic plot of the dispersion vs. mean expression.
-     """
-     print("FUNCTION: NBumiPlotDispVsMean()")
-
-     # --- 1. Extract data and regression coefficients ---
-     mean_expression = fit['vals']['tjs'].values / fit['vals']['nc']
-     sizes = fit['sizes'].values
-     coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
-     intercept, slope = coeffs[0], coeffs[1]
-
-     # --- 2. Calculate the fitted line for plotting ---
-     # Create a smooth, continuous line using the regression coefficients
-     log_mean_expr_range = np.linspace(
-         np.log(mean_expression[mean_expression > 0].min()),
-         np.log(mean_expression.max()),
-         100
-     )
-     log_fitted_sizes = intercept + slope * log_mean_expr_range
-     fitted_sizes = np.exp(log_fitted_sizes)
-
-     # --- 3. Create the plot ---
-     plt.figure(figsize=(8, 6))
-     plt.scatter(mean_expression, sizes, label='Observed Dispersion', alpha=0.5, s=8)
-     plt.plot(np.exp(log_mean_expr_range), fitted_sizes, color='red', label='Regression Fit', linewidth=2)
-
-     plt.xscale('log')
-     plt.yscale('log')
-     plt.xlabel('Mean Expression')
-     plt.ylabel('Dispersion Parameter (Sizes)')
-     plt.title('Dispersion vs. Mean Expression')
-     plt.legend()
-     plt.grid(True, which="both", linestyle='--', alpha=0.6)
-
-     if plot_filename:
-         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-         print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
-
-     if not suppress_plot:
-         plt.show()
-
-     plt.close()
-     print("FUNCTION: NBumiPlotDispVsMean() COMPLETE\n")
@@ -1,146 +0,0 @@
- import pickle
- import time
- import numpy as np
- import h5py
- import anndata
- from scipy.sparse import csr_matrix as sp_csr_matrix
-
-
- def NBumiPearsonResidualsCPU(
-     cleaned_filename: str,
-     fit_filename: str,
-     output_filename: str,
-     chunk_size: int = 5000
- ):
-     """
-     Calculates Pearson residuals in an out-of-core, CPU-only manner.
-     The output is a dense matrix of residuals.
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiPearsonResidualsCPU() | FILE: {cleaned_filename}")
-
-     print("Phase [1/2]: Initializing parameters and preparing output file...")
-     with open(fit_filename, 'rb') as f:
-         fit = pickle.load(f)
-
-     vals = fit['vals']
-     tjs = vals['tjs'].values.astype(np.float64)
-     tis = vals['tis'].values.astype(np.float64)
-     sizes = fit['sizes'].values.astype(np.float64)
-     total = vals['total']
-     nc, ng = vals['nc'], vals['ng']
-
-     adata_in = anndata.read_h5ad(cleaned_filename, backed='r')
-     adata_out = anndata.AnnData(obs=adata_in.obs, var=adata_in.var)
-     adata_out.write_h5ad(output_filename, compression="gzip")
-
-     with h5py.File(output_filename, 'a') as f_out:
-         out_x = f_out.create_dataset('X', shape=(nc, ng), chunks=(chunk_size, ng), dtype='float32')
-         print("Phase [1/2]: COMPLETE")
-
-         print("Phase [2/2]: Calculating Pearson residuals in chunks...")
-         with h5py.File(cleaned_filename, 'r') as f_in:
-             h5_indptr = f_in['X']['indptr']
-             h5_data = f_in['X']['data']
-             h5_indices = f_in['X']['indices']
-
-             for i in range(0, nc, chunk_size):
-                 end_row = min(i + chunk_size, nc)
-                 print(f"Phase [2/2]: Processing: {end_row} of {nc} cells.", end='\r')
-
-                 start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-                 data_slice = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
-                 indices_slice = np.array(h5_indices[start_idx:end_idx], dtype=np.int64)
-                 indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
-
-                 counts_chunk_sparse = sp_csr_matrix(
-                     (data_slice, indices_slice, indptr_slice),
-                     shape=(end_row - i, ng)
-                 )
-                 counts_chunk_dense = counts_chunk_sparse.toarray()
-
-                 tis_chunk = tis[i:end_row]
-                 mus_chunk = tjs[np.newaxis, :] * tis_chunk[:, np.newaxis] / total
-
-                 denominator = np.sqrt(mus_chunk + mus_chunk**2 / sizes[np.newaxis, :])
-                 denominator = np.where(denominator == 0, 1, denominator)
-                 pearson_chunk = (counts_chunk_dense - mus_chunk) / denominator
-
-                 out_x[i:end_row, :] = pearson_chunk.astype(np.float32)
-
-     print(f"Phase [2/2]: COMPLETE{' '*50}")
-
-     if hasattr(adata_in, "file") and adata_in.file is not None:
-         adata_in.file.close()
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-
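The chunk arithmetic above is the standard NB Pearson residual, r = (x − μ) / √(μ + μ²/size) with μ_ij = t_i · t_j / total. A self-contained NumPy sketch of that per-chunk computation (toy arrays, not package code):

```python
import numpy as np

counts = np.array([[0., 2., 5.],
                   [1., 0., 8.]])             # 2 cells x 3 genes
tjs = counts.sum(axis=0)                      # per-gene totals
tis = counts.sum(axis=1)                      # per-cell totals
total = counts.sum()
sizes = np.array([0.5, 1.0, 2.0])             # per-gene NB dispersion

mu = tis[:, None] * tjs[None, :] / total      # expected counts under the model
denom = np.sqrt(mu + mu**2 / sizes[None, :])  # NB standard deviation
denom = np.where(denom == 0, 1, denom)        # guard against division by zero
residuals = (counts - mu) / denom
```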
- def NBumiPearsonResidualsApproxCPU(
-     cleaned_filename: str,
-     stats_filename: str,
-     output_filename: str,
-     chunk_size: int = 5000
- ):
-     """
-     Calculates approximate Pearson residuals in an out-of-core, CPU-only manner.
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiPearsonResidualsApproxCPU() | FILE: {cleaned_filename}")
-
-     print("Phase [1/2]: Initializing parameters and preparing output file...")
-     with open(stats_filename, 'rb') as f:
-         stats = pickle.load(f)
-
-     vals = stats
-     tjs = vals['tjs'].values.astype(np.float64)
-     tis = vals['tis'].values.astype(np.float64)
-     total = vals['total']
-     nc, ng = vals['nc'], vals['ng']
-
-     adata_in = anndata.read_h5ad(cleaned_filename, backed='r')
-     adata_out = anndata.AnnData(obs=adata_in.obs, var=adata_in.var)
-     adata_out.write_h5ad(output_filename, compression="gzip")
-
-     with h5py.File(output_filename, 'a') as f_out:
-         out_x = f_out.create_dataset('X', shape=(nc, ng), chunks=(chunk_size, ng), dtype='float32')
-         print("Phase [1/2]: COMPLETE")
-
-         print("Phase [2/2]: Calculating approximate Pearson residuals in chunks...")
-         with h5py.File(cleaned_filename, 'r') as f_in:
-             h5_indptr = f_in['X']['indptr']
-             h5_data = f_in['X']['data']
-             h5_indices = f_in['X']['indices']
-
-             for i in range(0, nc, chunk_size):
-                 end_row = min(i + chunk_size, nc)
-                 print(f"Phase [2/2]: Processing: {end_row} of {nc} cells.", end='\r')
-
-                 start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-                 data_slice = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
-                 indices_slice = np.array(h5_indices[start_idx:end_idx], dtype=np.int64)
-                 indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
-
-                 counts_chunk_sparse = sp_csr_matrix(
-                     (data_slice, indices_slice, indptr_slice),
-                     shape=(end_row - i, ng)
-                 )
-                 counts_chunk_dense = counts_chunk_sparse.toarray()
-
-                 tis_chunk = tis[i:end_row]
-                 mus_chunk = tjs[np.newaxis, :] * tis_chunk[:, np.newaxis] / total
-
-                 denominator = np.sqrt(mus_chunk)
-                 denominator = np.where(denominator == 0, 1, denominator)
-                 pearson_chunk = (counts_chunk_dense - mus_chunk) / denominator
-
-                 out_x[i:end_row, :] = pearson_chunk.astype(np.float32)
-
-     print(f"Phase [2/2]: COMPLETE{' '*50}")
-
-     if hasattr(adata_in, "file") and adata_in.file is not None:
-         adata_in.file.close()
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
@@ -1,12 +0,0 @@
- m3Drop/__init__.py,sha256=yaUXhUArnwgLf01Zlpqa5qm9K1aByGqQupIoCaLYiDw,2462
- m3Drop/coreCPU.py,sha256=3kPYlSVlYrJEhRUCIoVzmR8CYBaHpxVM5nx-3YQI4d4,17204
- m3Drop/coreGPU.py,sha256=k7A06VNgfJ59J8g1VpfKxhTIKrEbW7Bj8pTbQqHaQL8,24571
- m3Drop/diagnosticsCPU.py,sha256=BecOKTz2GDjzjs9ycXYsyrSHi2UVgsM58RBuNE62vmU,14273
- m3Drop/diagnosticsGPU.py,sha256=0tDHZHVS14qg46p1AZcdX8DOnGmbYJ7ha0FFfKtmENg,18891
- m3Drop/normalizationCPU.py,sha256=4ulCrDZZjxVFh2y0i4ayPkNCsZYaOP-Xq2Dnzu9WXtg,5697
- m3Drop/normalizationGPU.py,sha256=r5gvJFkabEfCfIsVdpJzWGqve_Iy57EYsEyiLfDo8Mo,8539
- m3drop-0.4.41.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
- m3drop-0.4.41.dist-info/METADATA,sha256=5jDbZa9PGiqBAv-TBPgGPqz3nCjMHiEDWdNw9qwPSyA,5161
- m3drop-0.4.41.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- m3drop-0.4.41.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
- m3drop-0.4.41.dist-info/RECORD,,