M3Drop 0.4.36.tar.gz → 0.4.37.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: M3Drop
- Version: 0.4.36
+ Version: 0.4.37
  Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
  Home-page: https://github.com/PragalvhaSharma/m3DropNew
  Author: Tallulah Andrews
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: M3Drop
- Version: 0.4.36
+ Version: 0.4.37
  Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
  Home-page: https://github.com/PragalvhaSharma/m3DropNew
  Author: Tallulah Andrews
@@ -0,0 +1,466 @@
+ import numpy as np
+ import pandas as pd
+ import cupy as cp
+ import cupyx.scipy.sparse as csp
+ import matplotlib.pyplot as plt
+ import h5py
+ import os
+ import time
+ import psutil
+ import gc
+ from scipy import sparse
+ from scipy import stats
+ import anndata
+
+ # [GOVERNOR INTEGRATION]
+ from .coreGPU import hidden_calc_valsGPU, NBumiFitModelGPU, NBumiFitDispVsMeanGPU, get_optimal_chunk_size
+ from cupy.sparse import csr_matrix as cp_csr_matrix
+ import scipy.sparse as sp
+ from scipy.sparse import csr_matrix as sp_csr_matrix
+ import statsmodels.api as sm
+
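The `get_optimal_chunk_size` governor used throughout this new module is defined in the package's own coreGPU module, which is not shown in this diff. As a rough, hypothetical sketch of what such a VRAM governor computes (free device memory divided by an estimated per-row footprint, scaled down by a safety multiplier), with the caveat that the real implementation may differ:

    # Hypothetical sketch only: the real get_optimal_chunk_size is defined in
    # the package's coreGPU module, not shown in this diff, and may differ.
    import cupy as cp
    import h5py

    def sketch_optimal_chunk_size(filename, multiplier=3.0, is_dense=True):
        """Pick a row-chunk size from free VRAM and the on-disk CSR matrix shape."""
        free_bytes, _ = cp.cuda.runtime.memGetInfo()   # (free, total) for current device
        with h5py.File(filename, 'r') as f:
            n_cells, n_genes = f['X'].attrs['shape']   # AnnData stores the matrix shape here
            nnz = f['X']['data'].shape[0]
        if is_dense:
            bytes_per_row = n_genes * 8                # float64 working buffers
        else:
            bytes_per_row = max(1, nnz // n_cells) * 8 # average nnz per cell
        # 'multiplier' stands in for how many same-sized buffers are alive at once
        return max(1, int(free_bytes // (bytes_per_row * multiplier)))

Read this way, the multiplier values at the call sites below (3.0, 5.0, 12.0) appear to track how many chunk-sized buffers each phase keeps resident at once, as the "[GOVERNOR]" comments suggest.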
+ def NBumiFitBasicModelGPU(
+     cleaned_filename: str,
+     stats: dict,
+     is_logged=False,
+     chunk_size: int = None
+ ) -> dict:
+     """
+     Fits a simpler, unadjusted NB model out-of-core using a GPU-accelerated
+     algorithm. Designed to work with a standard (cell, gene) sparse matrix.
+     """
+     start_time = time.perf_counter()
+     print(f"FUNCTION: NBumiFitBasicModel() | FILE: {cleaned_filename}")
+
+     # [GOVERNOR INTEGRATION] Calculate optimal chunk size if not provided
+     if chunk_size is None:
+         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=3.0, is_dense=True)
+
+     # --- Phase 1: Initialization ---
+     print("Phase [1/2]: Initializing parameters and arrays on GPU...")
+     tjs = stats['tjs'].values
+     nc, ng = stats['nc'], stats['ng']
+
+     tjs_gpu = cp.asarray(tjs, dtype=cp.float64)
+     sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
+     print("Phase [1/2]: COMPLETE")
+
+     # --- Phase 2: Calculate Variance from Data Chunks ---
+     print("Phase [2/2]: Calculating variance from data chunks...")
+     with h5py.File(cleaned_filename, 'r') as f_in:
+         x_group = f_in['X']
+         h5_indptr = x_group['indptr']
+         h5_data = x_group['data']
+         h5_indices = x_group['indices']
+
+         for i in range(0, nc, chunk_size):
+             end_row = min(i + chunk_size, nc)
+             print(f"Phase [2/2]: Processing: {end_row} of {nc} cells.", end='\r')
+
+             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
+             if start_idx == end_idx:
+                 continue
+
+             # Process in smaller sub-chunks if needed
+             max_elements = 5_000_000 # Process max 5M elements at a time
+
+             if end_idx - start_idx > max_elements:
+                 # Process in sub-chunks
+                 for sub_start in range(start_idx, end_idx, max_elements):
+                     sub_end = min(sub_start + max_elements, end_idx)
+
+                     data_slice = h5_data[sub_start:sub_end]
+                     indices_slice = h5_indices[sub_start:sub_end]
+
+                     data_gpu = cp.asarray(data_slice, dtype=cp.float64)
+                     indices_gpu = cp.asarray(indices_slice)
+
+                     # Accumulate the sum of squares for each gene
+                     cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
+
+                     # Free GPU memory
+                     del data_gpu, indices_gpu
+                     cp.get_default_memory_pool().free_all_blocks()
+             else:
+                 # Original processing for smaller chunks
+                 data_slice = h5_data[start_idx:end_idx]
+                 indices_slice = h5_indices[start_idx:end_idx]
+
+                 data_gpu = cp.asarray(data_slice, dtype=cp.float64)
+                 indices_gpu = cp.asarray(indices_slice)
+
+                 # Accumulate the sum of squares for each gene
+                 cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
+
+                 # Clean up
+                 del data_gpu, indices_gpu
+                 cp.get_default_memory_pool().free_all_blocks()
+
+     print(f"Phase [2/2]: COMPLETE ")
+
+     # --- Final calculations on GPU ---
+     if is_logged:
+         raise NotImplementedError("Logged data variance calculation is not implemented for out-of-core.")
+     else:
+         # Variance of raw data: Var(X) = E[X^2] - E[X]^2
+         mean_x_sq_gpu = sum_x_sq_gpu / nc
+         mean_mu_gpu = tjs_gpu / nc
+         my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
+
+         # Calculate dispersion ('size')
+         size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
+
+         max_size_val = cp.nanmax(size_gpu) * 10
+         if cp.isnan(max_size_val):
+             max_size_val = 1000
+         size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
+         size_gpu[size_gpu < 1e-10] = 1e-10
+
+     # Move results to CPU
+     my_rowvar_cpu = my_rowvar_gpu.get()
+     sizes_cpu = size_gpu.get()
+
+     end_time = time.perf_counter()
+     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+     return {
+         'var_obs': pd.Series(my_rowvar_cpu, index=stats['tjs'].index),
+         'sizes': pd.Series(sizes_cpu, index=stats['tjs'].index),
+         'vals': stats
+     }
+
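The final block of this function is a method-of-moments negative binomial fit: for an NB with mean mu and size r, Var(X) = mu + mu^2/r, so r = mu^2 / (Var(X) - mu), with the variance recovered streaming-style from the accumulated sum of squares. A self-contained NumPy check of that identity on toy data (illustrative only, not package API):

    import numpy as np

    rng = np.random.default_rng(0)
    # Toy (cell, gene) counts: 500 cells x 3 genes drawn from NB with size r = 2
    r, mu = 2.0, np.array([0.5, 2.0, 8.0])
    X = rng.negative_binomial(r, r / (r + mu), size=(500, 3)).astype(np.float64)

    nc = X.shape[0]
    sum_x_sq = (X**2).sum(axis=0)    # what the chunked loop accumulates per gene
    tjs = X.sum(axis=0)              # per-gene totals, as in stats['tjs']

    var = sum_x_sq / nc - (tjs / nc)**2          # Var(X) = E[X^2] - E[X]^2
    size = (tjs / nc)**2 / (var - (tjs / nc))    # r = mu^2 / (Var - mu)
    print(size)                                  # noisy estimates near the true r = 2

Genes whose sample variance falls at or below the mean make the denominator non-positive, which is exactly the case the clamping block above catches.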
+ def NBumiCheckFitFSGPU(
+     cleaned_filename: str,
+     fit: dict,
+     chunk_size: int = None,
+     suppress_plot=False,
+     plot_filename=None
+ ) -> dict:
+     """
+     Calculates the fit errors (gene_error, cell_error) for a given model.
+     """
+     start_time = time.perf_counter()
+     print(f"FUNCTION: NBumiCheckFitFS() | FILE: {cleaned_filename}")
+
+     # [GOVERNOR INTEGRATION] Adaptive chunk sizing
+     if chunk_size is None:
+         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=5.0, is_dense=True)
+
+     # --- Phase 1: Initialization ---
+     print("Phase [1/2]: Initializing parameters and arrays on GPU...")
+     vals = fit['vals']
+     size_coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
+
+     # Must use float64 for precision
+     tjs_gpu = cp.asarray(vals['tjs'].values, dtype=cp.float64)
+     tis_gpu = cp.asarray(vals['tis'].values, dtype=cp.float64)
+     total = vals['total']
+     nc, ng = vals['nc'], vals['ng']
+
+     # Calculate smoothed size
+     mean_expression_gpu = tjs_gpu / nc
+     log_mean_expression_gpu = cp.log(mean_expression_gpu)
+     smoothed_size_gpu = cp.exp(size_coeffs[0] + size_coeffs[1] * log_mean_expression_gpu)
+
+     # Initialize result arrays
+     row_ps_gpu = cp.zeros(ng, dtype=cp.float64)
+     col_ps_gpu = cp.zeros(nc, dtype=cp.float64)
+     print("Phase [1/2]: COMPLETE")
+
+     # --- Phase 2: Calculate Expected Dropouts ---
+     print(f"Phase [2/2]: Calculating expected dropouts (Chunk: {chunk_size})...")
+
+     for i in range(0, nc, chunk_size):
+         end_col = min(i + chunk_size, nc)
+         print(f"Phase [2/2]: Processing: {end_col} of {nc} cells.", end='\r')
+
+         tis_chunk_gpu = tis_gpu[i:end_col]
+
+         # Standard calculation without errstate
+         mu_chunk_gpu = tjs_gpu[:, cp.newaxis] * tis_chunk_gpu[cp.newaxis, :] / total
+
+         # Calculate p_is directly - CuPy handles overflow internally
+         base = 1 + mu_chunk_gpu / smoothed_size_gpu[:, cp.newaxis]
+         p_is_chunk_gpu = cp.power(base, -smoothed_size_gpu[:, cp.newaxis])
+
+         # Handle any inf/nan values that might have occurred
+         p_is_chunk_gpu = cp.nan_to_num(p_is_chunk_gpu, nan=0.0, posinf=1.0, neginf=0.0)
+
+         # Sum results
+         row_ps_gpu += p_is_chunk_gpu.sum(axis=1)
+         col_ps_gpu[i:end_col] = p_is_chunk_gpu.sum(axis=0)
+
+         # Clean up
+         del mu_chunk_gpu, p_is_chunk_gpu, base, tis_chunk_gpu
+         cp.get_default_memory_pool().free_all_blocks()
+
+     print(f"Phase [2/2]: COMPLETE{' ' * 50}")
+
+     # Move results to CPU
+     row_ps_cpu = row_ps_gpu.get()
+     col_ps_cpu = col_ps_gpu.get()
+     djs_cpu = vals['djs'].values
+     dis_cpu = vals['dis'].values
+
+     # Plotting
+     if not suppress_plot:
+         plt.figure(figsize=(12, 5))
+         plt.subplot(1, 2, 1)
+         plt.scatter(djs_cpu, row_ps_cpu, alpha=0.5, s=10)
+         plt.title("Gene-specific Dropouts (Smoothed)")
+         plt.xlabel("Observed")
+         plt.ylabel("Fit")
+         lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
+         plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
+         plt.grid(True); plt.legend()
+
+         plt.subplot(1, 2, 2)
+         plt.scatter(dis_cpu, col_ps_cpu, alpha=0.5, s=10)
+         plt.title("Cell-specific Dropouts (Smoothed)")
+         plt.xlabel("Observed")
+         plt.ylabel("Expected")
+         lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
+         plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
+         plt.grid(True); plt.legend()
+
+         plt.tight_layout()
+         if plot_filename:
+             plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+             print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
+         plt.show()
+         plt.close()
+
+     # Calculate errors
+     gene_error = np.sum((djs_cpu - row_ps_cpu)**2)
+     cell_error = np.sum((dis_cpu - col_ps_cpu)**2)
+
+     end_time = time.perf_counter()
+     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
+
+     return {
+         'gene_error': gene_error,
+         'cell_error': cell_error,
+         'rowPs': pd.Series(row_ps_cpu, index=fit['vals']['tjs'].index),
+         'colPs': pd.Series(col_ps_cpu, index=fit['vals']['tis'].index)
+     }
+
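The chunk loop above scores the fit through the negative binomial zero probability P(X=0) = (1 + mu/size)^(-size), with mu_ij = tj * ti / total; summing that probability over cells gives the expected dropouts per gene (rowPs), and over genes the expected dropouts per cell (colPs). The same computation, dense and in miniature (toy numbers, not package API):

    import numpy as np

    tjs = np.array([10.0, 40.0, 200.0])   # per-gene totals
    tis = np.array([50.0, 80.0, 120.0])   # per-cell totals
    total = tis.sum()
    size = np.array([0.5, 1.0, 2.0])      # smoothed per-gene dispersions

    mu = tjs[:, None] * tis[None, :] / total             # expected mean, gene j in cell i
    p0 = (1.0 + mu / size[:, None]) ** (-size[:, None])  # NB P(X = 0)

    row_ps = p0.sum(axis=1)   # expected dropouts per gene, compare to djs
    col_ps = p0.sum(axis=0)   # expected dropouts per cell, compare to dis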
+ def NBumiCompareModelsGPU(
+     raw_filename: str, # Kept for API compatibility, but functionally we use cleaned_filename for indices
+     cleaned_filename: str,
+     stats: dict,
+     fit_adjust: dict,
+     chunk_size: int = None,
+     suppress_plot=False,
+     plot_filename=None
+ ) -> dict:
+     """
+     OPTIMIZED VERSION (IN-MEMORY):
+     - Eliminates the 46GB '_basic_norm.h5ad' temporary file.
+     - Performs depth normalization and variance calculation on-the-fly in GPU VRAM.
+     - PRESERVED SCIENTIFIC LOGIC: Var(X) = E[X^2] - (E[X])^2 on normalized data.
+     """
+     pipeline_start_time = time.time()
+     print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")
+
+     # [GOVERNOR] High multiplier (12.0) because we hold Raw + Norm + Square in VRAM
+     if chunk_size is None:
+         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=12.0, is_dense=False)
+
+     # --- Phase 1: In-Memory "Basic Fit" (Normalization + Variance) ---
+     print("Phase [1/3]: Calculating Basic Model (Depth-Normalized) variance on-the-fly...")
+
+     # 1. Prepare Size Factors (CPU)
+     tjs = stats['tjs'].values # Gene sums
+     tis = stats['tis'].values # Cell sums
+     nc, ng = stats['nc'], stats['ng']
+
+     median_sum = np.median(tis[tis > 0])
+     size_factors = np.ones_like(tis, dtype=np.float32)
+     non_zero_mask = tis > 0
+     size_factors[non_zero_mask] = tis[non_zero_mask] / median_sum
+
+     # 2. Prepare GPU Arrays
+     sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
+     sum_x_gpu = cp.zeros(ng, dtype=cp.float64) # Need sum(x) to calc mean(x) for variance
+
+     # 3. GPU Loop (Raw Data -> Normalize -> Accumulate)
+     # CRITICAL: We read CLEANED_FILENAME to ensure indices match 'stats'
+     with h5py.File(cleaned_filename, 'r') as f_in:
+         h5_indptr = f_in['X']['indptr']
+         h5_data = f_in['X']['data']
+         h5_indices = f_in['X']['indices']
+
+         for i in range(0, nc, chunk_size):
+             end_row = min(i + chunk_size, nc)
+             print(f"Phase [1/3]: Processing: {end_row} of {nc} cells.", end='\r')
+
+             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
+             if start_idx == end_idx: continue
+
+             # Load Raw Chunk
+             data_gpu = cp.asarray(h5_data[start_idx:end_idx], dtype=cp.float32)
+             indices_gpu = cp.asarray(h5_indices[start_idx:end_idx])
+             indptr_gpu = cp.asarray(h5_indptr[i:end_row + 1] - start_idx)
+
+             # Expand Size Factors to match Data Structure
+             nnz_in_chunk = indptr_gpu[-1].item()
+             cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
+             if len(indptr_gpu) > 1:
+                 cell_boundary_markers[indptr_gpu[:-1]] = 1
+             # row_indices maps every data point to its cell index (0 to chunk_size)
+             row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
+
+             # Get size factors for this chunk
+             sf_chunk = cp.asarray(size_factors[i:end_row])
+
+             # --- THE MAGIC: On-the-Fly Normalization ---
+             # data_norm = data_raw / size_factor
+             data_gpu /= sf_chunk[row_indices]
+
+             # Accumulate for Variance: E[X^2] and E[X]
+             cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
+             cp.add.at(sum_x_gpu, indices_gpu, data_gpu)
+
+             # Clean up VRAM
+             del data_gpu, indices_gpu, indptr_gpu, row_indices, sf_chunk, cell_boundary_markers
+             cp.get_default_memory_pool().free_all_blocks()
+
+     print(f"Phase [1/3]: COMPLETE{' '*50}")
+
+     # 4. Finalize Basic Statistics
+     # Var(X) = E[X^2] - (E[X])^2
+     mean_x_sq_gpu = sum_x_sq_gpu / nc
+     mean_mu_gpu = sum_x_gpu / nc
+     my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
+
+     # Dispersion = Mean^2 / (Var - Mean)
+     size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
+
+     # Safety Clamping
+     max_size_val = cp.nanmax(size_gpu) * 10
+     if cp.isnan(max_size_val): max_size_val = 1000
+     size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
+     size_gpu[size_gpu < 1e-10] = 1e-10
+
+     # Construct "Basic Fit" Object
+     fit_basic = {
+         'sizes': pd.Series(size_gpu.get(), index=stats['tjs'].index),
+         'vals': stats,
+         'var_obs': pd.Series(my_rowvar_gpu.get(), index=stats['tjs'].index)
+     }
+
+     # --- Phase 2: Check Fit (Calculate Errors) ---
+     print("Phase [2/3]: Evaluating fit errors on ORIGINAL data...")
+
+     # Check Adjust (M3Drop) - uses its own governor
+     check_adjust = NBumiCheckFitFSGPU(
+         cleaned_filename, fit_adjust, suppress_plot=True
+     )
+
+     # Check Basic (Depth-Norm) - uses its own governor
+     check_basic = NBumiCheckFitFSGPU(
+         cleaned_filename, fit_basic, suppress_plot=True
+     )
+     print("Phase [2/3]: COMPLETE")
+
+     # --- Phase 3: Plotting & Comparison ---
+     print("Phase [3/3]: Generating comparison...")
+
+     nc_data = stats['nc']
+     mean_expr = stats['tjs'] / nc_data
+     observed_dropout = stats['djs'] / nc_data
+
+     adj_dropout_fit = check_adjust['rowPs'] / nc_data
+     bas_dropout_fit = check_basic['rowPs'] / nc_data
+
+     err_adj = np.sum(np.abs(adj_dropout_fit - observed_dropout))
+     err_bas = np.sum(np.abs(bas_dropout_fit - observed_dropout))
+
+     comparison_df = pd.DataFrame({
+         'mean_expr': mean_expr,
+         'observed': observed_dropout,
+         'adj_fit': adj_dropout_fit,
+         'bas_fit': bas_dropout_fit
+     })
+
+     plt.figure(figsize=(10, 6))
+     sorted_idx = np.argsort(mean_expr.values)
+
+     plt.scatter(mean_expr.iloc[sorted_idx], observed_dropout.iloc[sorted_idx],
+                 c='black', s=3, alpha=0.5, label='Observed')
+     plt.scatter(mean_expr.iloc[sorted_idx], bas_dropout_fit.iloc[sorted_idx],
+                 c='purple', s=3, alpha=0.6, label=f'Basic Fit (Error: {err_bas:.2f})')
+     plt.scatter(mean_expr.iloc[sorted_idx], adj_dropout_fit.iloc[sorted_idx],
+                 c='goldenrod', s=3, alpha=0.7, label=f'Depth-Adjusted Fit (Error: {err_adj:.2f})')
+
+     plt.xscale('log')
+     plt.xlabel("Mean Expression")
+     plt.ylabel("Dropout Rate")
+     plt.title("M3Drop Model Comparison")
+     plt.legend()
+     plt.grid(True, linestyle='--', alpha=0.3)
+
+     if plot_filename:
+         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+         print(f"STATUS: Model comparison plot saved to '{plot_filename}'")
+
+     if not suppress_plot:
+         plt.show()
+
+     plt.close()
+
+     pipeline_end_time = time.time()
+     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
+
+     return {
+         "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
+         "comparison_df": comparison_df
+     }
+
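The "Expand Size Factors" step above is a standard CSR trick: mark the first stored element of each row, take a cumulative sum to map every nonzero to its row, then index the per-cell size factors with that map. A minimal NumPy version (illustrative; like the loop above, it assumes every cell in the chunk has at least one nonzero):

    import numpy as np

    # CSR chunk with 3 cells and 6 stored values; indptr already shifted to start at 0
    data   = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    indptr = np.array([0, 2, 3, 6])

    markers = np.zeros(len(data), dtype=np.int32)
    markers[indptr[:-1]] = 1              # 1 at the first stored value of each cell
    row_indices = np.cumsum(markers) - 1  # -> [0, 0, 1, 2, 2, 2]

    size_factors = np.array([0.5, 1.0, 2.0])   # one per cell in the chunk
    data /= size_factors[row_indices]          # on-the-fly depth normalization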
+ def NBumiPlotDispVsMeanGPU(
+     fit: dict,
+     suppress_plot: bool = False,
+     plot_filename: str = None
+ ):
+     """
+     Generates a diagnostic plot of the dispersion vs. mean expression.
+     """
+     print("FUNCTION: NBumiPlotDispVsMean()")
+
+     # --- 1. Extract data and regression coefficients ---
+     mean_expression = fit['vals']['tjs'].values / fit['vals']['nc']
+     sizes = fit['sizes'].values
+     coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
+     intercept, slope = coeffs[0], coeffs[1]
+
+     # --- 2. Calculate the fitted line for plotting ---
+     log_mean_expr_range = np.linspace(
+         np.log(mean_expression[mean_expression > 0].min()),
+         np.log(mean_expression.max()),
+         100
+     )
+     log_fitted_sizes = intercept + slope * log_mean_expr_range
+     fitted_sizes = np.exp(log_fitted_sizes)
+
+     # --- 3. Create the plot ---
+     plt.figure(figsize=(8, 6))
+     plt.scatter(mean_expression, sizes, label='Observed Dispersion', alpha=0.5, s=8)
+     plt.plot(np.exp(log_mean_expr_range), fitted_sizes, color='red', label='Regression Fit', linewidth=2)
+
+     plt.xscale('log')
+     plt.yscale('log')
+     plt.xlabel('Mean Expression')
+     plt.ylabel('Dispersion Parameter (Sizes)')
+     plt.title('Dispersion vs. Mean Expression')
+     plt.legend()
+     plt.grid(True, which="both", linestyle='--', alpha=0.6)
+
+     if plot_filename:
+         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+         print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
+
+     if not suppress_plot:
+         plt.show()
+
+     plt.close()
+     print("FUNCTION: NBumiPlotDispVsMean() COMPLETE\n")
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
  setuptools.setup(
      name="M3Drop",
-     version="0.4.36", # Version bump
+     version="0.4.37", # Version bump
      author="Tallulah Andrews",
      author_email="tandrew6@uwo.ca",
      description="A Python implementation of the M3Drop single-cell RNA-seq analysis tool.",
@@ -1,171 +0,0 @@
- def NBumiCompareModelsGPU(
-     raw_filename: str,
-     cleaned_filename: str,
-     stats: dict,
-     fit_adjust: dict,
-     chunk_size: int = None,
-     suppress_plot=False,
-     plot_filename=None
- ) -> dict:
-     """
-     OPTIMIZED VERSION (IN-MEMORY):
-     - Eliminates the 46GB '_basic_norm.h5ad' temporary file.
-     - Performs depth normalization and variance calculation on-the-fly in GPU VRAM.
-     """
-     pipeline_start_time = time.time()
-     print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")
-
-     # [GOVERNOR] High multiplier (12.0) because we hold Raw + Norm + Square in VRAM
-     if chunk_size is None:
-         chunk_size = get_optimal_chunk_size(raw_filename, multiplier=12.0, is_dense=False)
-
-     # --- Phase 1: In-Memory "Basic Fit" (Normalization + Variance) ---
-     print("Phase [1/3]: Calculating Basic Model (Depth-Normalized) variance on-the-fly...")
-
-     # 1. Prepare Size Factors (CPU)
-     tjs = stats['tjs'].values # Gene sums (needed for final dataframe)
-     tis = stats['tis'].values # Cell sums (needed for size factors)
-     nc, ng = stats['nc'], stats['ng']
-
-     median_sum = np.median(tis[tis > 0])
-     size_factors = np.ones_like(tis, dtype=np.float32)
-     non_zero_mask = tis > 0
-     size_factors[non_zero_mask] = tis[non_zero_mask] / median_sum
-
-     # 2. Prepare GPU Arrays
-     sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
-     sum_x_gpu = cp.zeros(ng, dtype=cp.float64) # Need sum(x) to calc mean(x) for variance
-
-     # 3. GPU Loop (Raw Data -> Normalize -> Accumulate)
-     with h5py.File(raw_filename, 'r') as f_in:
-         h5_indptr = f_in['X']['indptr']
-         h5_data = f_in['X']['data']
-         h5_indices = f_in['X']['indices']
-
-         for i in range(0, nc, chunk_size):
-             end_row = min(i + chunk_size, nc)
-             print(f"Phase [1/3]: Processing: {end_row} of {nc} cells.", end='\r')
-
-             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-             if start_idx == end_idx: continue
-
-             # Load Raw Chunk
-             data_gpu = cp.asarray(h5_data[start_idx:end_idx], dtype=cp.float32)
-             indices_gpu = cp.asarray(h5_indices[start_idx:end_idx])
-             indptr_gpu = cp.asarray(h5_indptr[i:end_row + 1] - start_idx)
-
-             # Expand Size Factors to match Data Structure
-             # (Map cell's size factor to every non-zero gene in that cell)
-             nnz_in_chunk = indptr_gpu[-1].item()
-             cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
-             if len(indptr_gpu) > 1:
-                 cell_boundary_markers[indptr_gpu[:-1]] = 1
-             # row_indices maps every data point to its cell index (0 to chunk_size)
-             row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
-
-             # Get size factors for this chunk
-             sf_chunk = cp.asarray(size_factors[i:end_row])
-
-             # --- THE MAGIC: On-the-Fly Normalization ---
-             # data_norm = data_raw / size_factor
-             data_gpu /= sf_chunk[row_indices]
-
-             # Accumulate for Variance: E[X^2] and E[X]
-             cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
-             cp.add.at(sum_x_gpu, indices_gpu, data_gpu)
-
-             # Clean up VRAM
-             del data_gpu, indices_gpu, indptr_gpu, row_indices, sf_chunk, cell_boundary_markers
-             cp.get_default_memory_pool().free_all_blocks()
-
-     print(f"Phase [1/3]: COMPLETE{' '*50}")
-
-     # 4. Finalize Basic Statistics
-     # Var(X) = E[X^2] - (E[X])^2
-     mean_x_sq_gpu = sum_x_sq_gpu / nc
-     mean_mu_gpu = sum_x_gpu / nc
-     my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
-
-     # Dispersion = Mean^2 / (Var - Mean)
-     size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
-
-     # Safety Clamping (Same as original)
-     max_size_val = cp.nanmax(size_gpu) * 10
-     if cp.isnan(max_size_val): max_size_val = 1000
-     size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
-     size_gpu[size_gpu < 1e-10] = 1e-10
-
-     # Construct "Basic Fit" Object
-     fit_basic = {
-         'sizes': pd.Series(size_gpu.get(), index=stats['tjs'].index),
-         'vals': stats,
-         'var_obs': pd.Series(my_rowvar_gpu.get(), index=stats['tjs'].index)
-     }
-
-     # --- Phase 2: Check Fit (Calculate Errors) ---
-     print("Phase [2/3]: Evaluating fit errors on ORIGINAL data...")
-
-     # Check Adjust (M3Drop)
-     check_adjust = NBumiCheckFitFSGPU(
-         cleaned_filename, fit_adjust, suppress_plot=True, chunk_size=chunk_size
-     )
-
-     # Check Basic (Depth-Norm)
-     check_basic = NBumiCheckFitFSGPU(
-         cleaned_filename, fit_basic, suppress_plot=True, chunk_size=chunk_size
-     )
-     print("Phase [2/3]: COMPLETE")
-
-     # --- Phase 3: Plotting & Comparison ---
-     print("Phase [3/3]: Generating comparison...")
-
-     nc_data = stats['nc']
-     mean_expr = stats['tjs'] / nc_data
-     observed_dropout = stats['djs'] / nc_data
-
-     adj_dropout_fit = check_adjust['rowPs'] / nc_data
-     bas_dropout_fit = check_basic['rowPs'] / nc_data
-
-     err_adj = np.sum(np.abs(adj_dropout_fit - observed_dropout))
-     err_bas = np.sum(np.abs(bas_dropout_fit - observed_dropout))
-
-     comparison_df = pd.DataFrame({
-         'mean_expr': mean_expr,
-         'observed': observed_dropout,
-         'adj_fit': adj_dropout_fit,
-         'bas_fit': bas_dropout_fit
-     })
-
-     plt.figure(figsize=(10, 6))
-     sorted_idx = np.argsort(mean_expr.values)
-
-     plt.scatter(mean_expr.iloc[sorted_idx], observed_dropout.iloc[sorted_idx],
-                 c='black', s=3, alpha=0.5, label='Observed')
-     plt.scatter(mean_expr.iloc[sorted_idx], bas_dropout_fit.iloc[sorted_idx],
-                 c='purple', s=3, alpha=0.6, label=f'Basic Fit (Error: {err_bas:.2f})')
-     plt.scatter(mean_expr.iloc[sorted_idx], adj_dropout_fit.iloc[sorted_idx],
-                 c='goldenrod', s=3, alpha=0.7, label=f'Depth-Adjusted Fit (Error: {err_adj:.2f})')
-
-     plt.xscale('log')
-     plt.xlabel("Mean Expression")
-     plt.ylabel("Dropout Rate")
-     plt.title("M3Drop Model Comparison")
-     plt.legend()
-     plt.grid(True, linestyle='--', alpha=0.3)
-
-     if plot_filename:
-         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-         print(f"STATUS: Model comparison plot saved to '{plot_filename}'")
-
-     if not suppress_plot:
-         plt.show()
-
-     plt.close()
-
-     pipeline_end_time = time.time()
-     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
-
-     return {
-         "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
-         "comparison_df": comparison_df
-     }
7 files without changes