M3Drop 0.4.34__tar.gz → 0.4.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: M3Drop
- Version: 0.4.34
+ Version: 0.4.36
  Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
  Home-page: https://github.com/PragalvhaSharma/m3DropNew
  Author: Tallulah Andrews
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: M3Drop
- Version: 0.4.34
+ Version: 0.4.36
  Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
  Home-page: https://github.com/PragalvhaSharma/m3DropNew
  Author: Tallulah Andrews
@@ -0,0 +1,171 @@
+ def NBumiCompareModelsGPU(
+     raw_filename: str,
+     cleaned_filename: str,
+     stats: dict,
+     fit_adjust: dict,
+     chunk_size: int = None,
+     suppress_plot=False,
+     plot_filename=None
+ ) -> dict:
+     """
+     OPTIMIZED VERSION (IN-MEMORY):
+     - Eliminates the 46GB '_basic_norm.h5ad' temporary file.
+     - Performs depth normalization and variance calculation on-the-fly in GPU VRAM.
+     """
+     pipeline_start_time = time.time()
+     print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")
+
+     # [GOVERNOR] High multiplier (12.0) because we hold Raw + Norm + Square in VRAM
+     if chunk_size is None:
+         chunk_size = get_optimal_chunk_size(raw_filename, multiplier=12.0, is_dense=False)
+
+     # --- Phase 1: In-Memory "Basic Fit" (Normalization + Variance) ---
+     print("Phase [1/3]: Calculating Basic Model (Depth-Normalized) variance on-the-fly...")
+
+     # 1. Prepare Size Factors (CPU)
+     tjs = stats['tjs'].values  # Gene sums (needed for final dataframe)
+     tis = stats['tis'].values  # Cell sums (needed for size factors)
+     nc, ng = stats['nc'], stats['ng']
+
+     median_sum = np.median(tis[tis > 0])
+     size_factors = np.ones_like(tis, dtype=np.float32)
+     non_zero_mask = tis > 0
+     size_factors[non_zero_mask] = tis[non_zero_mask] / median_sum
+
+     # 2. Prepare GPU Arrays
+     sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
+     sum_x_gpu = cp.zeros(ng, dtype=cp.float64)  # Need sum(x) to calc mean(x) for variance
+
+     # 3. GPU Loop (Raw Data -> Normalize -> Accumulate)
+     with h5py.File(raw_filename, 'r') as f_in:
+         h5_indptr = f_in['X']['indptr']
+         h5_data = f_in['X']['data']
+         h5_indices = f_in['X']['indices']
+
+         for i in range(0, nc, chunk_size):
+             end_row = min(i + chunk_size, nc)
+             print(f"Phase [1/3]: Processing: {end_row} of {nc} cells.", end='\r')
+
+             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
+             if start_idx == end_idx: continue
+
+             # Load Raw Chunk
+             data_gpu = cp.asarray(h5_data[start_idx:end_idx], dtype=cp.float32)
+             indices_gpu = cp.asarray(h5_indices[start_idx:end_idx])
+             indptr_gpu = cp.asarray(h5_indptr[i:end_row + 1] - start_idx)
+
+             # Expand Size Factors to match Data Structure
+             # (Map cell's size factor to every non-zero gene in that cell)
+             nnz_in_chunk = indptr_gpu[-1].item()
+             cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
+             if len(indptr_gpu) > 1:
+                 cell_boundary_markers[indptr_gpu[:-1]] = 1
+             # row_indices maps every data point to its cell index (0 to chunk_size)
+             row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
+
+             # Get size factors for this chunk
+             sf_chunk = cp.asarray(size_factors[i:end_row])
+
+             # --- THE MAGIC: On-the-Fly Normalization ---
+             # data_norm = data_raw / size_factor
+             data_gpu /= sf_chunk[row_indices]
+
+             # Accumulate for Variance: E[X^2] and E[X]
+             cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
+             cp.add.at(sum_x_gpu, indices_gpu, data_gpu)
+
+             # Clean up VRAM
+             del data_gpu, indices_gpu, indptr_gpu, row_indices, sf_chunk, cell_boundary_markers
+             cp.get_default_memory_pool().free_all_blocks()
+
+     print(f"Phase [1/3]: COMPLETE{' '*50}")
+
+     # 4. Finalize Basic Statistics
+     # Var(X) = E[X^2] - (E[X])^2
+     mean_x_sq_gpu = sum_x_sq_gpu / nc
+     mean_mu_gpu = sum_x_gpu / nc
+     my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
+
+     # Dispersion = Mean^2 / (Var - Mean)
+     size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
+
+     # Safety Clamping (Same as original)
+     max_size_val = cp.nanmax(size_gpu) * 10
+     if cp.isnan(max_size_val): max_size_val = 1000
+     size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
+     size_gpu[size_gpu < 1e-10] = 1e-10
+
+     # Construct "Basic Fit" Object
+     fit_basic = {
+         'sizes': pd.Series(size_gpu.get(), index=stats['tjs'].index),
+         'vals': stats,
+         'var_obs': pd.Series(my_rowvar_gpu.get(), index=stats['tjs'].index)
+     }
+
+     # --- Phase 2: Check Fit (Calculate Errors) ---
+     print("Phase [2/3]: Evaluating fit errors on ORIGINAL data...")
+
+     # Check Adjust (M3Drop)
+     check_adjust = NBumiCheckFitFSGPU(
+         cleaned_filename, fit_adjust, suppress_plot=True, chunk_size=chunk_size
+     )
+
+     # Check Basic (Depth-Norm)
+     check_basic = NBumiCheckFitFSGPU(
+         cleaned_filename, fit_basic, suppress_plot=True, chunk_size=chunk_size
+     )
+     print("Phase [2/3]: COMPLETE")
+
+     # --- Phase 3: Plotting & Comparison ---
+     print("Phase [3/3]: Generating comparison...")
+
+     nc_data = stats['nc']
+     mean_expr = stats['tjs'] / nc_data
+     observed_dropout = stats['djs'] / nc_data
+
+     adj_dropout_fit = check_adjust['rowPs'] / nc_data
+     bas_dropout_fit = check_basic['rowPs'] / nc_data
+
+     err_adj = np.sum(np.abs(adj_dropout_fit - observed_dropout))
+     err_bas = np.sum(np.abs(bas_dropout_fit - observed_dropout))
+
+     comparison_df = pd.DataFrame({
+         'mean_expr': mean_expr,
+         'observed': observed_dropout,
+         'adj_fit': adj_dropout_fit,
+         'bas_fit': bas_dropout_fit
+     })
+
+     plt.figure(figsize=(10, 6))
+     sorted_idx = np.argsort(mean_expr.values)
+
+     plt.scatter(mean_expr.iloc[sorted_idx], observed_dropout.iloc[sorted_idx],
+                 c='black', s=3, alpha=0.5, label='Observed')
+     plt.scatter(mean_expr.iloc[sorted_idx], bas_dropout_fit.iloc[sorted_idx],
+                 c='purple', s=3, alpha=0.6, label=f'Basic Fit (Error: {err_bas:.2f})')
+     plt.scatter(mean_expr.iloc[sorted_idx], adj_dropout_fit.iloc[sorted_idx],
+                 c='goldenrod', s=3, alpha=0.7, label=f'Depth-Adjusted Fit (Error: {err_adj:.2f})')
+
+     plt.xscale('log')
+     plt.xlabel("Mean Expression")
+     plt.ylabel("Dropout Rate")
+     plt.title("M3Drop Model Comparison")
+     plt.legend()
+     plt.grid(True, linestyle='--', alpha=0.3)
+
+     if plot_filename:
+         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+         print(f"STATUS: Model comparison plot saved to '{plot_filename}'")
+
+     if not suppress_plot:
+         plt.show()
+
+     plt.close()
+
+     pipeline_end_time = time.time()
+     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
+
+     return {
+         "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
+         "comparison_df": comparison_df
+     }
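
The heart of the new code path is Phase 1: instead of writing a depth-normalized copy of the matrix to disk, each raw CSR chunk is divided by its cells' size factors in VRAM, and only the per-gene running sums of x and x^2 survive the loop, which is all that Var(X) = E[X^2] - E[X]^2 and the method-of-moments dispersion size = mu^2 / (Var - mu) require. A minimal CPU sketch of the same idea, using NumPy/SciPy in place of CuPy (toy data; variable names are illustrative, not taken from the package):

    import numpy as np
    from scipy.sparse import random as sparse_random

    rng = np.random.default_rng(0)
    # Toy (cells x genes) CSR counts standing in for the h5ad-backed matrix.
    X = sparse_random(6, 4, density=0.5, format="csr", random_state=0,
                      data_rvs=lambda n: rng.integers(1, 10, n)).astype(np.float32)
    nc, ng = X.shape

    tis = np.asarray(X.sum(axis=1)).ravel()               # per-cell totals
    size_factors = np.where(tis > 0, tis / np.median(tis[tis > 0]), 1.0)

    sum_x = np.zeros(ng)                                  # running sum(x) per gene
    sum_x_sq = np.zeros(ng)                               # running sum(x^2) per gene

    for i in range(0, nc, 3):                             # chunked pass over cells
        sub = X[i:i + 3]
        # CPU equivalent of the cumsum-over-boundary-markers trick above:
        # map every stored value to the row (cell) it belongs to.
        rows = np.repeat(np.arange(sub.shape[0]), np.diff(sub.indptr))
        data = sub.data / size_factors[i:i + 3][rows]     # on-the-fly normalization
        np.add.at(sum_x, sub.indices, data)
        np.add.at(sum_x_sq, sub.indices, data ** 2)

    mu = sum_x / nc                                       # E[X] per gene
    var = sum_x_sq / nc - mu ** 2                         # Var(X) = E[X^2] - E[X]^2
    with np.errstate(divide="ignore", invalid="ignore"):
        size = mu ** 2 / (var - mu)                       # NB dispersion (pre-clamping)

Zeros contribute nothing to either running sum, so dividing by the full cell count nc yields the exact dense-equivalent moments; genes with Var <= mu produce non-finite or negative sizes, which is precisely what the "Safety Clamping" block in the diff handles.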
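
The get_optimal_chunk_size "governor" imported from .coreGPU is not part of this diff, so only its call signature is known here. As a rough sketch of what such a governor plausibly does (hypothetical code, not the package's implementation): choose a cell count whose per-chunk working set, inflated by the multiplier to cover the simultaneous copies noted in the comment above (raw, normalized, squared), fits in free VRAM.

    import h5py

    # Hypothetical stand-in for coreGPU.get_optimal_chunk_size; the real
    # function's internals are not shown in this diff.
    def sketch_optimal_chunk_size(filename: str, multiplier: float,
                                  free_vram_bytes: int = 8 * 1024**3) -> int:
        with h5py.File(filename, "r") as f:
            nnz = f["X"]["data"].shape[0]         # stored values in the CSR file
            nc = f["X"]["indptr"].shape[0] - 1    # number of cells (rows)
        # float32 data + int32 indices per stored value, scaled by the
        # multiplier for the extra per-chunk copies held in VRAM.
        bytes_per_cell = (nnz / nc) * 8 * multiplier
        return max(1, int(free_vram_bytes // bytes_per_cell))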
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

  setuptools.setup(
      name="M3Drop",
-     version="0.4.34",  # Version bump
+     version="0.4.36",  # Version bump
      author="Tallulah Andrews",
      author_email="tandrew6@uwo.ca",
      description="A Python implementation of the M3Drop single-cell RNA-seq analysis tool.",
@@ -1,503 +0,0 @@
- import numpy as np
- import pandas as pd
- import cupy as cp
- import cupyx.scipy.sparse as csp
- import matplotlib.pyplot as plt
- import h5py
- import os
- import time
- import psutil
- import gc
- from scipy import sparse
- from scipy import stats
- import anndata  # <--- FIXED: Added missing import
-
- # [GOVERNOR INTEGRATION] Added get_optimal_chunk_size
- from .coreGPU import hidden_calc_valsGPU, NBumiFitModelGPU, NBumiFitDispVsMeanGPU, get_optimal_chunk_size
- from cupy.sparse import csr_matrix as cp_csr_matrix
- import scipy.sparse as sp
- from scipy.sparse import csr_matrix as sp_csr_matrix
-
- import statsmodels.api as sm
- from scipy.stats import norm
- from statsmodels.stats.multitest import multipletests
-
- def NBumiFitBasicModelGPU(
-     cleaned_filename: str,
-     stats: dict,
-     is_logged=False,
-     chunk_size: int = None
- ) -> dict:
-     """
-     Fits a simpler, unadjusted NB model out-of-core using a GPU-accelerated
-     algorithm. Designed to work with a standard (cell, gene) sparse matrix.
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiFitBasicModel() | FILE: {cleaned_filename}")
-
-     # [GOVERNOR INTEGRATION] Calculate optimal chunk size if not provided
-     if chunk_size is None:
-         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=3.0, is_dense=True)
-
-     # --- Phase 1: Initialization ---
-     print("Phase [1/2]: Initializing parameters and arrays on GPU...")
-     tjs = stats['tjs'].values
-     nc, ng = stats['nc'], stats['ng']
-
-     tjs_gpu = cp.asarray(tjs, dtype=cp.float64)
-     sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
-     print("Phase [1/2]: COMPLETE")
-
-     # --- Phase 2: Calculate Variance from Data Chunks ---
-     print("Phase [2/2]: Calculating variance from data chunks...")
-     with h5py.File(cleaned_filename, 'r') as f_in:
-         x_group = f_in['X']
-         h5_indptr = x_group['indptr']
-         h5_data = x_group['data']
-         h5_indices = x_group['indices']
-
-         for i in range(0, nc, chunk_size):
-             end_row = min(i + chunk_size, nc)
-             print(f"Phase [2/2]: Processing: {end_row} of {nc} cells.", end='\r')
-
-             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-             if start_idx == end_idx:
-                 continue
-
-             # Process in smaller sub-chunks if needed
-             max_elements = 5_000_000  # Process max 5M elements at a time
-
-             if end_idx - start_idx > max_elements:
-                 # Process in sub-chunks
-                 for sub_start in range(start_idx, end_idx, max_elements):
-                     sub_end = min(sub_start + max_elements, end_idx)
-
-                     data_slice = h5_data[sub_start:sub_end]
-                     indices_slice = h5_indices[sub_start:sub_end]
-
-                     data_gpu = cp.asarray(data_slice, dtype=cp.float64)
-                     indices_gpu = cp.asarray(indices_slice)
-
-                     # Accumulate the sum of squares for each gene
-                     cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
-
-                     # Free GPU memory
-                     del data_gpu, indices_gpu
-                     cp.get_default_memory_pool().free_all_blocks()
-             else:
-                 # Original processing for smaller chunks
-                 data_slice = h5_data[start_idx:end_idx]
-                 indices_slice = h5_indices[start_idx:end_idx]
-
-                 data_gpu = cp.asarray(data_slice, dtype=cp.float64)
-                 indices_gpu = cp.asarray(indices_slice)
-
-                 # Accumulate the sum of squares for each gene
-                 cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
-
-                 # Clean up
-                 del data_gpu, indices_gpu
-                 cp.get_default_memory_pool().free_all_blocks()
-
-     print(f"Phase [2/2]: COMPLETE ")
-
-     # --- Final calculations on GPU ---
-     if is_logged:
-         raise NotImplementedError("Logged data variance calculation is not implemented for out-of-core.")
-     else:
-         # Variance of raw data: Var(X) = E[X^2] - E[X]^2
-         mean_x_sq_gpu = sum_x_sq_gpu / nc
-         mean_mu_gpu = tjs_gpu / nc
-         my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
-
-         # Calculate dispersion ('size')
-         size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
-
-         max_size_val = cp.nanmax(size_gpu) * 10
-         if cp.isnan(max_size_val):
-             max_size_val = 1000
-         size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
-         size_gpu[size_gpu < 1e-10] = 1e-10
-
-     # Move results to CPU
-     my_rowvar_cpu = my_rowvar_gpu.get()
-     sizes_cpu = size_gpu.get()
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-     return {
-         'var_obs': pd.Series(my_rowvar_cpu, index=stats['tjs'].index),
-         'sizes': pd.Series(sizes_cpu, index=stats['tjs'].index),
-         'vals': stats
-     }
-
- def NBumiCheckFitFSGPU(
-     cleaned_filename: str,
-     fit: dict,
-     chunk_size: int = None,
-     suppress_plot=False,
-     plot_filename=None
- ) -> dict:
-     """
-     FIXED VERSION - No cupy.errstate, proper GPU computation.
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiCheckFitFS() | FILE: {cleaned_filename}")
-
-     # [GOVERNOR INTEGRATION] Adaptive chunk sizing
-     if chunk_size is None:
-         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=5.0, is_dense=True)
-
-     # --- Phase 1: Initialization ---
-     print("Phase [1/2]: Initializing parameters and arrays on GPU...")
-     vals = fit['vals']
-     size_coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
-
-     # Must use float64 for precision
-     tjs_gpu = cp.asarray(vals['tjs'].values, dtype=cp.float64)
-     tis_gpu = cp.asarray(vals['tis'].values, dtype=cp.float64)
-     total = vals['total']
-     nc, ng = vals['nc'], vals['ng']
-
-     # Calculate smoothed size
-     mean_expression_gpu = tjs_gpu / nc
-     log_mean_expression_gpu = cp.log(mean_expression_gpu)
-     smoothed_size_gpu = cp.exp(size_coeffs[0] + size_coeffs[1] * log_mean_expression_gpu)
-
-     # Initialize result arrays
-     row_ps_gpu = cp.zeros(ng, dtype=cp.float64)
-     col_ps_gpu = cp.zeros(nc, dtype=cp.float64)
-     print("Phase [1/2]: COMPLETE")
-
-     # --- Phase 2: Calculate Expected Dropouts ---
-     print("Phase [2/2]: Calculating expected dropouts from data chunks...")
-
-     # [GOVERNOR INTEGRATION] Removed naive calculation, utilizing Governor's chunk_size
-     optimal_chunk = chunk_size
-     print(f" Using governor chunk size: {optimal_chunk}")
-
-     for i in range(0, nc, optimal_chunk):
-         end_col = min(i + optimal_chunk, nc)
-         print(f"Phase [2/2]: Processing: {end_col} of {nc} cells.", end='\r')
-
-         tis_chunk_gpu = tis_gpu[i:end_col]
-
-         # Standard calculation without errstate
-         mu_chunk_gpu = tjs_gpu[:, cp.newaxis] * tis_chunk_gpu[cp.newaxis, :] / total
-
-         # Calculate p_is directly - CuPy handles overflow internally
-         base = 1 + mu_chunk_gpu / smoothed_size_gpu[:, cp.newaxis]
-         p_is_chunk_gpu = cp.power(base, -smoothed_size_gpu[:, cp.newaxis])
-
-         # Handle any inf/nan values that might have occurred
-         p_is_chunk_gpu = cp.nan_to_num(p_is_chunk_gpu, nan=0.0, posinf=1.0, neginf=0.0)
-
-         # Sum results
-         row_ps_gpu += p_is_chunk_gpu.sum(axis=1)
-         col_ps_gpu[i:end_col] = p_is_chunk_gpu.sum(axis=0)
-
-         # Clean up
-         del mu_chunk_gpu, p_is_chunk_gpu, base, tis_chunk_gpu
-
-         # Periodic memory cleanup
-         mempool = cp.get_default_memory_pool()
-         if (i // optimal_chunk) % 10 == 0:
-             mempool.free_all_blocks()
-
-     print(f"Phase [2/2]: COMPLETE{' ' * 50}")
-
-     # Move results to CPU
-     row_ps_cpu = row_ps_gpu.get()
-     col_ps_cpu = col_ps_gpu.get()
-     djs_cpu = vals['djs'].values
-     dis_cpu = vals['dis'].values
-
-     # Plotting
-     if not suppress_plot:
-         plt.figure(figsize=(12, 5))
-         plt.subplot(1, 2, 1)
-         plt.scatter(djs_cpu, row_ps_cpu, alpha=0.5, s=10)
-         plt.title("Gene-specific Dropouts (Smoothed)")
-         plt.xlabel("Observed")
-         plt.ylabel("Fit")
-         lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
-         plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
-         plt.grid(True); plt.legend()
-
-         plt.subplot(1, 2, 2)
-         plt.scatter(dis_cpu, col_ps_cpu, alpha=0.5, s=10)
-         plt.title("Cell-specific Dropouts (Smoothed)")
-         plt.xlabel("Observed")
-         plt.ylabel("Expected")
-         lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
-         plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
-         plt.grid(True); plt.legend()
-
-         plt.tight_layout()
-         if plot_filename:
-             plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-             print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
-         plt.show()
-         plt.close()
-
-     # Calculate errors
-     gene_error = np.sum((djs_cpu - row_ps_cpu)**2)
-     cell_error = np.sum((dis_cpu - col_ps_cpu)**2)
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-     return {
-         'gene_error': gene_error,
-         'cell_error': cell_error,
-         'rowPs': pd.Series(row_ps_cpu, index=fit['vals']['tjs'].index),
-         'colPs': pd.Series(col_ps_cpu, index=fit['vals']['tis'].index)
-     }
-
- def NBumiCompareModelsGPU(
-     raw_filename: str,
-     cleaned_filename: str,
-     stats: dict,
-     fit_adjust: dict,
-     chunk_size: int = None,
-     suppress_plot=False,
-     plot_filename=None
- ) -> dict:
-     """
-     OPTIMIZED VERSION - Faster normalization and sparse matrix writing.
-     """
-     pipeline_start_time = time.time()
-     print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")
-
-     # [GOVERNOR INTEGRATION] Calculate chunk size for normalization phase (heavy IO)
-     if chunk_size is None:
-         # Multiplier 10.0 for safety during normalization of massive dense expansion
-         chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=10.0, is_dense=True)
-
-     # --- Phase 1: OPTIMIZED Normalization ---
-     print("Phase [1/4]: Creating temporary 'basic' normalized data file...")
-     basic_norm_filename = cleaned_filename.replace('.h5ad', '_basic_norm.h5ad')
-
-     # Read metadata. In 'backed' mode, this keeps a file handle open.
-     adata_meta = anndata.read_h5ad(cleaned_filename, backed='r')
-     nc, ng = adata_meta.shape
-     obs_df = adata_meta.obs.copy()
-     var_df = adata_meta.var.copy()
-
-     cell_sums = stats['tis'].values
-     median_sum = np.median(cell_sums[cell_sums > 0])
-
-     # Avoid division by zero for cells with zero counts
-     size_factors = np.ones_like(cell_sums, dtype=np.float32)
-     non_zero_mask = cell_sums > 0
-     size_factors[non_zero_mask] = cell_sums[non_zero_mask] / median_sum
-
-     adata_out = anndata.AnnData(obs=obs_df, var=var_df)
-     adata_out.write_h5ad(basic_norm_filename, compression="gzip")
-
-     with h5py.File(basic_norm_filename, 'a') as f_out:
-         if 'X' in f_out:
-             del f_out['X']
-         x_group_out = f_out.create_group('X')
-         x_group_out.attrs['encoding-type'] = 'csr_matrix'
-         x_group_out.attrs['encoding-version'] = '0.1.0'
-         x_group_out.attrs['shape'] = np.array([nc, ng], dtype='int64')
-
-         out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
-         out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
-         out_indptr = x_group_out.create_dataset('indptr', shape=(nc + 1,), dtype='int64')
-         out_indptr[0] = 0
-         current_nnz = 0
-
-         with h5py.File(cleaned_filename, 'r') as f_in:
-             h5_indptr = f_in['X']['indptr']
-             h5_data = f_in['X']['data']
-             h5_indices = f_in['X']['indices']
-
-             for i in range(0, nc, chunk_size):
-                 end_row = min(i + chunk_size, nc)
-                 print(f"Phase [1/4]: Normalizing: {end_row} of {nc} cells.", end='\r')
-
-                 start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-                 if start_idx == end_idx:
-                     out_indptr[i + 1 : end_row + 1] = current_nnz
-                     continue
-
-                 # Read data for the chunk
-                 data_slice = h5_data[start_idx:end_idx]
-                 indices_slice = h5_indices[start_idx:end_idx]
-                 indptr_slice = h5_indptr[i:end_row + 1] - start_idx
-
-                 # Move to GPU for fast normalization
-                 data_gpu = cp.asarray(data_slice.copy(), dtype=cp.float32)
-
-                 indptr_gpu = cp.asarray(indptr_slice.copy())
-                 nnz_in_chunk = indptr_gpu[-1].item()
-                 cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
-                 if len(indptr_gpu) > 1:
-                     cell_boundary_markers[indptr_gpu[:-1]] = 1
-                 row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
-
-                 size_factors_for_chunk = cp.asarray(size_factors[i:end_row])
-
-                 data_gpu /= size_factors_for_chunk[row_indices]
-
-                 data_cpu = np.round(data_gpu.get())
-
-                 num_cells_in_chunk = end_row - i
-                 chunk_sp = sp_csr_matrix((data_cpu, indices_slice, indptr_slice),
-                                          shape=(num_cells_in_chunk, ng))
-
-                 nnz_chunk = chunk_sp.nnz
-                 out_data.resize(current_nnz + nnz_chunk, axis=0)
-                 out_data[current_nnz:] = chunk_sp.data
-
-                 out_indices.resize(current_nnz + nnz_chunk, axis=0)
-                 out_indices[current_nnz:] = chunk_sp.indices
-
-                 new_indptr_list = chunk_sp.indptr[1:].astype(np.int64) + current_nnz
-                 out_indptr[i + 1 : end_row + 1] = new_indptr_list
-
-                 current_nnz += nnz_chunk
-
-                 del data_gpu, row_indices, size_factors_for_chunk, indptr_gpu
-                 cp.get_default_memory_pool().free_all_blocks()
-
-     print(f"Phase [1/4]: COMPLETE{' '*50}")
-
-     print("Phase [2/4]: Fitting Basic Model on normalized data...")
-
-     # [GOVERNOR INTEGRATION] Calculate chunk size for basic fit on the heavy normalized file
-     chunk_size_basic = get_optimal_chunk_size(basic_norm_filename, multiplier=10.0, is_dense=True)
-
-     stats_basic = hidden_calc_valsGPU(basic_norm_filename)  # hidden_calc uses its own governor internally
-     fit_basic = NBumiFitBasicModelGPU(basic_norm_filename, stats_basic, chunk_size=chunk_size_basic)
-     print("Phase [2/4]: COMPLETE")
-
-     print("Phase [3/4]: Evaluating fits of both models on ORIGINAL data...")
-     # [GOVERNOR INTEGRATION] Chunk size for check fit
-     chunk_size_check = get_optimal_chunk_size(cleaned_filename, multiplier=5.0, is_dense=True)
-
-     check_adjust = NBumiCheckFitFSGPU(cleaned_filename, fit_adjust, suppress_plot=True, chunk_size=chunk_size_check)
-
-     fit_basic_for_eval = {
-         'sizes': fit_basic['sizes'],
-         'vals': stats,
-         'var_obs': fit_basic['var_obs']
-     }
-     check_basic = NBumiCheckFitFSGPU(cleaned_filename, fit_basic_for_eval, suppress_plot=True, chunk_size=chunk_size_check)
-     print("Phase [3/4]: COMPLETE")
-
-     print("Phase [4/4]: Generating final comparison...")
-     nc_data = stats['nc']
-     mean_expr = stats['tjs'] / nc_data
-     observed_dropout = stats['djs'] / nc_data
-
-     adj_dropout_fit = check_adjust['rowPs'] / nc_data
-     bas_dropout_fit = check_basic['rowPs'] / nc_data
-
-     err_adj = np.sum(np.abs(adj_dropout_fit - observed_dropout))
-     err_bas = np.sum(np.abs(bas_dropout_fit - observed_dropout))
-
-     comparison_df = pd.DataFrame({
-         'mean_expr': mean_expr,
-         'observed': observed_dropout,
-         'adj_fit': adj_dropout_fit,
-         'bas_fit': bas_dropout_fit
-     })
-
-     plt.figure(figsize=(10, 6))
-     sorted_idx = np.argsort(mean_expr.values)
-
-     plt.scatter(mean_expr.iloc[sorted_idx], observed_dropout.iloc[sorted_idx],
-                 c='black', s=3, alpha=0.5, label='Observed')
-     plt.scatter(mean_expr.iloc[sorted_idx], bas_dropout_fit.iloc[sorted_idx],
-                 c='purple', s=3, alpha=0.6, label=f'Basic Fit (Error: {err_bas:.2f})')
-     plt.scatter(mean_expr.iloc[sorted_idx], adj_dropout_fit.iloc[sorted_idx],
-                 c='goldenrod', s=3, alpha=0.7, label=f'Depth-Adjusted Fit (Error: {err_adj:.2f})')
-
-     plt.xscale('log')
-     plt.xlabel("Mean Expression")
-     plt.ylabel("Dropout Rate")
-     plt.title("M3Drop Model Comparison")
-     plt.legend()
-     plt.grid(True, linestyle='--', alpha=0.3)
-
-     if plot_filename:
-         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-         print(f"STATUS: Model comparison plot saved to '{plot_filename}'")
-
-     if not suppress_plot:
-         plt.show()
-
-     plt.close()
-     print("Phase [4/4]: COMPLETE")
-
-     pipeline_end_time = time.time()
-
-     # --- ADD THIS LINE TO FIX THE ERROR ---
-     adata_meta.file.close()  # Explicitly close the file handle
-
-     os.remove(basic_norm_filename)
-     print(f"STATUS: Temporary file '{basic_norm_filename}' removed.")
-     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
-
-     return {
-         "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
-         "comparison_df": comparison_df
-     }
-
- def NBumiPlotDispVsMeanGPU(
-     fit: dict,
-     suppress_plot: bool = False,
-     plot_filename: str = None
- ):
-     """
-     Generates a diagnostic plot of the dispersion vs. mean expression.
-
-     Args:
-         fit (dict): The 'fit' object from NBumiFitModelGPU.
-         suppress_plot (bool): If True, the plot will not be displayed on screen.
-         plot_filename (str, optional): Path to save the plot. If None, not saved.
-     """
-     print("FUNCTION: NBumiPlotDispVsMean()")
-
-     # --- 1. Extract data and regression coefficients ---
-     mean_expression = fit['vals']['tjs'].values / fit['vals']['nc']
-     sizes = fit['sizes'].values
-     coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
-     intercept, slope = coeffs[0], coeffs[1]
-
-     # --- 2. Calculate the fitted line for plotting ---
-     # Create a smooth, continuous line using the regression coefficients
-     log_mean_expr_range = np.linspace(
-         np.log(mean_expression[mean_expression > 0].min()),
-         np.log(mean_expression.max()),
-         100
-     )
-     log_fitted_sizes = intercept + slope * log_mean_expr_range
-     fitted_sizes = np.exp(log_fitted_sizes)
-
-     # --- 3. Create the plot ---
-     plt.figure(figsize=(8, 6))
-     plt.scatter(mean_expression, sizes, label='Observed Dispersion', alpha=0.5, s=8)
-     plt.plot(np.exp(log_mean_expr_range), fitted_sizes, color='red', label='Regression Fit', linewidth=2)
-
-     plt.xscale('log')
-     plt.yscale('log')
-     plt.xlabel('Mean Expression')
-     plt.ylabel('Dispersion Parameter (Sizes)')
-     plt.title('Dispersion vs. Mean Expression')
-     plt.legend()
-     plt.grid(True, which="both", linestyle='--', alpha=0.6)
-
-     if plot_filename:
-         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-         print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
-
-     if not suppress_plot:
-         plt.show()
-
-     plt.close()
-     print("FUNCTION: NBumiPlotDispVsMean() COMPLETE\n")
7 files without changes