M3Drop 0.4.41-py3-none-any.whl → 0.4.44-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,407 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import h5py
+import os
+import time
+import pickle
+import gc
+from scipy import sparse
+from scipy import stats
+import anndata
+
+import statsmodels.api as sm
+from scipy.stats import norm
+from statsmodels.stats.multitest import multipletests
+
+# [REFACTOR] Relative Imports
+try:
+    from .ControlDeviceCPU import ControlDevice
+    # Import the Numba-optimized kernel from CoreCPU
+    from .CoreCPU import hidden_calc_valsCPU, NBumiFitModelCPU, NBumiFitDispVsMeanCPU, dropout_prob_kernel_cpu
+except ImportError:
+    # Fallback
+    from ControlDeviceCPU import ControlDevice
+    from CoreCPU import hidden_calc_valsCPU, NBumiFitModelCPU, NBumiFitDispVsMeanCPU, dropout_prob_kernel_cpu
+
+# ==========================================
+# DIAGNOSTICS & COMPARISON (CPU)
+# ==========================================
+
+def NBumiFitBasicModelCPU(
+    filename: str,
+    stats: dict,
+    mask_filename: str = None,
+    mode: str = "auto",
+    manual_target: int = 3000,
+    phase_label: str = "Phase [1/1]",
+    desc_label: str = None
+) -> dict:
+    """
+    Fits the Basic Model by calculating Normalized Variance ON-THE-FLY (CPU Optimized).
+    STRICT FLOAT64 ENFORCEMENT.
+    """
+    # 1. Get Raw Dimensions & Setup ControlDevice
+    with h5py.File(filename, 'r') as f:
+        indptr_cpu = f['X']['indptr'][:]
+        total_rows = len(indptr_cpu) - 1
+        raw_ng = f['X'].attrs['shape'][1]
+
+    device = ControlDevice(
+        indptr=indptr_cpu,
+        total_rows=total_rows,
+        n_genes=raw_ng,
+        mode=mode,
+        manual_target=manual_target
+    )
+    nc = device.total_rows
+
+    if desc_label:
+        print(f"{phase_label}: {desc_label}")
+
+    # 2. Load Mask
+    if mask_filename and os.path.exists(mask_filename):
+        with open(mask_filename, 'rb') as f:
+            mask = pickle.load(f)
+    else:
+        mask = np.ones(raw_ng, dtype=bool)
+
+    filtered_ng = int(np.sum(mask))
+
+    # 3. Pre-calculate Size Factors
+    cell_sums = stats['tis'].values
+    median_sum = np.median(cell_sums[cell_sums > 0])
+
+    # [FLOAT64] Explicit enforcement
+    size_factors = np.ones_like(cell_sums, dtype=np.float64)
+    non_zero_mask = cell_sums > 0
+    size_factors[non_zero_mask] = cell_sums[non_zero_mask] / median_sum
+
+    # 4. Init Accumulators
+    sum_norm_x = np.zeros(filtered_ng, dtype=np.float64)
+    sum_norm_sq = np.zeros(filtered_ng, dtype=np.float64)
+
+    with h5py.File(filename, 'r') as f_in:
+        h5_indptr = f_in['X']['indptr']
+        h5_data = f_in['X']['data']
+        h5_indices = f_in['X']['indices']
+
+        current_row = 0
+        while current_row < nc:
+            # CPU prefers dense chunks if they fit in L3, but sparse is safer for RAM.
+            # We use 'dense' mode here because we convert to dense for normalization anyway.
+            end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=1.5)
+            if end_row is None or end_row <= current_row: break
+
+            chunk_size = end_row - current_row
+            print(f"{phase_label}: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+            start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
+            if start_idx == end_idx:
+                current_row = end_row
+                continue
+
+            # [FLOAT64] Load Raw Chunk
+            data = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
+            indices = np.array(h5_indices[start_idx:end_idx])
+            indptr = np.array(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
+
+            # Reconstruct CSR & Filter
+            raw_chunk = sparse.csr_matrix((data, indices, indptr), shape=(chunk_size, raw_ng))
+            filtered_chunk = raw_chunk[:, mask]
+
+            # Normalization (Vectorized CPU)
+            sf_chunk = size_factors[current_row:end_row]
+
+            # Scipy sparse multiplication is efficient
+            # D = diag(1/sf)
+            recip_sf = 1.0 / sf_chunk
+            D = sparse.diags(recip_sf)
+            norm_chunk = D.dot(filtered_chunk)
+
+            # Rounding (in-place on data array)
+            np.round(norm_chunk.data, out=norm_chunk.data)
+
+            # Accumulate
+            # Convert to dense for summation if chunk is small (faster on CPU)
+            # or keep sparse if very large. Given L3 optimization, dense is often fine.
+            norm_dense = norm_chunk.toarray()
+
+            sum_norm_x += norm_dense.sum(axis=0)
+            sum_norm_sq += (norm_dense ** 2).sum(axis=0)
+
+            current_row = end_row
+
+    # Final Calculations
+    mean_norm = sum_norm_x / nc
+    mean_sq_norm = sum_norm_sq / nc
+    var_norm = mean_sq_norm - (mean_norm ** 2)
+
+    denom = var_norm - mean_norm
+    sizes = np.full(filtered_ng, 1000.0, dtype=np.float64)
+    valid_mask = denom > 1e-6
+    sizes[valid_mask] = mean_norm[valid_mask]**2 / denom[valid_mask]
+
+    # Filtering outliers (Numpy version)
+    with np.errstate(invalid='ignore'):
+        max_size_val = np.nanmax(sizes[sizes < 1e6]) * 10
+
+    if np.isnan(max_size_val) or max_size_val == 0: max_size_val = 1000.0
+    sizes[np.isnan(sizes) | (sizes <= 0)] = max_size_val
+    sizes[sizes < 1e-10] = 1e-10
+
+    print("")
+    print(f"{phase_label}: COMPLETE")
+
+    return {
+        'var_obs': pd.Series(var_norm, index=stats['tjs'].index),
+        'sizes': pd.Series(sizes, index=stats['tjs'].index),
+        'vals': stats
+    }
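NBumiFitBasicModelCPU streams the matrix in CSR chunks and never materializes it: it accumulates per-gene sums of x and x^2 and recovers the variance from the identity Var[X] = E[X^2] - (E[X])^2, then estimates the negative binomial size by method of moments, size = mu^2 / (var - mu), with floor/ceiling passes to tame degenerate genes. A minimal usage sketch, assuming an AnnData-style HDF5 file (CSR arrays under X/data, X/indices, X/indptr) and assuming the stats dict (keys 'tis', 'tjs', 'djs', 'total', 'nc', 'ng') comes from hidden_calc_valsCPU, whose signature is not part of this diff:

    stats = hidden_calc_valsCPU("counts.h5ad")   # assumed helper from CoreCPU
    fit_basic = NBumiFitBasicModelCPU(
        "counts.h5ad",
        stats,
        mask_filename=None,   # optional pickled boolean gene mask
        mode="auto",
    )
    print(fit_basic['sizes'].head())   # per-gene NB size estimates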
+
+def NBumiCheckFitFSCPU(
+    filename: str,
+    fit: dict,
+    mode: str = "auto",
+    manual_target: int = 3000,
+    suppress_plot=False,
+    plot_filename=None,
+    phase_label="Phase [1/1]",
+    desc_label: str = None
+) -> dict:
+    """
+    Calculates expected dropouts using NUMBA KERNEL on CPU.
+    """
+    vals = fit['vals']
+    ng = vals['ng']
+
+    with h5py.File(filename, 'r') as f:
+        indptr_cpu = f['X']['indptr'][:]
+        total_rows = len(indptr_cpu) - 1
+
+    device = ControlDevice(
+        indptr=indptr_cpu,
+        total_rows=total_rows,
+        n_genes=ng,
+        mode=mode,
+        manual_target=manual_target
+    )
+    nc = device.total_rows
+
+    if desc_label:
+        print(f"{phase_label}: {desc_label}")
+
+    size_coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
+
+    tjs = vals['tjs'].values.astype(np.float64)
+    tis = vals['tis'].values.astype(np.float64)
+    total = vals['total']
+
+    mean_expression = tjs / nc
+    log_mean_expression = np.zeros_like(mean_expression)
+    valid_means = mean_expression > 0
+    log_mean_expression[valid_means] = np.log(mean_expression[valid_means])
+    smoothed_size = np.exp(size_coeffs[0] + size_coeffs[1] * log_mean_expression)
+
+    row_ps = np.zeros(ng, dtype=np.float64)
+    col_ps = np.zeros(nc, dtype=np.float64)
+
+    current_row = 0
+    while current_row < nc:
+        # Use dense mode for Numba efficiency
+        end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=1.1)
+        if end_row is None or end_row <= current_row: break
+
+        chunk_size = end_row - current_row
+        print(f"{phase_label}: Processing {end_row} of {nc} | Chunk: {chunk_size}", end='\r')
+
+        tis_chunk = tis[current_row:end_row]
+
+        # [CRITICAL] NUMBA KERNEL CALL
+        # Prepare output buffer
+        p_is_chunk = np.empty((chunk_size, ng), dtype=np.float64)
+
+        dropout_prob_kernel_cpu(
+            tjs,            # Gene totals
+            tis_chunk,      # Cell totals (1D array, broadcasting handled inside kernel)
+            total,          # Grand total
+            smoothed_size,  # Exp size
+            p_is_chunk      # Output buffer
+        )
+
+        # Sanitize
+        p_is_chunk = np.nan_to_num(p_is_chunk, nan=0.0, posinf=1.0, neginf=0.0)
+
+        row_ps += p_is_chunk.sum(axis=0)
+        col_ps[current_row:end_row] = p_is_chunk.sum(axis=1)
+
+        current_row = end_row
+
+    print("")
+    print(f"{phase_label}: COMPLETE")
+
+    return {
+        'rowPs': pd.Series(row_ps, index=fit['vals']['tjs'].index),
+        'colPs': pd.Series(col_ps, index=fit['vals']['tis'].index)
+    }
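Judging by the arguments handed to dropout_prob_kernel_cpu (gene totals, cell totals, the grand total, and the smoothed sizes), the kernel evaluates the depth-adjusted negative binomial zero probability, P(0) = (1 + mu/size)^(-size) with mu_ij = tjs_j * tis_i / total. The kernel itself lives in CoreCPU and is not shown in this diff, so the following plain NumPy version is only an assumed reference, useful for sanity-checking a single chunk against the Numba output:

    import numpy as np

    def dropout_prob_reference(tjs, tis_chunk, total, size):
        # Assumed kernel math: expected count mu[i, j] under depth adjustment,
        # then the NB probability of observing a zero.
        mu = np.outer(tis_chunk, tjs) / total
        return (1.0 + mu / size) ** (-size)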
+
+def NBumiCompareModelsCPU(
+    raw_filename: str,
+    stats: dict,
+    fit_adjust: dict,
+    mask_filename: str = None,
+    mode: str = "auto",
+    manual_target: int = 3000,
+    suppress_plot=False,
+    plot_filename=None
+) -> dict:
+    print("FUNCTION: NBumiCompareModelsCPU()")
+    pipeline_start_time = time.time()
+
+    # STEP 1: Fit Basic Model
+    fit_basic = NBumiFitBasicModelCPU(
+        raw_filename,
+        stats,
+        mask_filename=mask_filename,
+        mode=mode,
+        manual_target=manual_target,
+        phase_label="Phase [1/3]",
+        desc_label="Fitting Basic Model (Virtual)..."
+    )
+
+    # STEP 2: Depth-Adjusted Dropout
+    check_adjust = NBumiCheckFitFSCPU(
+        raw_filename,
+        fit_adjust,
+        mode=mode,
+        manual_target=manual_target,
+        suppress_plot=True,
+        phase_label="Phase [2/3]",
+        desc_label="Calculating Depth-Adjusted Dropouts..."
+    )
+
+    # STEP 3: Basic Dropout
+    stats_virtual = stats.copy()
+    mean_depth = stats['total'] / stats['nc']
+    stats_virtual['tis'] = pd.Series(
+        np.full(stats['nc'], mean_depth),
+        index=stats['tis'].index
+    )
+
+    fit_basic_for_eval = {
+        'sizes': fit_basic['sizes'],
+        'vals': stats_virtual,
+        'var_obs': fit_basic['var_obs']
+    }
+
+    check_basic = NBumiCheckFitFSCPU(
+        raw_filename,
+        fit_basic_for_eval,
+        mode=mode,
+        manual_target=manual_target,
+        suppress_plot=True,
+        phase_label="Phase [3/3]",
+        desc_label="Calculating Basic Dropouts..."
+    )
+
+    # Calculation & Plotting
+    nc_data = stats['nc']
+    mean_expr = stats['tjs'] / nc_data
+    observed_dropout = stats['djs'] / nc_data
+
+    adj_dropout_fit = check_adjust['rowPs'] / nc_data
+    bas_dropout_fit = check_basic['rowPs'] / nc_data
+
+    err_adj = np.sum(np.abs(adj_dropout_fit - observed_dropout))
+    err_bas = np.sum(np.abs(bas_dropout_fit - observed_dropout))
+
+    comparison_df = pd.DataFrame({
+        'mean_expr': mean_expr,
+        'observed': observed_dropout,
+        'adj_fit': adj_dropout_fit,
+        'bas_fit': bas_dropout_fit
+    })
+
+    # Plotting Logic (Standard Matplotlib)
+    plt.figure(figsize=(10, 6))
+    sorted_idx = np.argsort(mean_expr.values)
+    plot_idx = sorted_idx[::2] if len(mean_expr) > 20000 else sorted_idx
+
+    plt.scatter(mean_expr.iloc[plot_idx], observed_dropout.iloc[plot_idx],
+                c='black', s=3, alpha=0.5, label='Observed')
+
+    plt.scatter(mean_expr.iloc[plot_idx], bas_dropout_fit.iloc[plot_idx],
+                c='purple', s=3, alpha=0.6, label=f'Basic Fit (Error: {err_bas:.2f})')
+
+    plt.scatter(mean_expr.iloc[plot_idx], adj_dropout_fit.iloc[plot_idx],
+                c='goldenrod', s=3, alpha=0.7, label=f'Depth-Adjusted Fit (Error: {err_adj:.2f})')
+
+    plt.xscale('log')
+    plt.xlabel("Mean Expression")
+    plt.ylabel("Dropout Rate")
+    plt.title("M3Drop Model Comparison (CPU)")
+    plt.legend()
+    plt.grid(True, linestyle='--', alpha=0.3)
+
+    if plot_filename:
+        print(f"Saving plot to: {plot_filename}")
+        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+
+    if not suppress_plot:
+        plt.show()
+
+    plt.close()
+
+    pipeline_end_time = time.time()
+    print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
+
+    return {
+        "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
+        "comparison_df": comparison_df
+    }
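Note how STEP 3 scores the basic model: stats is cloned and every cell total is replaced by the mean depth, so the same dropout machinery evaluates a depth-free fit, and the model with the smaller summed |observed - fitted| dropout error wins. An end-to-end sketch, again assuming hidden_calc_valsCPU and NBumiFitModelCPU (imported above from CoreCPU) produce the stats and depth-adjusted fit dicts consumed here; neither signature appears in this diff:

    stats = hidden_calc_valsCPU("counts.h5ad")            # assumed signature
    fit_adjust = NBumiFitModelCPU("counts.h5ad", stats)   # assumed signature
    result = NBumiCompareModelsCPU(
        "counts.h5ad",
        stats,
        fit_adjust,
        suppress_plot=True,
        plot_filename="model_comparison.png",
    )
    print(result["errors"])   # {'Depth-Adjusted': ..., 'Basic': ...}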
+
+def NBumiPlotDispVsMeanCPU(
+    fit: dict,
+    suppress_plot: bool = False,
+    plot_filename: str = None
+):
+    print("FUNCTION: NBumiPlotDispVsMeanCPU()")
+    start_time = time.time()
+
+    mean_expression = fit['vals']['tjs'].values / fit['vals']['nc']
+    sizes = fit['sizes'].values
+
+    coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
+    intercept, slope = coeffs[0], coeffs[1]
+
+    log_mean_expr_range = np.linspace(
+        np.log(mean_expression[mean_expression > 0].min()),
+        np.log(mean_expression.max()),
+        100
+    )
+    log_fitted_sizes = intercept + slope * log_mean_expr_range
+    fitted_sizes = np.exp(log_fitted_sizes)
+
+    plt.figure(figsize=(8, 6))
+    plt.scatter(mean_expression, sizes, label='Observed Dispersion', alpha=0.5, s=8)
+    plt.plot(np.exp(log_mean_expr_range), fitted_sizes, color='red', label='Regression Fit', linewidth=2)
+
+    plt.xscale('log')
+    plt.yscale('log')
+    plt.xlabel('Mean Expression')
+    plt.ylabel('Dispersion Parameter (Sizes)')
+    plt.title('Dispersion vs. Mean Expression (CPU)')
+    plt.legend()
+    plt.grid(True, which="both", linestyle='--', alpha=0.6)
+
+    if plot_filename:
+        print(f"Saving plot to: {plot_filename}")
+        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
+
+    if not suppress_plot:
+        plt.show()
+
+    plt.close()
+
+    end_time = time.time()
+    print(f"Total time: {end_time - start_time:.2f} seconds.\n")