M3Drop 0.4.41-py3-none-any.whl → 0.4.44-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
m3Drop/diagnosticsCPU.py DELETED
@@ -1,391 +0,0 @@
- from .coreCPU import hidden_calc_valsCPU, NBumiFitModelCPU, NBumiFitDispVsMeanCPU
- import numpy as np
- import anndata
- import h5py
- import pandas as pd
- import time
- import os
-
- from scipy.sparse import csr_matrix as sp_csr_matrix
-
- import matplotlib.pyplot as plt
-
-
- def NBumiFitBasicModelCPU(
-     cleaned_filename: str,
-     stats: dict,
-     is_logged=False,
-     chunk_size: int = 5000
- ) -> dict:
-     """
-     Fits a simpler, unadjusted NB model out-of-core using a CPU-only algorithm.
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiFitBasicModelCPU() | FILE: {cleaned_filename}")
-
-     print("Phase [1/2]: Initializing parameters on CPU...")
-     tjs_series = stats['tjs']
-     tjs = tjs_series.values.astype(np.float64)
-     nc, ng = stats['nc'], stats['ng']
-
-     sum_x_sq = np.zeros(ng, dtype=np.float64)
-     print("Phase [1/2]: COMPLETE")
-
-     print("Phase [2/2]: Calculating variance from data chunks...")
-     with h5py.File(cleaned_filename, 'r') as f_in:
-         x_group = f_in['X']
-         h5_indptr = x_group['indptr']
-         h5_data = x_group['data']
-         h5_indices = x_group['indices']
-
-         for i in range(0, nc, chunk_size):
-             end_row = min(i + chunk_size, nc)
-             print(f"Phase [2/2]: Processing: {end_row} of {nc} cells.", end='\r')
-
-             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-             if start_idx == end_idx:
-                 continue
-
-             data_slice = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
-             indices_slice = np.array(h5_indices[start_idx:end_idx], dtype=np.int64)
-
-             np.add.at(sum_x_sq, indices_slice, data_slice**2)
-
-     print(f"Phase [2/2]: COMPLETE{' '*40}")
-
-     if is_logged:
-         raise NotImplementedError("Logged data variance calculation is not implemented for out-of-core CPU.")
-     else:
-         mean_x_sq = sum_x_sq / nc
-         mean_mu = tjs / nc
-         my_rowvar = mean_x_sq - mean_mu**2
-
-         numerator = mean_mu**2
-         denominator = my_rowvar - mean_mu
-
-         sizes = np.full(ng, np.nan, dtype=np.float64)
-         valid_mask = denominator > 1e-12
-         sizes[valid_mask] = numerator[valid_mask] / denominator[valid_mask]
-
-         finite_sizes = sizes[np.isfinite(sizes) & (sizes > 0)]
-         max_size_val = np.max(finite_sizes) * 10 if finite_sizes.size else 1000
-         sizes[~np.isfinite(sizes) | (sizes <= 0)] = max_size_val
-         sizes[sizes < 1e-10] = 1e-10
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-     return {
-         'var_obs': pd.Series(my_rowvar, index=tjs_series.index),
-         'sizes': pd.Series(sizes, index=tjs_series.index),
-         'vals': stats
-     }
-
-
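# Illustrative usage sketch for NBumiFitBasicModelCPU as shipped in 0.4.41 (the module
# is removed in 0.4.44). The file path is a placeholder; the stats dict comes from
# hidden_calc_valsCPU in coreCPU, which this module itself imports and calls.
from m3Drop.coreCPU import hidden_calc_valsCPU
from m3Drop.diagnosticsCPU import NBumiFitBasicModelCPU

cleaned_file = "my_counts_cleaned.h5ad"                      # hypothetical CSR-backed AnnData file
stats = hidden_calc_valsCPU(cleaned_file, chunk_size=5000)
fit_basic = NBumiFitBasicModelCPU(cleaned_file, stats, chunk_size=5000)
print(fit_basic['sizes'].head())      # per-gene NB size (dispersion) estimates
print(fit_basic['var_obs'].head())    # observed per-gene variances
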
- def NBumiCheckFitFSCPU(
-     cleaned_filename: str,
-     fit: dict,
-     chunk_size: int = 5000,
-     suppress_plot=False,
-     plot_filename=None
- ) -> dict:
-     """
-     CPU-only version of NBumiCheckFitFS. Computes expected dropouts for genes
-     and cells to compare observed vs fitted values.
-     """
-     start_time = time.perf_counter()
-     print(f"FUNCTION: NBumiCheckFitFSCPU() | FILE: {cleaned_filename}")
-
-     print("Phase [1/2]: Initializing parameters on CPU...")
-     vals = fit['vals']
-     size_coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
-
-     tjs = vals['tjs'].values.astype(np.float64)
-     tis = vals['tis'].values.astype(np.float64)
-     total = vals['total']
-     nc, ng = vals['nc'], vals['ng']
-
-     mean_expression = tjs / nc
-     log_mean_expression = np.log(mean_expression, where=(mean_expression > 0))
-     smoothed_size = np.exp(size_coeffs[0] + size_coeffs[1] * log_mean_expression)
-     smoothed_size = np.nan_to_num(smoothed_size, nan=1.0, posinf=1e6, neginf=1.0)
-
-     row_ps = np.zeros(ng, dtype=np.float64)
-     col_ps = np.zeros(nc, dtype=np.float64)
-     print("Phase [1/2]: COMPLETE")
-
-     print("Phase [2/2]: Calculating expected dropouts from data chunks...")
-     for i in range(0, nc, chunk_size):
-         end_col = min(i + chunk_size, nc)
-         print(f"Phase [2/2]: Processing: {end_col} of {nc} cells.", end='\r')
-
-         tis_chunk = tis[i:end_col]
-         if tis_chunk.size == 0:
-             continue
-
-         mu_chunk = tjs[:, np.newaxis] * tis_chunk[np.newaxis, :] / total
-         base = 1 + mu_chunk / smoothed_size[:, np.newaxis]
-         base = np.maximum(base, 1e-12)
-         p_is_chunk = np.power(base, -smoothed_size[:, np.newaxis])
-         p_is_chunk = np.nan_to_num(p_is_chunk, nan=0.0, posinf=1.0, neginf=0.0)
-
-         row_ps += np.sum(p_is_chunk, axis=1)
-         col_ps[i:end_col] = np.sum(p_is_chunk, axis=0)
-
-     print(f"Phase [2/2]: COMPLETE{' ' * 50}")
-
-     djs = vals['djs'].values
-     dis = vals['dis'].values
-
-     if not suppress_plot:
-         plt.figure(figsize=(12, 5))
-         plt.subplot(1, 2, 1)
-         plt.scatter(djs, row_ps, alpha=0.5, s=10)
-         plt.title("Gene-specific Dropouts (Smoothed)")
-         plt.xlabel("Observed")
-         plt.ylabel("Fit")
-         lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
-         plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
-         plt.grid(True); plt.legend()
-
-         plt.subplot(1, 2, 2)
-         plt.scatter(dis, col_ps, alpha=0.5, s=10)
-         plt.title("Cell-specific Dropouts (Smoothed)")
-         plt.xlabel("Observed")
-         plt.ylabel("Expected")
-         lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
-         plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
-         plt.grid(True); plt.legend()
-
-         plt.tight_layout()
-         if plot_filename:
-             plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-             print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
-         plt.show()
-         plt.close()
-
-     gene_error = np.sum((djs - row_ps)**2)
-     cell_error = np.sum((dis - col_ps)**2)
-
-     end_time = time.perf_counter()
-     print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-     return {
-         'gene_error': gene_error,
-         'cell_error': cell_error,
-         'rowPs': pd.Series(row_ps, index=fit['vals']['tjs'].index),
-         'colPs': pd.Series(col_ps, index=fit['vals']['tis'].index)
-     }
-
-
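# Illustrative usage sketch for NBumiCheckFitFSCPU. NBumiFitModelCPU is imported by
# this module from coreCPU, but its exact signature is assumed here; file names are
# placeholders, not taken from this diff.
from m3Drop.coreCPU import hidden_calc_valsCPU, NBumiFitModelCPU
from m3Drop.diagnosticsCPU import NBumiCheckFitFSCPU

cleaned_file = "my_counts_cleaned.h5ad"                      # hypothetical path
stats = hidden_calc_valsCPU(cleaned_file, chunk_size=5000)
fit_adjust = NBumiFitModelCPU(cleaned_file, stats)           # signature assumed
check = NBumiCheckFitFSCPU(cleaned_file, fit_adjust, suppress_plot=True)
print(check['gene_error'], check['cell_error'])   # summed squared dropout errors
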
- def NBumiCompareModelsCPU(
-     raw_filename: str,
-     cleaned_filename: str,
-     stats: dict,
-     fit_adjust: dict,
-     chunk_size: int = 5000,
-     suppress_plot=False,
-     plot_filename=None
- ) -> dict:
-     """
-     CPU-only comparison between the depth-adjusted NB model and a basic model.
-     """
-     pipeline_start_time = time.time()
-     print(f"FUNCTION: NBumiCompareModelsCPU() | Comparing models for {cleaned_filename}")
-
-     print("Phase [1/4]: Creating temporary 'basic' normalized data file...")
-     basic_norm_filename = cleaned_filename.replace('.h5ad', '_basic_norm.h5ad')
-
-     adata_meta = anndata.read_h5ad(cleaned_filename, backed='r')
-     nc, ng = adata_meta.shape
-     obs_df = adata_meta.obs.copy()
-     var_df = adata_meta.var.copy()
-
-     cell_sums = stats['tis'].values.astype(np.float64)
-     positive_mask = cell_sums > 0
-     median_sum = np.median(cell_sums[positive_mask]) if np.any(positive_mask) else 1.0
-     size_factors = np.ones_like(cell_sums, dtype=np.float32)
-     size_factors[positive_mask] = cell_sums[positive_mask] / median_sum
-
-     adata_out = anndata.AnnData(obs=obs_df, var=var_df)
-     adata_out.write_h5ad(basic_norm_filename, compression="gzip")
-
-     with h5py.File(basic_norm_filename, 'a') as f_out:
-         if 'X' in f_out:
-             del f_out['X']
-         x_group_out = f_out.create_group('X')
-         x_group_out.attrs['encoding-type'] = 'csr_matrix'
-         x_group_out.attrs['encoding-version'] = '0.1.0'
-         x_group_out.attrs['shape'] = np.array([nc, ng], dtype='int64')
-
-         out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
-         out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
-         out_indptr = x_group_out.create_dataset('indptr', shape=(nc + 1,), dtype='int64')
-         out_indptr[0] = 0
-         current_nnz = 0
-
-         with h5py.File(cleaned_filename, 'r') as f_in:
-             h5_indptr = f_in['X']['indptr']
-             h5_data = f_in['X']['data']
-             h5_indices = f_in['X']['indices']
-
-             for i in range(0, nc, chunk_size):
-                 end_row = min(i + chunk_size, nc)
-                 print(f"Phase [1/4]: Normalizing: {end_row} of {nc} cells.", end='\r')
-
-                 start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-                 if start_idx == end_idx:
-                     out_indptr[i + 1 : end_row + 1] = current_nnz
-                     continue
-
-                 data_slice = np.array(h5_data[start_idx:end_idx], dtype=np.float32)
-                 indices_slice = np.array(h5_indices[start_idx:end_idx], dtype=np.int32)
-                 abs_indptr = h5_indptr[i:end_row + 1]
-                 indptr_slice = abs_indptr - abs_indptr[0]
-                 row_lengths = np.diff(indptr_slice)
-
-                 norm_factors = np.repeat(size_factors[i:end_row], row_lengths)
-                 norm_factors[norm_factors == 0] = 1.0
-                 normalized_data = data_slice / norm_factors
-                 normalized_data = np.round(normalized_data).astype(np.float32)
-
-                 chunk_sp = sp_csr_matrix((normalized_data, indices_slice, indptr_slice),
-                                          shape=(end_row - i, ng))
-
-                 nnz_chunk = chunk_sp.nnz
-                 out_data.resize(current_nnz + nnz_chunk, axis=0)
-                 out_data[current_nnz:] = chunk_sp.data
-
-                 out_indices.resize(current_nnz + nnz_chunk, axis=0)
-                 out_indices[current_nnz:] = chunk_sp.indices
-
-                 new_indptr_list = chunk_sp.indptr[1:].astype(np.int64) + current_nnz
-                 out_indptr[i + 1 : end_row + 1] = new_indptr_list
-
-                 current_nnz += nnz_chunk
-
-     print(f"Phase [1/4]: COMPLETE{' '*50}")
-
-     print("Phase [2/4]: Fitting Basic Model on normalized data...")
-     stats_basic = hidden_calc_valsCPU(basic_norm_filename, chunk_size=chunk_size)
-     fit_basic = NBumiFitBasicModelCPU(basic_norm_filename, stats_basic, chunk_size=chunk_size)
-     print("Phase [2/4]: COMPLETE")
-
-     print("Phase [3/4]: Evaluating fits of both models on ORIGINAL data...")
-     check_adjust = NBumiCheckFitFSCPU(cleaned_filename, fit_adjust, suppress_plot=True, chunk_size=chunk_size)
-
-     fit_basic_for_eval = {
-         'sizes': fit_basic['sizes'],
-         'vals': stats,
-         'var_obs': fit_basic['var_obs']
-     }
-     check_basic = NBumiCheckFitFSCPU(cleaned_filename, fit_basic_for_eval, suppress_plot=True, chunk_size=chunk_size)
-     print("Phase [3/4]: COMPLETE")
-
-     print("Phase [4/4]: Generating final comparison...")
-     nc_data = stats['nc']
-     mean_expr = stats['tjs'] / nc_data
-     observed_dropout = stats['djs'] / nc_data
-
-     adj_dropout_fit = check_adjust['rowPs'] / nc_data
-     bas_dropout_fit = check_basic['rowPs'] / nc_data
-
-     err_adj = np.sum(np.abs(adj_dropout_fit - observed_dropout))
-     err_bas = np.sum(np.abs(bas_dropout_fit - observed_dropout))
-
-     comparison_df = pd.DataFrame({
-         'mean_expr': mean_expr,
-         'observed': observed_dropout,
-         'adj_fit': adj_dropout_fit,
-         'bas_fit': bas_dropout_fit
-     })
-
-     plt.figure(figsize=(10, 6))
-     sorted_idx = np.argsort(mean_expr.values)
-
-     plt.scatter(mean_expr.iloc[sorted_idx], observed_dropout.iloc[sorted_idx],
-                 c='black', s=3, alpha=0.5, label='Observed')
-     plt.scatter(mean_expr.iloc[sorted_idx], bas_dropout_fit.iloc[sorted_idx],
-                 c='purple', s=3, alpha=0.6, label=f'Basic Fit (Error: {err_bas:.2f})')
-     plt.scatter(mean_expr.iloc[sorted_idx], adj_dropout_fit.iloc[sorted_idx],
-                 c='goldenrod', s=3, alpha=0.7, label=f'Depth-Adjusted Fit (Error: {err_adj:.2f})')
-
-     plt.xscale('log')
-     plt.xlabel("Mean Expression")
-     plt.ylabel("Dropout Rate")
-     plt.title("M3Drop Model Comparison (CPU)")
-     plt.legend()
-     plt.grid(True, linestyle='--', alpha=0.3)
-
-     if plot_filename:
-         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-         print(f"STATUS: Model comparison plot saved to '{plot_filename}'")
-
-     if not suppress_plot:
-         plt.show()
-
-     plt.close()
-     print("Phase [4/4]: COMPLETE")
-
-     pipeline_end_time = time.time()
-
-     adata_meta.file.close()
-
-     os.remove(basic_norm_filename)
-     print(f"STATUS: Temporary file '{basic_norm_filename}' removed.")
-     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
-
-     return {
-         "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
-         "comparison_df": comparison_df
-     }
-
-
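# Illustrative usage sketch for NBumiCompareModelsCPU. The raw/cleaned paths, the output
# PNG name, and the NBumiFitModelCPU signature are assumptions; stats and fit_adjust are
# built the same way as in the sketches above.
from m3Drop.coreCPU import hidden_calc_valsCPU, NBumiFitModelCPU
from m3Drop.diagnosticsCPU import NBumiCompareModelsCPU

raw_file, cleaned_file = "my_counts_raw.h5ad", "my_counts_cleaned.h5ad"  # hypothetical paths
stats = hidden_calc_valsCPU(cleaned_file, chunk_size=5000)
fit_adjust = NBumiFitModelCPU(cleaned_file, stats)           # signature assumed
result = NBumiCompareModelsCPU(raw_file, cleaned_file, stats, fit_adjust,
                               suppress_plot=True, plot_filename="model_comparison.png")
print(result["errors"])                # {'Depth-Adjusted': ..., 'Basic': ...}
print(result["comparison_df"].head())  # per-gene observed vs fitted dropout rates
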
- def NBumiPlotDispVsMeanCPU(
-     fit: dict,
-     suppress_plot: bool = False,
-     plot_filename: str = None
- ):
-     """
-     Generates a diagnostic plot of the dispersion vs. mean expression (CPU version).
-     """
-     print("FUNCTION: NBumiPlotDispVsMeanCPU()")
-
-     mean_expression = fit['vals']['tjs'].values / fit['vals']['nc']
-     sizes = fit['sizes'].values
-     coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
-     intercept, slope = coeffs[0], coeffs[1]
-
-     positive_means = mean_expression[mean_expression > 0]
-     if positive_means.size == 0:
-         raise ValueError("Mean expression contains no positive values for plotting.")
-
-     log_mean_expr_range = np.linspace(
-         np.log(positive_means.min()),
-         np.log(positive_means.max()),
-         100
-     )
-     log_fitted_sizes = intercept + slope * log_mean_expr_range
-     fitted_sizes = np.exp(log_fitted_sizes)
-
-     plt.figure(figsize=(8, 6))
-     plt.scatter(mean_expression, sizes, label='Observed Dispersion', alpha=0.5, s=8)
-     plt.plot(np.exp(log_mean_expr_range), fitted_sizes, color='red', label='Regression Fit', linewidth=2)
-
-     plt.xscale('log')
-     plt.yscale('log')
-     plt.xlabel('Mean Expression')
-     plt.ylabel('Dispersion Parameter (Sizes)')
-     plt.title('Dispersion vs. Mean Expression (CPU)')
-     plt.legend()
-     plt.grid(True, which="both", linestyle='--', alpha=0.6)
-
-     if plot_filename:
-         plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-         print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
-
-     if not suppress_plot:
-         plt.show()
-
-     plt.close()
-     print("FUNCTION: NBumiPlotDispVsMeanCPU() COMPLETE\n")