M3Drop 0.4.42-py3-none-any.whl → 0.4.45-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry, and is provided for informational purposes only.
m3Drop/__init__.py CHANGED
@@ -1,11 +1,9 @@
-# M3Drop (Refactored) __init__.py
-# This file imports all CPU and GPU functions to make them
-# directly accessible from the main package.
+# M3Drop/__init__.py
 
 # --- CPU Functions ---
 
-# From coreCPU.py
-from .coreCPU import (
+# From CoreCPU.py
+from .CoreCPU import (
     ConvertDataSparseCPU,
     hidden_calc_valsCPU,
     NBumiFitModelCPU,
@@ -15,56 +13,57 @@ from .coreCPU import (
     NBumiCombinedDropVolcanoCPU,
 )
 
-# From diagnosticsCPU.py
-from .diagnosticsCPU import (
+# From DiagnosticsCPU.py
+from .DiagnosticsCPU import (
     NBumiFitBasicModelCPU,
     NBumiCheckFitFSCPU,
     NBumiCompareModelsCPU,
     NBumiPlotDispVsMeanCPU,
 )
 
-# From normalizationCPU.py
-from .normalizationCPU import (
-    NBumiPearsonResidualsCPU,
-    NBumiPearsonResidualsApproxCPU,
+# From NormalizationCPU.py
+from .NormalizationCPU import (
+    NBumiPearsonResidualsCombinedCPU,
 )
 
+# --- GPU Functions (Placeholders based on your request) ---
 
-# --- GPU Functions ---
-
-# From coreGPU.py
-from .coreGPU import (
-    ConvertDataSparseGPU,
-    hidden_calc_valsGPU,
-    NBumiFitModelGPU,
-    NBumiFitDispVsMeanGPU,
-    NBumiFeatureSelectionHighVarGPU,
-    NBumiFeatureSelectionCombinedDropGPU,
-    NBumiCombinedDropVolcanoGPU,
-    get_optimal_chunk_size # The Governor Protocol
-)
-
-# From diagnosticsGPU.py
-from .diagnosticsGPU import (
-    NBumiFitBasicModelGPU,
-    NBumiCheckFitFSGPU,
-    NBumiCompareModelsGPU,
-    NBumiPlotDispVsMeanGPU,
-)
-
-# From normalizationGPU.py
-from .normalizationGPU import (
-    NBumiPearsonResidualsGPU,
-    NBumiPearsonResidualsApproxGPU,
-)
+# From CoreGPU.py
+try:
+    from .CoreGPU import (
+        ConvertDataSparseGPU,
+        hidden_calc_valsGPU,
+        NBumiFitModelGPU,
+        NBumiFitDispVsMeanGPU,
+        NBumiFeatureSelectionHighVarGPU,
+        NBumiFeatureSelectionCombinedDropGPU,
+        NBumiCombinedDropVolcanoGPU,
+    )
+except ImportError:
+    pass # Handle case where GPU modules might not be present on CPU node
 
+# From DiagnosticsGPU.py
+try:
+    from .DiagnosticsGPU import (
+        NBumiFitBasicModelGPU,
+        NBumiCheckFitFSGPU,
+        NBumiCompareModelsGPU,
+        NBumiPlotDispVsMeanGPU,
+    )
+except ImportError:
+    pass
 
-# --- Public API (`__all__`) ---
-# Defines what `from m3Drop import *` will import.
+# From NormalizationGPU.py
+try:
+    from .NormalizationGPU import (
+        NBumiPearsonResidualsCombinedGPU,
+    )
+except ImportError:
+    pass
 
+# --- Public API ---
 __all__ = [
     # --- CPU ---
-    # coreCPU
     'ConvertDataSparseCPU',
     'hidden_calc_valsCPU',
     'NBumiFitModelCPU',
@@ -73,18 +72,14 @@ __all__ = [
     'NBumiFeatureSelectionCombinedDropCPU',
     'NBumiCombinedDropVolcanoCPU',
 
-    # diagnosticsCPU
     'NBumiFitBasicModelCPU',
     'NBumiCheckFitFSCPU',
     'NBumiCompareModelsCPU',
     'NBumiPlotDispVsMeanCPU',
 
-    # normalizationCPU
-    'NBumiPearsonResidualsCPU',
-    'NBumiPearsonResidualsApproxCPU',
+    'NBumiPearsonResidualsCombinedCPU',
 
     # --- GPU ---
-    # coreGPU
     'ConvertDataSparseGPU',
     'hidden_calc_valsGPU',
     'NBumiFitModelGPU',
@@ -92,15 +87,11 @@ __all__ = [
     'NBumiFeatureSelectionHighVarGPU',
    'NBumiFeatureSelectionCombinedDropGPU',
     'NBumiCombinedDropVolcanoGPU',
-    'get_optimal_chunk_size',
 
-    # diagnosticsGPU
     'NBumiFitBasicModelGPU',
     'NBumiCheckFitFSGPU',
     'NBumiCompareModelsGPU',
     'NBumiPlotDispVsMeanGPU',
 
-    # normalizationGPU
-    'NBumiPearsonResidualsGPU',
-    'NBumiPearsonResidualsApproxGPU',
+    'NBumiPearsonResidualsCombinedGPU',
 ]
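
Note: with the new try/except ImportError guards, the GPU names are only bound when the CuPy-backed modules import successfully; `__all__` still lists them unconditionally, so `from m3Drop import *` will raise AttributeError on a CPU-only install. A minimal caller-side sketch (hypothetical code, not part of the package) that avoids this by falling back explicitly:

    import m3Drop

    # NBumiFitModelGPU exists only if the CoreGPU import above succeeded;
    # otherwise fall back to the CPU implementation.
    fit_model = getattr(m3Drop, "NBumiFitModelGPU", None) or m3Drop.NBumiFitModelCPU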
m3drop-0.4.45.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: M3Drop
-Version: 0.4.42
+Version: 0.4.45
 Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
 Home-page: https://github.com/PragalvhaSharma/m3DropNew
 Author: Tallulah Andrews
@@ -21,12 +21,14 @@ Requires-Dist: matplotlib-venn>=0.11
 Requires-Dist: memory_profiler>=0.60.0
 Requires-Dist: numpy>=1.21.0
 Requires-Dist: pandas>=1.5.0
-Requires-Dist: py-cpuinfo
 Requires-Dist: scanpy>=1.9.0
 Requires-Dist: scikit-learn>=1.0.0
 Requires-Dist: scipy>=1.8.0
 Requires-Dist: seaborn>=0.11.0
 Requires-Dist: statsmodels>=0.13.0
+Requires-Dist: numba>=0.57.0
+Requires-Dist: psutil>=5.9.0
+Requires-Dist: py-cpuinfo
 Provides-Extra: gpu
 Requires-Dist: cupy-cuda12x; extra == "gpu"
 Dynamic: author
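
Note: numba and psutil join py-cpuinfo as hard requirements, while CuPy stays behind the gpu extra (installed with `pip install "M3Drop[gpu]"`). A minimal sketch (hypothetical caller code) of probing for the optional GPU stack, assuming only that cupy-cuda12x provides the `cupy` module:

    # Detect whether the optional "gpu" extra is usable before choosing a path.
    try:
        import cupy  # provided by cupy-cuda12x when the gpu extra is installed
        HAS_GPU = True
    except ImportError:
        HAS_GPU = False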
m3drop-0.4.45.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
+m3Drop/ControlDeviceCPU.py,sha256=8P-hxd4thc2wSeon73b9rz3clIGkE3x0cEE82RiGFds,8880
+m3Drop/ControlDeviceGPU.py,sha256=4nzPtgyV0RsEOeezwCVJ7oyDOsp9-dRVLczlduUocpU,9143
+m3Drop/CoreCPU.py,sha256=csRg5TLQx1Sup7k3lDJm9OO5Oe5-1aC3u_6ldE_GIX8,18679
+m3Drop/CoreGPU.py,sha256=6LToLuWyHxX_7sC2z0Xnvy_qqgmpew5DmnCV0PxmTZQ,19785
+m3Drop/DiagnosticsCPU.py,sha256=l0Imkh3F3zo4ovihUjx7cYWYgzPdztWCN1hcBFO43nY,12943
+m3Drop/DiagnosticsGPU.py,sha256=z5BMOZNo_ruMBaDWJIE6zWhMUtf2ItY5Vcgu4N9lbok,14321
+m3Drop/NormalizationCPU.py,sha256=Mm8VzWDu-NONbp-ngAt4PLjCKAGc7gJZKf-Yd-U95r0,7255
+m3Drop/NormalizationGPU.py,sha256=3gRO82_6hSzB4rxmTRGocRUO2hy--i-szGCAY6FBnAI,8462
+m3Drop/__init__.py,sha256=_J5p4bb_RAD6k_bnJUqj0DfA_akZMjd-AXzcVQpkW_g,2240
+m3drop-0.4.45.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
+m3drop-0.4.45.dist-info/METADATA,sha256=kFQ74ZykcHo4U-NKp-fISDj1UnF2VSsghT3rzgCiUxw,5248
+m3drop-0.4.45.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+m3drop-0.4.45.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
+m3drop-0.4.45.dist-info/RECORD,,
m3drop-0.4.45.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
m3Drop/coreCPU.py DELETED
@@ -1,477 +0,0 @@
-import numpy as np
-import anndata
-import h5py
-import pandas as pd
-import time
-
-from scipy.sparse import csr_matrix as sp_csr_matrix
-
-import statsmodels.api as sm
-import matplotlib.pyplot as plt
-from scipy.stats import norm
-from statsmodels.stats.multitest import multipletests
-def ConvertDataSparseCPU(
-    input_filename: str,
-    output_filename: str,
-    row_chunk_size: int = 5000
-):
-    """
-    Performs out-of-core data cleaning on a standard (cell, gene) sparse
-    .h5ad file. It correctly identifies and removes genes with zero counts
-    across all cells. CPU-only implementation.
-    """
-    start_time = time.perf_counter()
-    print(f"FUNCTION: ConvertDataSparseCPU() | FILE: {input_filename}")
-
-    with h5py.File(input_filename, 'r') as f_in:
-        x_group_in = f_in['X']
-        n_cells, n_genes = x_group_in.attrs['shape']
-
-        print("Phase [1/2]: Identifying genes with non-zero counts...")
-        genes_to_keep_mask = np.zeros(n_genes, dtype=bool)
-
-        h5_indptr = x_group_in['indptr']
-        h5_indices = x_group_in['indices']
-
-        for i in range(0, n_cells, row_chunk_size):
-            end_row = min(i + row_chunk_size, n_cells)
-            print(f"Phase [1/2]: Processing: {end_row} of {n_cells} cells.", end='\r')
-
-            start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-            if start_idx == end_idx:
-                continue
-
-            indices_slice = np.array(h5_indices[start_idx:end_idx])
-            unique_in_chunk = np.unique(indices_slice)
-            genes_to_keep_mask[unique_in_chunk] = True
-
-        n_genes_to_keep = np.sum(genes_to_keep_mask)
-        print(f"\nPhase [1/2]: COMPLETE | Result: {n_genes_to_keep} / {n_genes} genes retained.")
-
-        print("Phase [2/2]: Rounding up decimals and saving filtered output to disk...")
-        adata_meta = anndata.read_h5ad(input_filename, backed='r')
-        filtered_var_df = adata_meta.var[genes_to_keep_mask]
-
-        adata_out_template = anndata.AnnData(obs=adata_meta.obs, var=filtered_var_df, uns=adata_meta.uns)
-        adata_out_template.write_h5ad(output_filename, compression="gzip")
-
-        with h5py.File(output_filename, 'a') as f_out:
-            if 'X' in f_out:
-                del f_out['X']
-            x_group_out = f_out.create_group('X')
-
-            out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
-            out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
-            out_indptr = x_group_out.create_dataset('indptr', shape=(n_cells + 1,), dtype='int64')
-            out_indptr[0] = 0
-            current_nnz = 0
-
-            h5_data = x_group_in['data']
-
-            for i in range(0, n_cells, row_chunk_size):
-                end_row = min(i + row_chunk_size, n_cells)
-                print(f"Phase [2/2]: Processing: {end_row} of {n_cells} cells.", end='\r')
-
-                start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-                data_slice = np.array(h5_data[start_idx:end_idx])
-                indices_slice = np.array(h5_indices[start_idx:end_idx])
-                indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
-
-                chunk = sp_csr_matrix((data_slice, indices_slice, indptr_slice), shape=(end_row-i, n_genes))
-                filtered_chunk = chunk[:, genes_to_keep_mask]
-                filtered_chunk.data = np.ceil(filtered_chunk.data).astype('float32')
-
-                out_data.resize(current_nnz + filtered_chunk.nnz, axis=0)
-                out_data[current_nnz:] = filtered_chunk.data
-
-                out_indices.resize(current_nnz + filtered_chunk.nnz, axis=0)
-                out_indices[current_nnz:] = filtered_chunk.indices
-
-                new_indptr_list = filtered_chunk.indptr[1:].astype(np.int64) + current_nnz
-                out_indptr[i + 1 : end_row + 1] = new_indptr_list
-
-                current_nnz += filtered_chunk.nnz
-
-            x_group_out.attrs['encoding-type'] = 'csr_matrix'
-            x_group_out.attrs['encoding-version'] = '0.1.0'
-            x_group_out.attrs['shape'] = np.array([n_cells, n_genes_to_keep], dtype='int64')
-    print(f"\nPhase [2/2]: COMPLETE | Output: {output_filename} {' ' * 50}")
-
-    end_time = time.perf_counter()
-    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-
-def hidden_calc_valsCPU(
-    filename: str,
-    chunk_size: int = 5000
-) -> dict:
-    """
-    Calculates key statistics from a large, sparse (cell, gene) .h5ad file
-    using a memory-safe, CPU-only, single-pass algorithm.
-    """
-    start_time = time.perf_counter()
-    print(f"FUNCTION: hidden_calc_valsCPU() | FILE: {filename}")
-
-    adata_meta = anndata.read_h5ad(filename, backed='r')
-    print("Phase [1/3]: Finding nc and ng...")
-    nc, ng = adata_meta.shape
-    print("Phase [1/3]: COMPLETE")
-
-    tis = np.zeros(nc, dtype='float64')
-    cell_non_zeros = np.zeros(nc, dtype='int64')
-    tjs = np.zeros(ng, dtype='float64')
-    gene_non_zeros = np.zeros(ng, dtype='int64')
-
-    print("Phase [2/3]: Calculating tis and tjs...")
-    with h5py.File(filename, 'r') as f_in:
-        x_group = f_in['X']
-        h5_indptr = x_group['indptr']
-        h5_data = x_group['data']
-        h5_indices = x_group['indices']
-
-        for i in range(0, nc, chunk_size):
-            end_row = min(i + chunk_size, nc)
-            print(f"Phase [2/3]: Processing: {end_row} of {nc} cells.", end='\r')
-
-            start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-            if start_idx == end_idx:
-                continue
-
-            data_slice = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
-            indices_slice = np.array(h5_indices[start_idx:end_idx], dtype=np.int64)
-            indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
-
-            chunk = sp_csr_matrix((data_slice, indices_slice, indptr_slice), shape=(end_row-i, ng))
-
-            tis[i:end_row] = np.asarray(chunk.sum(axis=1)).ravel()
-            cell_non_zeros[i:end_row] = np.diff(indptr_slice)
-
-            np.add.at(tjs, indices_slice, data_slice)
-            unique_indices, counts = np.unique(indices_slice, return_counts=True)
-            gene_non_zeros[unique_indices] += counts
-
-    tjs_series = pd.Series(tjs, index=adata_meta.var.index)
-    tis_series = pd.Series(tis, index=adata_meta.obs.index)
-    print(f"Phase [2/3]: COMPLETE{' ' * 50}")
-
-    print("Phase [3/3]: Calculating dis, djs, and total...")
-    dis = ng - cell_non_zeros
-    djs = nc - gene_non_zeros
-    total = tjs.sum()
-    print("Phase [3/3]: COMPLETE")
-
-    end_time = time.perf_counter()
-    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-    return {
-        "tis": tis_series,
-        "tjs": tjs_series,
-        "dis": pd.Series(dis, index=adata_meta.obs.index),
-        "djs": pd.Series(djs, index=adata_meta.var.index),
-        "total": total,
-        "nc": nc,
-        "ng": ng
-    }
-
-
-def NBumiFitModelCPU(
-    cleaned_filename: str,
-    stats: dict,
-    chunk_size: int = 5000
-) -> dict:
-    start_time = time.perf_counter()
-    print(f"FUNCTION: NBumiFitModelCPU() | FILE: {cleaned_filename}")
-
-    tjs_series = stats['tjs']
-    tis_series = stats['tis']
-    tjs = tjs_series.values.astype(np.float64)
-    tis = tis_series.values.astype(np.float64)
-    nc, ng = stats['nc'], stats['ng']
-    total = stats['total']
-
-    sum_x_sq = np.zeros(ng, dtype=np.float64)
-    sum_2xmu = np.zeros(ng, dtype=np.float64)
-
-    print("Phase [1/3]: Pre-calculating sum of squared expectations...")
-    sum_tis_sq = np.sum(tis**2)
-    sum_mu_sq = (tjs**2 / total**2) * sum_tis_sq
-    print("Phase [1/3]: COMPLETE")
-
-    print("Phase [2/3]: Calculating variance components from data chunks...")
-    with h5py.File(cleaned_filename, 'r') as f_in:
-        x_group = f_in['X']
-        h5_indptr = x_group['indptr']
-        h5_data = x_group['data']
-        h5_indices = x_group['indices']
-
-        for i in range(0, nc, chunk_size):
-            end_row = min(i + chunk_size, nc)
-            print(f"Phase [2/3]: Processing: {end_row} of {nc} cells.", end='\r')
-
-            start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-            if start_idx == end_idx:
-                continue
-
-            data_slice = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
-            indices_slice = np.array(h5_indices[start_idx:end_idx], dtype=np.int64)
-            indptr_slice = h5_indptr[i:end_row+1] - h5_indptr[i]
-
-            np.add.at(sum_x_sq, indices_slice, data_slice**2)
-
-            row_lengths = np.diff(indptr_slice)
-            if row_lengths.sum() == 0:
-                continue
-            cell_indices = np.repeat(np.arange(i, end_row), row_lengths)
-
-            tis_per_nz = tis[cell_indices]
-            tjs_per_nz = tjs[indices_slice]
-            term_vals = 2 * data_slice * tjs_per_nz * tis_per_nz / total
-            np.add.at(sum_2xmu, indices_slice, term_vals)
-
-    print(f"Phase [2/3]: COMPLETE {' ' * 50}")
-
-    print("Phase [3/3]: Finalizing dispersion and variance calculations...")
-    sum_sq_dev = sum_x_sq - sum_2xmu + sum_mu_sq
-    var_obs = sum_sq_dev / max(nc - 1, 1)
-
-    sizes = np.full(ng, 10000.0, dtype=np.float64)
-    numerator = (tjs**2 / total**2) * sum_tis_sq
-    denominator = sum_sq_dev - tjs
-    stable_mask = denominator > 1e-6
-    sizes[stable_mask] = numerator[stable_mask] / denominator[stable_mask]
-    sizes[np.isnan(sizes) | (sizes <= 0)] = 10000.0
-
-    print("Phase [3/3]: COMPLETE")
-
-    end_time = time.perf_counter()
-    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-    return {
-        'var_obs': pd.Series(var_obs, index=tjs_series.index),
-        'sizes': pd.Series(sizes, index=tjs_series.index),
-        'vals': stats
-    }
-
-
-def NBumiFitDispVsMeanCPU(fit, suppress_plot=True):
-    """
-    Fits a linear model to the log-dispersion vs log-mean of gene expression.
-    """
-    vals = fit['vals']
-    size_g = fit['sizes'].values
-    tjs = vals['tjs'].values
-
-    mean_expression = tjs / vals['nc']
-    forfit = (np.isfinite(size_g)) & (size_g < 1e6) & (mean_expression > 1e-3) & (size_g > 0)
-
-    log2_mean_expr = np.log2(mean_expression, where=(mean_expression > 0))
-    higher = log2_mean_expr > 4
-    if np.sum(higher & forfit) > 2000:
-        forfit = higher & forfit
-
-    y = np.log(size_g[forfit])
-    x = np.log(mean_expression[forfit])
-
-    X = sm.add_constant(x)
-    model = sm.OLS(y, X).fit()
-
-    if not suppress_plot:
-        plt.figure(figsize=(7, 6))
-        plt.scatter(x, y, alpha=0.5, label="Data Points")
-        plt.plot(x, model.fittedvalues, color='red', label='Regression Fit')
-        plt.title('Dispersion vs. Mean Expression')
-        plt.xlabel("Log Mean Expression")
-        plt.ylabel("Log Size (Dispersion)")
-        plt.legend()
-        plt.grid(True)
-        plt.show()
-
-    return model.params
-
-
-def NBumiFeatureSelectionHighVarCPU(fit: dict) -> pd.DataFrame:
-    """
-    Selects features (genes) with higher variance than expected.
-    """
-    start_time = time.perf_counter()
-    print(f"FUNCTION: NBumiFeatureSelectionHighVarCPU()")
-
-    print("Phase [1/1]: Calculating residuals for high variance selection...")
-    vals = fit['vals']
-    coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
-
-    mean_expression = vals['tjs'].values / vals['nc']
-
-    with np.errstate(divide='ignore', invalid='ignore'):
-        log_mean_expression = np.log(mean_expression)
-        log_mean_expression[np.isneginf(log_mean_expression)] = 0
-        exp_size = np.exp(coeffs[0] + coeffs[1] * log_mean_expression)
-
-    with np.errstate(divide='ignore', invalid='ignore'):
-        res = np.log(fit['sizes'].values) - np.log(exp_size)
-
-    results_df = pd.DataFrame({
-        'Gene': fit['sizes'].index,
-        'Residual': res
-    })
-
-    final_table = results_df.sort_values(by='Residual', ascending=True)
-    print("Phase [1/1]: COMPLETE")
-
-    end_time = time.perf_counter()
-    print(f"Total time: {end_time - start_time:.4f} seconds.\n")
-
-    return final_table
-
-
-def NBumiFeatureSelectionCombinedDropCPU(
-    fit: dict,
-    cleaned_filename: str,
-    chunk_size: int = 5000,
-    method="fdr_bh",
-    qval_thresh=0.05
-) -> pd.DataFrame:
-    """
-    Selects features with a significantly higher dropout rate than expected,
-    using an out-of-core, CPU-only approach.
-    """
-    start_time = time.perf_counter()
-    print(f"FUNCTION: NBumiFeatureSelectionCombinedDropCPU() | FILE: {cleaned_filename}")
-
-    print("Phase [1/3]: Initializing arrays and calculating expected dispersion...")
-    vals = fit['vals']
-    coeffs = NBumiFitDispVsMeanCPU(fit, suppress_plot=True)
-
-    tjs = vals['tjs'].values.astype(np.float64)
-    tis = vals['tis'].values.astype(np.float64)
-    total = vals['total']
-    nc = vals['nc']
-    ng = vals['ng']
-
-    mean_expression = tjs / nc
-    with np.errstate(divide='ignore'):
-        exp_size = np.exp(coeffs[0] + coeffs[1] * np.log(mean_expression, where=(mean_expression > 0)))
-    exp_size = np.nan_to_num(exp_size, nan=1.0, posinf=1e6, neginf=1.0)
-
-    p_sum = np.zeros(ng, dtype=np.float64)
-    p_var_sum = np.zeros(ng, dtype=np.float64)
-    print("Phase [1/3]: COMPLETE")
-
-    print("Phase [2/3]: Calculating expected dropout sums from data chunks...")
-    for i in range(0, nc, chunk_size):
-        end_col = min(i + chunk_size, nc)
-        print(f"Phase [2/3]: Processing: {end_col} of {nc} cells.", end='\r')
-
-        tis_chunk = tis[i:end_col]
-        if tis_chunk.size == 0:
-            continue
-
-        mu_chunk = tjs[:, np.newaxis] * tis_chunk[np.newaxis, :] / total
-        base = 1 + mu_chunk / exp_size[:, np.newaxis]
-        base = np.maximum(base, 1e-12)
-        p_is_chunk = np.power(base, -exp_size[:, np.newaxis])
-        p_is_chunk = np.nan_to_num(p_is_chunk, nan=0.0, posinf=1.0, neginf=0.0)
-
-        p_var_is_chunk = p_is_chunk * (1 - p_is_chunk)
-
-        p_sum += np.sum(p_is_chunk, axis=1)
-        p_var_sum += np.sum(p_var_is_chunk, axis=1)
-
-    print(f"Phase [2/3]: COMPLETE {' ' * 50}")
-
-    print("Phase [3/3]: Performing statistical test and adjusting p-values...")
-
-    droprate_exp = p_sum / nc
-    droprate_exp_err = np.sqrt(p_var_sum / (nc**2))
-
-    droprate_obs = vals['djs'].values / nc
-
-    diff = droprate_obs - droprate_exp
-    combined_err = np.sqrt(droprate_exp_err**2 + (droprate_obs * (1 - droprate_obs) / nc))
-
-    with np.errstate(divide='ignore', invalid='ignore'):
-        Zed = diff / combined_err
-
-    pvalue = norm.sf(Zed)
-
-    results_df = pd.DataFrame({
-        'Gene': vals['tjs'].index,
-        'p.value': pvalue,
-        'effect_size': diff
-    })
-    results_df = results_df.sort_values(by='p.value')
-
-    qval = multipletests(results_df['p.value'].fillna(1), method=method)[1]
-    results_df['q.value'] = qval
-    final_table = results_df[results_df['q.value'] < qval_thresh]
-    print("Phase [3/3]: COMPLETE")
-
-    end_time = time.perf_counter()
-    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-    return final_table[['Gene', 'effect_size', 'p.value', 'q.value']]
-
-
-def NBumiCombinedDropVolcanoCPU(
-    results_df: pd.DataFrame,
-    qval_thresh: float = 0.05,
-    effect_size_thresh: float = 0.25,
-    top_n_genes: int = 10,
-    suppress_plot: bool = False,
-    plot_filename: str = None
-):
-    """
-    Generates a volcano plot from the results of feature selection (CPU version).
-    """
-    start_time = time.perf_counter()
-    print(f"FUNCTION: NBumiCombinedDropVolcanoCPU()")
-
-    print("Phase [1/1]: Preparing data for visualization...")
-    df = results_df.copy()
-
-    non_zero_min = df[df['q.value'] > 0]['q.value'].min()
-    df['q.value'] = df['q.value'].replace(0, non_zero_min)
-    df['-log10_qval'] = -np.log10(df['q.value'])
-
-    df['color'] = 'grey'
-    sig_up = (df['q.value'] < qval_thresh) & (df['effect_size'] > effect_size_thresh)
-    sig_down = (df['q.value'] < qval_thresh) & (df['effect_size'] < -effect_size_thresh)
-    df.loc[sig_up, 'color'] = 'red'
-    df.loc[sig_down, 'color'] = 'blue'
-
-    print("Phase [1/1]: COMPLETE")
-    print("Phase [2/2]: Generating plot...")
-
-    plt.figure(figsize=(10, 8))
-    plt.scatter(df['effect_size'], df['-log10_qval'], c=df['color'], s=10, alpha=0.6)
-    plt.axvline(x=effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
-    plt.axvline(x=-effect_size_thresh, linestyle='--', color='grey', linewidth=0.8)
-    plt.axhline(y=-np.log10(qval_thresh), linestyle='--', color='grey', linewidth=0.8)
-
-    top_genes = df.nsmallest(top_n_genes, 'q.value')
-    for _, row in top_genes.iterrows():
-        plt.text(row['effect_size'], row['-log10_qval'], row['Gene'],
-                 fontsize=9, ha='left', va='bottom', alpha=0.8)
-
-    plt.title('Volcano Plot of Dropout Feature Selection')
-    plt.xlabel('Effect Size (Observed - Expected Dropout Rate)')
-    plt.ylabel('-log10 (Adjusted p-value)')
-    plt.grid(True, linestyle='--', alpha=0.3)
-
-    ax = plt.gca()
-
-    if plot_filename:
-        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-        print(f"STATUS: Volcano plot saved to '{plot_filename}'")
-
-    if not suppress_plot:
-        plt.show()
-
-    plt.close()
-
-    print("Phase [2/2]: COMPLETE")
-
-    end_time = time.perf_counter()
-    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-    return ax
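
Note: the entry points deleted here survive under the re-cased module name (m3Drop/CoreCPU.py in the RECORD above) and are still re-exported from m3Drop/__init__.py. A minimal sketch of the out-of-core CPU pipeline they form, with hypothetical filenames:

    from m3Drop import (
        ConvertDataSparseCPU,
        hidden_calc_valsCPU,
        NBumiFitModelCPU,
        NBumiFeatureSelectionCombinedDropCPU,
        NBumiCombinedDropVolcanoCPU,
    )

    ConvertDataSparseCPU("raw.h5ad", "cleaned.h5ad")   # drop all-zero genes, ceil counts
    stats = hidden_calc_valsCPU("cleaned.h5ad")        # per-cell/per-gene totals and dropout counts
    fit = NBumiFitModelCPU("cleaned.h5ad", stats)      # per-gene NB size (dispersion) estimates
    hits = NBumiFeatureSelectionCombinedDropCPU(fit, "cleaned.h5ad")  # genes with excess dropouts
    NBumiCombinedDropVolcanoCPU(hits, suppress_plot=True, plot_filename="volcano.png")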