M3Drop: m3drop-0.4.34-py3-none-any.whl → m3drop-0.4.36-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- m3Drop/diagnosticsGPU.py +92 -424
- {m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/METADATA +1 -1
- {m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/RECORD +6 -6
- {m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/WHEEL +0 -0
- {m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/licenses/LICENSE +0 -0
- {m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/top_level.txt +0 -0
m3Drop/diagnosticsGPU.py CHANGED

@@ -1,395 +1,124 @@
-
-
-import cupy as cp
-import cupyx.scipy.sparse as csp
-import matplotlib.pyplot as plt
-import h5py
-import os
-import time
-import psutil
-import gc
-from scipy import sparse
-from scipy import stats
-import anndata # <--- FIXED: Added missing import
-
-# [GOVERNOR INTEGRATION] Added get_optimal_chunk_size
-from .coreGPU import hidden_calc_valsGPU, NBumiFitModelGPU, NBumiFitDispVsMeanGPU, get_optimal_chunk_size
-from cupy.sparse import csr_matrix as cp_csr_matrix
-import scipy.sparse as sp
-from scipy.sparse import csr_matrix as sp_csr_matrix
-
-import statsmodels.api as sm
-from scipy.stats import norm
-from statsmodels.stats.multitest import multipletests
-
-def NBumiFitBasicModelGPU(
+def NBumiCompareModelsGPU(
+    raw_filename: str,
     cleaned_filename: str,
     stats: dict,
-
-    chunk_size: int = None
+    fit_adjust: dict,
+    chunk_size: int = None,
+    suppress_plot=False,
+    plot_filename=None
 ) -> dict:
     """
-
-
+    OPTIMIZED VERSION (IN-MEMORY):
+    - Eliminates the 46GB '_basic_norm.h5ad' temporary file.
+    - Performs depth normalization and variance calculation on-the-fly in GPU VRAM.
     """
-
-    print(f"FUNCTION:
+    pipeline_start_time = time.time()
+    print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")
 
-    # [GOVERNOR
+    # [GOVERNOR] High multiplier (12.0) because we hold Raw + Norm + Square in VRAM
     if chunk_size is None:
-        chunk_size = get_optimal_chunk_size(
+        chunk_size = get_optimal_chunk_size(raw_filename, multiplier=12.0, is_dense=False)
 
-    # --- Phase 1:
-    print("Phase [1/
-
+    # --- Phase 1: In-Memory "Basic Fit" (Normalization + Variance) ---
+    print("Phase [1/3]: Calculating Basic Model (Depth-Normalized) variance on-the-fly...")
+
+    # 1. Prepare Size Factors (CPU)
+    tjs = stats['tjs'].values # Gene sums (needed for final dataframe)
+    tis = stats['tis'].values # Cell sums (needed for size factors)
     nc, ng = stats['nc'], stats['ng']
-
-
+
+    median_sum = np.median(tis[tis > 0])
+    size_factors = np.ones_like(tis, dtype=np.float32)
+    non_zero_mask = tis > 0
+    size_factors[non_zero_mask] = tis[non_zero_mask] / median_sum
+
+    # 2. Prepare GPU Arrays
     sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
-
-
-    #
-
-
-
-
-        h5_data = x_group['data']
-        h5_indices = x_group['indices']
+    sum_x_gpu = cp.zeros(ng, dtype=cp.float64) # Need sum(x) to calc mean(x) for variance
+
+    # 3. GPU Loop (Raw Data -> Normalize -> Accumulate)
+    with h5py.File(raw_filename, 'r') as f_in:
+        h5_indptr = f_in['X']['indptr']
+        h5_data = f_in['X']['data']
+        h5_indices = f_in['X']['indices']
 
         for i in range(0, nc, chunk_size):
             end_row = min(i + chunk_size, nc)
-            print(f"Phase [
-
+            print(f"Phase [1/3]: Processing: {end_row} of {nc} cells.", end='\r')
+
             start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-            if start_idx == end_idx:
-
+            if start_idx == end_idx: continue
+
+            # Load Raw Chunk
+            data_gpu = cp.asarray(h5_data[start_idx:end_idx], dtype=cp.float32)
+            indices_gpu = cp.asarray(h5_indices[start_idx:end_idx])
+            indptr_gpu = cp.asarray(h5_indptr[i:end_row + 1] - start_idx)
+
+            # Expand Size Factors to match Data Structure
+            # (Map cell's size factor to every non-zero gene in that cell)
+            nnz_in_chunk = indptr_gpu[-1].item()
+            cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
+            if len(indptr_gpu) > 1:
+                cell_boundary_markers[indptr_gpu[:-1]] = 1
+            # row_indices maps every data point to its cell index (0 to chunk_size)
+            row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
 
-            #
-
+            # Get size factors for this chunk
+            sf_chunk = cp.asarray(size_factors[i:end_row])
 
-
-
-
-
-
-
-
-
-
-
-                # Accumulate the sum of squares for each gene
-                cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
-
-                # Free GPU memory
-                del data_gpu, indices_gpu
-                cp.get_default_memory_pool().free_all_blocks()
-            else:
-                # Original processing for smaller chunks
-                data_slice = h5_data[start_idx:end_idx]
-                indices_slice = h5_indices[start_idx:end_idx]
+            # --- THE MAGIC: On-the-Fly Normalization ---
+            # data_norm = data_raw / size_factor
+            data_gpu /= sf_chunk[row_indices]
+
+            # Accumulate for Variance: E[X^2] and E[X]
+            cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
+            cp.add.at(sum_x_gpu, indices_gpu, data_gpu)
+
+            # Clean up VRAM
+            del data_gpu, indices_gpu, indptr_gpu, row_indices, sf_chunk, cell_boundary_markers
+            cp.get_default_memory_pool().free_all_blocks()
 
-
-                indices_gpu = cp.asarray(indices_slice)
+    print(f"Phase [1/3]: COMPLETE{' '*50}")
 
-
-
-
-
-
-    cp.get_default_memory_pool().free_all_blocks()
+    # 4. Finalize Basic Statistics
+    # Var(X) = E[X^2] - (E[X])^2
+    mean_x_sq_gpu = sum_x_sq_gpu / nc
+    mean_mu_gpu = sum_x_gpu / nc
+    my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
 
-
-
-    # --- Final calculations on GPU ---
-    if is_logged:
-        raise NotImplementedError("Logged data variance calculation is not implemented for out-of-core.")
-    else:
-        # Variance of raw data: Var(X) = E[X^2] - E[X]^2
-        mean_x_sq_gpu = sum_x_sq_gpu / nc
-        mean_mu_gpu = tjs_gpu / nc
-        my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
-
-    # Calculate dispersion ('size')
-    size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
+    # Dispersion = Mean^2 / (Var - Mean)
+    size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
 
+    # Safety Clamping (Same as original)
     max_size_val = cp.nanmax(size_gpu) * 10
-    if cp.isnan(max_size_val):
-        max_size_val = 1000
+    if cp.isnan(max_size_val): max_size_val = 1000
     size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
    size_gpu[size_gpu < 1e-10] = 1e-10
 
-    #
-
-
-
-    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-    return {
-        'var_obs': pd.Series(my_rowvar_cpu, index=stats['tjs'].index),
-        'sizes': pd.Series(sizes_cpu, index=stats['tjs'].index),
-        'vals': stats
+    # Construct "Basic Fit" Object
+    fit_basic = {
+        'sizes': pd.Series(size_gpu.get(), index=stats['tjs'].index),
+        'vals': stats,
+        'var_obs': pd.Series(my_rowvar_gpu.get(), index=stats['tjs'].index)
     }
-
-def NBumiCheckFitFSGPU(
-    cleaned_filename: str,
-    fit: dict,
-    chunk_size: int = None,
-    suppress_plot=False,
-    plot_filename=None
-) -> dict:
-    """
-    FIXED VERSION - No cupy.errstate, proper GPU computation.
-    """
-    start_time = time.perf_counter()
-    print(f"FUNCTION: NBumiCheckFitFS() | FILE: {cleaned_filename}")
-
-    # [GOVERNOR INTEGRATION] Adaptive chunk sizing
-    if chunk_size is None:
-        chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=5.0, is_dense=True)
-
-    # --- Phase 1: Initialization ---
-    print("Phase [1/2]: Initializing parameters and arrays on GPU...")
-    vals = fit['vals']
-    size_coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
-
-    # Must use float64 for precision
-    tjs_gpu = cp.asarray(vals['tjs'].values, dtype=cp.float64)
-    tis_gpu = cp.asarray(vals['tis'].values, dtype=cp.float64)
-    total = vals['total']
-    nc, ng = vals['nc'], vals['ng']
-
-    # Calculate smoothed size
-    mean_expression_gpu = tjs_gpu / nc
-    log_mean_expression_gpu = cp.log(mean_expression_gpu)
-    smoothed_size_gpu = cp.exp(size_coeffs[0] + size_coeffs[1] * log_mean_expression_gpu)
-
-    # Initialize result arrays
-    row_ps_gpu = cp.zeros(ng, dtype=cp.float64)
-    col_ps_gpu = cp.zeros(nc, dtype=cp.float64)
-    print("Phase [1/2]: COMPLETE")
-
-    # --- Phase 2: Calculate Expected Dropouts ---
-    print("Phase [2/2]: Calculating expected dropouts from data chunks...")
 
-    #
-
-    print(f" Using governor chunk size: {optimal_chunk}")
+    # --- Phase 2: Check Fit (Calculate Errors) ---
+    print("Phase [2/3]: Evaluating fit errors on ORIGINAL data...")
 
-
-
-
-
-        tis_chunk_gpu = tis_gpu[i:end_col]
-
-        # Standard calculation without errstate
-        mu_chunk_gpu = tjs_gpu[:, cp.newaxis] * tis_chunk_gpu[cp.newaxis, :] / total
-
-        # Calculate p_is directly - CuPy handles overflow internally
-        base = 1 + mu_chunk_gpu / smoothed_size_gpu[:, cp.newaxis]
-        p_is_chunk_gpu = cp.power(base, -smoothed_size_gpu[:, cp.newaxis])
-
-        # Handle any inf/nan values that might have occurred
-        p_is_chunk_gpu = cp.nan_to_num(p_is_chunk_gpu, nan=0.0, posinf=1.0, neginf=0.0)
-
-        # Sum results
-        row_ps_gpu += p_is_chunk_gpu.sum(axis=1)
-        col_ps_gpu[i:end_col] = p_is_chunk_gpu.sum(axis=0)
-
-        # Clean up
-        del mu_chunk_gpu, p_is_chunk_gpu, base, tis_chunk_gpu
-
-        # Periodic memory cleanup
-        mempool = cp.get_default_memory_pool()
-        if (i // optimal_chunk) % 10 == 0:
-            mempool.free_all_blocks()
-
-    print(f"Phase [2/2]: COMPLETE{' ' * 50}")
-
-    # Move results to CPU
-    row_ps_cpu = row_ps_gpu.get()
-    col_ps_cpu = col_ps_gpu.get()
-    djs_cpu = vals['djs'].values
-    dis_cpu = vals['dis'].values
-
-    # Plotting
-    if not suppress_plot:
-        plt.figure(figsize=(12, 5))
-        plt.subplot(1, 2, 1)
-        plt.scatter(djs_cpu, row_ps_cpu, alpha=0.5, s=10)
-        plt.title("Gene-specific Dropouts (Smoothed)")
-        plt.xlabel("Observed")
-        plt.ylabel("Fit")
-        lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
-        plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
-        plt.grid(True); plt.legend()
-
-        plt.subplot(1, 2, 2)
-        plt.scatter(dis_cpu, col_ps_cpu, alpha=0.5, s=10)
-        plt.title("Cell-specific Dropouts (Smoothed)")
-        plt.xlabel("Observed")
-        plt.ylabel("Expected")
-        lims = [min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1])]
-        plt.plot(lims, lims, 'r-', alpha=0.75, zorder=0, label="y=x line")
-        plt.grid(True); plt.legend()
-
-        plt.tight_layout()
-        if plot_filename:
-            plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-            print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
-        plt.show()
-        plt.close()
-
-    # Calculate errors
-    gene_error = np.sum((djs_cpu - row_ps_cpu)**2)
-    cell_error = np.sum((dis_cpu - col_ps_cpu)**2)
-
-    end_time = time.perf_counter()
-    print(f"Total time: {end_time - start_time:.2f} seconds.\n")
-
-    return {
-        'gene_error': gene_error,
-        'cell_error': cell_error,
-        'rowPs': pd.Series(row_ps_cpu, index=fit['vals']['tjs'].index),
-        'colPs': pd.Series(col_ps_cpu, index=fit['vals']['tis'].index)
-    }
-
-def NBumiCompareModelsGPU(
-    raw_filename: str,
-    cleaned_filename: str,
-    stats: dict,
-    fit_adjust: dict,
-    chunk_size: int = None,
-    suppress_plot=False,
-    plot_filename=None
-) -> dict:
-    """
-    OPTIMIZED VERSION - Faster normalization and sparse matrix writing.
-    """
-    pipeline_start_time = time.time()
-    print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")
-
-    # [GOVERNOR INTEGRATION] Calculate chunk size for normalization phase (heavy IO)
-    if chunk_size is None:
-        # Multiplier 10.0 for safety during normalization of massive dense expansion
-        chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=10.0, is_dense=True)
-
-    # --- Phase 1: OPTIMIZED Normalization ---
-    print("Phase [1/4]: Creating temporary 'basic' normalized data file...")
-    basic_norm_filename = cleaned_filename.replace('.h5ad', '_basic_norm.h5ad')
-
-    # Read metadata. In 'backed' mode, this keeps a file handle open.
-    adata_meta = anndata.read_h5ad(cleaned_filename, backed='r')
-    nc, ng = adata_meta.shape
-    obs_df = adata_meta.obs.copy()
-    var_df = adata_meta.var.copy()
-
-    cell_sums = stats['tis'].values
-    median_sum = np.median(cell_sums[cell_sums > 0])
+    # Check Adjust (M3Drop)
+    check_adjust = NBumiCheckFitFSGPU(
+        cleaned_filename, fit_adjust, suppress_plot=True, chunk_size=chunk_size
+    )
 
-    #
-
-
-
-
-    adata_out = anndata.AnnData(obs=obs_df, var=var_df)
-    adata_out.write_h5ad(basic_norm_filename, compression="gzip")
-
-    with h5py.File(basic_norm_filename, 'a') as f_out:
-        if 'X' in f_out:
-            del f_out['X']
-        x_group_out = f_out.create_group('X')
-        x_group_out.attrs['encoding-type'] = 'csr_matrix'
-        x_group_out.attrs['encoding-version'] = '0.1.0'
-        x_group_out.attrs['shape'] = np.array([nc, ng], dtype='int64')
-
-        out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
-        out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
-        out_indptr = x_group_out.create_dataset('indptr', shape=(nc + 1,), dtype='int64')
-        out_indptr[0] = 0
-        current_nnz = 0
-
-        with h5py.File(cleaned_filename, 'r') as f_in:
-            h5_indptr = f_in['X']['indptr']
-            h5_data = f_in['X']['data']
-            h5_indices = f_in['X']['indices']
-
-            for i in range(0, nc, chunk_size):
-                end_row = min(i + chunk_size, nc)
-                print(f"Phase [1/4]: Normalizing: {end_row} of {nc} cells.", end='\r')
-
-                start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-                if start_idx == end_idx:
-                    out_indptr[i + 1 : end_row + 1] = current_nnz
-                    continue
-
-                # Read data for the chunk
-                data_slice = h5_data[start_idx:end_idx]
-                indices_slice = h5_indices[start_idx:end_idx]
-                indptr_slice = h5_indptr[i:end_row + 1] - start_idx
-
-                # Move to GPU for fast normalization
-                data_gpu = cp.asarray(data_slice.copy(), dtype=cp.float32)
-
-                indptr_gpu = cp.asarray(indptr_slice.copy())
-                nnz_in_chunk = indptr_gpu[-1].item()
-                cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
-                if len(indptr_gpu) > 1:
-                    cell_boundary_markers[indptr_gpu[:-1]] = 1
-                row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
-
-                size_factors_for_chunk = cp.asarray(size_factors[i:end_row])
-
-                data_gpu /= size_factors_for_chunk[row_indices]
-
-                data_cpu = np.round(data_gpu.get())
-
-                num_cells_in_chunk = end_row - i
-                chunk_sp = sp_csr_matrix((data_cpu, indices_slice, indptr_slice),
-                                         shape=(num_cells_in_chunk, ng))
-
-                nnz_chunk = chunk_sp.nnz
-                out_data.resize(current_nnz + nnz_chunk, axis=0)
-                out_data[current_nnz:] = chunk_sp.data
-
-                out_indices.resize(current_nnz + nnz_chunk, axis=0)
-                out_indices[current_nnz:] = chunk_sp.indices
-
-                new_indptr_list = chunk_sp.indptr[1:].astype(np.int64) + current_nnz
-                out_indptr[i + 1 : end_row + 1] = new_indptr_list
-
-                current_nnz += nnz_chunk
-
-                del data_gpu, row_indices, size_factors_for_chunk, indptr_gpu
-                cp.get_default_memory_pool().free_all_blocks()
-
-    print(f"Phase [1/4]: COMPLETE{' '*50}")
+    # Check Basic (Depth-Norm)
+    check_basic = NBumiCheckFitFSGPU(
+        cleaned_filename, fit_basic, suppress_plot=True, chunk_size=chunk_size
+    )
+    print("Phase [2/3]: COMPLETE")
 
-
-    # [GOVERNOR INTEGRATION] Calculate chunk size for basic fit on the heavy normalized file
-    chunk_size_basic = get_optimal_chunk_size(basic_norm_filename, multiplier=10.0, is_dense=True)
+    # --- Phase 3: Plotting & Comparison ---
+    print("Phase [3/3]: Generating comparison...")
 
-    stats_basic = hidden_calc_valsGPU(basic_norm_filename) # hidden_calc uses its own governor internally
-    fit_basic = NBumiFitBasicModelGPU(basic_norm_filename, stats_basic, chunk_size=chunk_size_basic)
-    print("Phase [2/4]: COMPLETE")
-
-    print("Phase [3/4]: Evaluating fits of both models on ORIGINAL data...")
-    # [GOVERNOR INTEGRATION] Chunk size for check fit
-    chunk_size_check = get_optimal_chunk_size(cleaned_filename, multiplier=5.0, is_dense=True)
-
-    check_adjust = NBumiCheckFitFSGPU(cleaned_filename, fit_adjust, suppress_plot=True, chunk_size=chunk_size_check)
-
-    fit_basic_for_eval = {
-        'sizes': fit_basic['sizes'],
-        'vals': stats,
-        'var_obs': fit_basic['var_obs']
-    }
-    check_basic = NBumiCheckFitFSGPU(cleaned_filename, fit_basic_for_eval, suppress_plot=True, chunk_size=chunk_size_check)
-    print("Phase [3/4]: COMPLETE")
-
-    print("Phase [4/4]: Generating final comparison...")
     nc_data = stats['nc']
     mean_expr = stats['tjs'] / nc_data
     observed_dropout = stats['djs'] / nc_data

@@ -432,72 +161,11 @@ def NBumiCompareModelsGPU(
         plt.show()
 
     plt.close()
-    print("Phase [4/4]: COMPLETE")
-
-    pipeline_end_time = time.time()
 
-
-    adata_meta.file.close() # Explicitly close the file handle
-
-    os.remove(basic_norm_filename)
-    print(f"STATUS: Temporary file '{basic_norm_filename}' removed.")
+    pipeline_end_time = time.time()
     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
 
     return {
         "errors": {"Depth-Adjusted": err_adj, "Basic": err_bas},
         "comparison_df": comparison_df
     }
-
-def NBumiPlotDispVsMeanGPU(
-    fit: dict,
-    suppress_plot: bool = False,
-    plot_filename: str = None
-):
-    """
-    Generates a diagnostic plot of the dispersion vs. mean expression.
-
-    Args:
-        fit (dict): The 'fit' object from NBumiFitModelGPU.
-        suppress_plot (bool): If True, the plot will not be displayed on screen.
-        plot_filename (str, optional): Path to save the plot. If None, not saved.
-    """
-    print("FUNCTION: NBumiPlotDispVsMean()")
-
-    # --- 1. Extract data and regression coefficients ---
-    mean_expression = fit['vals']['tjs'].values / fit['vals']['nc']
-    sizes = fit['sizes'].values
-    coeffs = NBumiFitDispVsMeanGPU(fit, suppress_plot=True)
-    intercept, slope = coeffs[0], coeffs[1]
-
-    # --- 2. Calculate the fitted line for plotting ---
-    # Create a smooth, continuous line using the regression coefficients
-    log_mean_expr_range = np.linspace(
-        np.log(mean_expression[mean_expression > 0].min()),
-        np.log(mean_expression.max()),
-        100
-    )
-    log_fitted_sizes = intercept + slope * log_mean_expr_range
-    fitted_sizes = np.exp(log_fitted_sizes)
-
-    # --- 3. Create the plot ---
-    plt.figure(figsize=(8, 6))
-    plt.scatter(mean_expression, sizes, label='Observed Dispersion', alpha=0.5, s=8)
-    plt.plot(np.exp(log_mean_expr_range), fitted_sizes, color='red', label='Regression Fit', linewidth=2)
-
-    plt.xscale('log')
-    plt.yscale('log')
-    plt.xlabel('Mean Expression')
-    plt.ylabel('Dispersion Parameter (Sizes)')
-    plt.title('Dispersion vs. Mean Expression')
-    plt.legend()
-    plt.grid(True, which="both", linestyle='--', alpha=0.6)
-
-    if plot_filename:
-        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
-        print(f"STATUS: Diagnostic plot saved to '{plot_filename}'")
-
-    if not suppress_plot:
-        plt.show()
-
-    plt.close()
-    print("FUNCTION: NBumiPlotDispVsMean() COMPLETE\n")
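Reviewer notes on the rewritten diagnosticsGPU.py

The new Phase 1 loop normalizes each CSR chunk without densifying it: the first non-zero of every cell is marked in the flat data array, a cumulative sum turns those markers into a per-non-zero row index, and a single fancy-indexed divide applies each cell's size factor to all of its non-zero entries (cell_boundary_markers / row_indices in the diff). Below is a minimal NumPy sketch of that trick with an invented 3x4 chunk; the GPU version swaps np for cp, which mirrors these calls.

    import numpy as np
    from scipy.sparse import csr_matrix

    # Toy 3-cell x 4-gene chunk (hypothetical counts, CSR layout).
    chunk = csr_matrix(np.array([
        [2, 0, 4, 0],
        [0, 1, 0, 0],
        [6, 0, 0, 2],
    ], dtype=np.float32))
    data, indices, indptr = chunk.data, chunk.indices, chunk.indptr

    # Mark the first non-zero of each cell, then cumsum so every
    # non-zero knows its row: [0, 0, 1, 2, 2]. Like the diff's version,
    # this assumes no empty cells inside the chunk.
    markers = np.zeros(indptr[-1], dtype=np.int32)
    markers[indptr[:-1]] = 1
    row_indices = np.cumsum(markers) - 1

    # Divide every non-zero by its cell's size factor, in place.
    size_factors = np.array([2.0, 1.0, 2.0], dtype=np.float32)
    data /= size_factors[row_indices]
    print(data)  # [1. 2. 1. 3. 1.]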
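The same pass accumulates per-gene sum(x) and sum(x^2) with scatter-adds (cp.add.at in the diff), which is what makes the one-pass variance possible: zeros contribute nothing to either accumulator, so visiting only the non-zeros is exact, and Var(X) = E[X^2] - (E[X])^2 finishes the job after the loop. A CPU sketch with made-up counts, checked against NumPy's own population variance:

    import numpy as np

    rng = np.random.default_rng(0)
    nc, ng = 1000, 5
    X = rng.poisson(1.0, size=(nc, ng)).astype(np.float64)  # stand-in counts

    # Streaming per-gene accumulators, updated one chunk of cells at a time.
    sum_x = np.zeros(ng)
    sum_x_sq = np.zeros(ng)
    for start in range(0, nc, 256):
        part = X[start:start + 256]
        sum_x += part.sum(axis=0)
        sum_x_sq += (part ** 2).sum(axis=0)

    # Var(X) = E[X^2] - (E[X])^2, exact in a single pass over the data.
    var_stream = sum_x_sq / nc - (sum_x / nc) ** 2
    assert np.allclose(var_stream, X.var(axis=0))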
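The dispersion that follows is the method-of-moments estimate for a negative binomial, size = mean^2 / (var - mean), kept from the original along with its clamping of NaN and non-positive estimates. A sketch with invented parameters showing the estimator recovering a known size:

    import numpy as np

    rng = np.random.default_rng(1)
    size_true, mean_true = 2.0, 5.0  # hypothetical NB parameters

    # NB sample via the gamma-Poisson mixture (shape=size, scale=mean/size).
    lam = rng.gamma(shape=size_true, scale=mean_true / size_true, size=200_000)
    x = rng.poisson(lam).astype(np.float64)

    # Method-of-moments dispersion: size = mean^2 / (var - mean).
    mean, var = x.mean(), x.var()
    print(mean ** 2 / (var - mean))  # close to 2.0

    # Clamping as in the diff: NaN or non-positive estimates (var <= mean,
    # i.e. near-Poisson genes) get a huge size; everything is floored.
    sizes = np.array([2.0, -3.0, np.nan])
    max_size = np.nanmax(sizes) * 10
    if np.isnan(max_size):
        max_size = 1000
    sizes[np.isnan(sizes) | (sizes <= 0)] = max_size
    sizes[sizes < 1e-10] = 1e-10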
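Both fits are then scored by NBumiCheckFitFSGPU against the negative binomial dropout expectation P(X=0) = (1 + mu/size)^(-size), the p_is quantity visible in the removed function body. Note that the definition of NBumiCheckFitFSGPU is deleted from this module in 0.4.36 while the new NBumiCompareModelsGPU still calls it, so it is presumably provided by another module now. A quick cross-check of the closed form against scipy's (n, p) parameterization, with invented values:

    import numpy as np
    from scipy import stats as sps

    mu, size = 4.0, 2.0  # hypothetical gene mean and dispersion
    p_zero = (1 + mu / size) ** (-size)  # expected dropout rate

    # scipy parameterizes NB as n=size, p=size/(size+mu); P(X=0) must agree.
    assert np.isclose(p_zero, sps.nbinom.pmf(0, size, size / (size + mu)))
    print(p_zero)  # 1/9, about 0.111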
{m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/RECORD CHANGED

@@ -2,11 +2,11 @@ m3Drop/__init__.py,sha256=yaUXhUArnwgLf01Zlpqa5qm9K1aByGqQupIoCaLYiDw,2462
 m3Drop/coreCPU.py,sha256=3kPYlSVlYrJEhRUCIoVzmR8CYBaHpxVM5nx-3YQI4d4,17204
 m3Drop/coreGPU.py,sha256=k7A06VNgfJ59J8g1VpfKxhTIKrEbW7Bj8pTbQqHaQL8,24571
 m3Drop/diagnosticsCPU.py,sha256=BecOKTz2GDjzjs9ycXYsyrSHi2UVgsM58RBuNE62vmU,14273
-m3Drop/diagnosticsGPU.py,sha256=
+m3Drop/diagnosticsGPU.py,sha256=jmTEN1IkxecPylAw_4zBjYrWj3MFfTGu-m9bowYsVBk,6797
 m3Drop/normalizationCPU.py,sha256=4ulCrDZZjxVFh2y0i4ayPkNCsZYaOP-Xq2Dnzu9WXtg,5697
 m3Drop/normalizationGPU.py,sha256=mHu_Or4ma6qzujGQQQ0oN3D-yoEngLAN4UTknkArRAY,8596
-m3drop-0.4.
-m3drop-0.4.
-m3drop-0.4.
-m3drop-0.4.
-m3drop-0.4.
+m3drop-0.4.36.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
+m3drop-0.4.36.dist-info/METADATA,sha256=P6wHTiOQHkAGLsF8sUaW4Dws0hpsK13j6mrOtaczj5M,5161
+m3drop-0.4.36.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+m3drop-0.4.36.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
+m3drop-0.4.36.dist-info/RECORD,,

{m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/WHEEL: file without changes
{m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/licenses/LICENSE: file without changes
{m3drop-0.4.34.dist-info → m3drop-0.4.36.dist-info}/top_level.txt: file without changes