M3Drop 0.4.35__tar.gz → 0.4.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {m3drop-0.4.35 → m3drop-0.4.37/M3Drop.egg-info}/PKG-INFO +1 -1
- {m3drop-0.4.35/M3Drop.egg-info → m3drop-0.4.37}/PKG-INFO +1 -1
- {m3drop-0.4.35 → m3drop-0.4.37}/m3Drop/diagnosticsGPU.py +105 -142
- {m3drop-0.4.35 → m3drop-0.4.37}/setup.py +1 -1
- {m3drop-0.4.35 → m3drop-0.4.37}/LICENSE +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/M3Drop.egg-info/SOURCES.txt +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/M3Drop.egg-info/dependency_links.txt +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/M3Drop.egg-info/requires.txt +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/M3Drop.egg-info/top_level.txt +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/README.md +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/m3Drop/__init__.py +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/m3Drop/coreCPU.py +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/m3Drop/coreGPU.py +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/m3Drop/diagnosticsCPU.py +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/m3Drop/normalizationCPU.py +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/m3Drop/normalizationGPU.py +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/pyproject.toml +0 -0
- {m3drop-0.4.35 → m3drop-0.4.37}/setup.cfg +0 -0
m3Drop/diagnosticsGPU.py

@@ -10,17 +10,14 @@ import psutil
 import gc
 from scipy import sparse
 from scipy import stats
-import anndata
+import anndata
 
-# [GOVERNOR INTEGRATION]
+# [GOVERNOR INTEGRATION]
 from .coreGPU import hidden_calc_valsGPU, NBumiFitModelGPU, NBumiFitDispVsMeanGPU, get_optimal_chunk_size
 from cupy.sparse import csr_matrix as cp_csr_matrix
 import scipy.sparse as sp
 from scipy.sparse import csr_matrix as sp_csr_matrix
-
 import statsmodels.api as sm
-from scipy.stats import norm
-from statsmodels.stats.multitest import multipletests
 
 def NBumiFitBasicModelGPU(
     cleaned_filename: str,
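Note: the import hunk drops two apparently unused imports (scipy.stats.norm and statsmodels' multipletests); the `import anndata` and `# [GOVERNOR INTEGRATION]` pairs appear to be whitespace-only changes. One caveat on a surviving line: `cupy.sparse` is the legacy alias of CuPy's sparse module, and recent CuPy releases document it under `cupyx.scipy.sparse` instead. A more future-proof form of that import (an editorial suggestion, not part of this diff) would be:

# Hypothetical alternative to `from cupy.sparse import csr_matrix as cp_csr_matrix`;
# `cupy.sparse` is a deprecated alias in recent CuPy versions.
from cupyx.scipy.sparse import csr_matrix as cp_csr_matrix
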
@@ -140,7 +137,7 @@ def NBumiCheckFitFSGPU(
     plot_filename=None
 ) -> dict:
     """
-
+    Calculates the fit errors (gene_error, cell_error) for a given model.
     """
     start_time = time.perf_counter()
     print(f"FUNCTION: NBumiCheckFitFS() | FILE: {cleaned_filename}")
@@ -171,14 +168,10 @@ def NBumiCheckFitFSGPU(
     print("Phase [1/2]: COMPLETE")
 
     # --- Phase 2: Calculate Expected Dropouts ---
-    print("Phase [2/2]: Calculating expected dropouts...")
-
-    # [GOVERNOR INTEGRATION] Removed naive calculation, utilizing Governor's chunk_size
-    optimal_chunk = chunk_size
-    print(f" Using governor chunk size: {optimal_chunk}")
+    print(f"Phase [2/2]: Calculating expected dropouts (Chunk: {chunk_size})...")
 
-    for i in range(0, nc, optimal_chunk):
-        end_col = min(i + optimal_chunk, nc)
+    for i in range(0, nc, chunk_size):
+        end_col = min(i + chunk_size, nc)
         print(f"Phase [2/2]: Processing: {end_col} of {nc} cells.", end='\r')
 
         tis_chunk_gpu = tis_gpu[i:end_col]
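Note: the "[GOVERNOR INTEGRATION]" comments refer to get_optimal_chunk_size() from coreGPU, whose internals are not part of this diff. As a rough illustration only (the function name is real, but the sizing heuristic and every other detail below are assumptions, not the package's actual implementation), a VRAM-aware chunk governor can be sketched like this:

import cupy as cp
import h5py

def sketch_optimal_chunk_size(filename: str, multiplier: float = 12.0) -> int:
    """Illustrative only: derive a cell-chunk size from free VRAM.

    `multiplier` approximates how many transient copies of a chunk live in
    VRAM at once (raw data, normalized data, squares, index arrays, ...).
    """
    free_bytes, _total_bytes = cp.cuda.Device().mem_info
    with h5py.File(filename, 'r') as f:
        nnz = f['X']['data'].shape[0]            # total nonzeros in the CSR matrix
        n_cells = f['X']['indptr'].shape[0] - 1  # rows = cells
    bytes_per_cell = max(1, nnz // n_cells) * 4  # float32 payload per cell
    return max(1, int((free_bytes / multiplier) // bytes_per_cell))
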
@@ -199,11 +192,7 @@ def NBumiCheckFitFSGPU(
 
         # Clean up
         del mu_chunk_gpu, p_is_chunk_gpu, base, tis_chunk_gpu
-
-        # Periodic memory cleanup
-        mempool = cp.get_default_memory_pool()
-        if (i // optimal_chunk) % 10 == 0:
-            mempool.free_all_blocks()
+        cp.get_default_memory_pool().free_all_blocks()
 
     print(f"Phase [2/2]: COMPLETE{' ' * 50}")
 
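Note: the hunk above swaps a periodic cleanup (every 10th chunk) for an unconditional free after each chunk. CuPy's free_all_blocks() returns the pool's cached-but-unused blocks to the device, so the VRAM high-water mark stays flat at the cost of some allocator churn. A minimal standalone illustration of the pattern (stand-in arrays, not the module's data):

import cupy as cp

pool = cp.get_default_memory_pool()
for _ in range(3):
    chunk = cp.zeros(1_000_000, dtype=cp.float32)  # stand-in for one chunk's arrays
    # ... per-chunk work would happen here ...
    del chunk                                      # drop the last reference
    pool.free_all_blocks()                         # hand cached blocks back to the driver
    print(pool.used_bytes(), pool.total_bytes())   # both fall back toward zero
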
@@ -256,7 +245,7 @@ def NBumiCheckFitFSGPU(
     }
 
 def NBumiCompareModelsGPU(
-    raw_filename: str,
+    raw_filename: str,  # Kept for API compatibility, but functionally we use cleaned_filename for indices
     cleaned_filename: str,
     stats: dict,
     fit_adjust: dict,
@@ -265,131 +254,118 @@ def NBumiCompareModelsGPU(
     plot_filename=None
 ) -> dict:
     """
-    OPTIMIZED VERSION -
+    OPTIMIZED VERSION (IN-MEMORY):
+    - Eliminates the 46GB '_basic_norm.h5ad' temporary file.
+    - Performs depth normalization and variance calculation on-the-fly in GPU VRAM.
+    - PRESERVED SCIENTIFIC LOGIC: Var(X) = E[X^2] - (E[X])^2 on normalized data.
     """
     pipeline_start_time = time.time()
     print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")
 
-    # [GOVERNOR
+    # [GOVERNOR] High multiplier (12.0) because we hold Raw + Norm + Square in VRAM
     if chunk_size is None:
-
-
-
-
-    print("Phase [1/4]: Creating temporary 'basic' normalized data file...")
-    basic_norm_filename = cleaned_filename.replace('.h5ad', '_basic_norm.h5ad')
-
-    # Read metadata. In 'backed' mode, this keeps a file handle open.
-    adata_meta = anndata.read_h5ad(cleaned_filename, backed='r')
-    nc, ng = adata_meta.shape
-    obs_df = adata_meta.obs.copy()
-    var_df = adata_meta.var.copy()
+        chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=12.0, is_dense=False)
+
+    # --- Phase 1: In-Memory "Basic Fit" (Normalization + Variance) ---
+    print("Phase [1/3]: Calculating Basic Model (Depth-Normalized) variance on-the-fly...")
 
-
-
+    # 1. Prepare Size Factors (CPU)
+    tjs = stats['tjs'].values  # Gene sums
+    tis = stats['tis'].values  # Cell sums
+    nc, ng = stats['nc'], stats['ng']
 
-
-    size_factors = np.ones_like(
-    non_zero_mask =
-    size_factors[non_zero_mask] =
-
-
-
-
-
-
-
-
-
-
-
-
-    out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
-    out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
-    out_indptr = x_group_out.create_dataset('indptr', shape=(nc + 1,), dtype='int64')
-    out_indptr[0] = 0
-    current_nnz = 0
-
-    with h5py.File(cleaned_filename, 'r') as f_in:
-        h5_indptr = f_in['X']['indptr']
-        h5_data = f_in['X']['data']
-        h5_indices = f_in['X']['indices']
-
-        for i in range(0, nc, chunk_size):
-            end_row = min(i + chunk_size, nc)
-            print(f"Phase [1/4]: Normalizing: {end_row} of {nc} cells.", end='\r')
-
-            start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-            if start_idx == end_idx:
-                out_indptr[i + 1 : end_row + 1] = current_nnz
-                continue
-
-            # Read data for the chunk
-            data_slice = h5_data[start_idx:end_idx]
-            indices_slice = h5_indices[start_idx:end_idx]
-            indptr_slice = h5_indptr[i:end_row + 1] - start_idx
-
-            # Move to GPU for fast normalization
-            data_gpu = cp.asarray(data_slice.copy(), dtype=cp.float32)
-
-            indptr_gpu = cp.asarray(indptr_slice.copy())
-            nnz_in_chunk = indptr_gpu[-1].item()
-            cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
-            if len(indptr_gpu) > 1:
-                cell_boundary_markers[indptr_gpu[:-1]] = 1
-            row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
-
-            size_factors_for_chunk = cp.asarray(size_factors[i:end_row])
-
-            data_gpu /= size_factors_for_chunk[row_indices]
-
-            data_cpu = np.round(data_gpu.get())
-
-            num_cells_in_chunk = end_row - i
-            chunk_sp = sp_csr_matrix((data_cpu, indices_slice, indptr_slice),
-                                     shape=(num_cells_in_chunk, ng))
-
-            nnz_chunk = chunk_sp.nnz
-            out_data.resize(current_nnz + nnz_chunk, axis=0)
-            out_data[current_nnz:] = chunk_sp.data
-
-            out_indices.resize(current_nnz + nnz_chunk, axis=0)
-            out_indices[current_nnz:] = chunk_sp.indices
-
-            new_indptr_list = chunk_sp.indptr[1:].astype(np.int64) + current_nnz
-            out_indptr[i + 1 : end_row + 1] = new_indptr_list
-
-            current_nnz += nnz_chunk
+    median_sum = np.median(tis[tis > 0])
+    size_factors = np.ones_like(tis, dtype=np.float32)
+    non_zero_mask = tis > 0
+    size_factors[non_zero_mask] = tis[non_zero_mask] / median_sum
+
+    # 2. Prepare GPU Arrays
+    sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
+    sum_x_gpu = cp.zeros(ng, dtype=cp.float64)  # Need sum(x) to calc mean(x) for variance
+
+    # 3. GPU Loop (Raw Data -> Normalize -> Accumulate)
+    # CRITICAL: We read CLEANED_FILENAME to ensure indices match 'stats'
+    with h5py.File(cleaned_filename, 'r') as f_in:
+        h5_indptr = f_in['X']['indptr']
+        h5_data = f_in['X']['data']
+        h5_indices = f_in['X']['indices']
 
-
-
+        for i in range(0, nc, chunk_size):
+            end_row = min(i + chunk_size, nc)
+            print(f"Phase [1/3]: Processing: {end_row} of {nc} cells.", end='\r')
+
+            start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
+            if start_idx == end_idx: continue
+
+            # Load Raw Chunk
+            data_gpu = cp.asarray(h5_data[start_idx:end_idx], dtype=cp.float32)
+            indices_gpu = cp.asarray(h5_indices[start_idx:end_idx])
+            indptr_gpu = cp.asarray(h5_indptr[i:end_row + 1] - start_idx)
+
+            # Expand Size Factors to match Data Structure
+            nnz_in_chunk = indptr_gpu[-1].item()
+            cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
+            if len(indptr_gpu) > 1:
+                cell_boundary_markers[indptr_gpu[:-1]] = 1
+            # row_indices maps every data point to its cell index (0 to chunk_size)
+            row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
+
+            # Get size factors for this chunk
+            sf_chunk = cp.asarray(size_factors[i:end_row])
+
+            # --- THE MAGIC: On-the-Fly Normalization ---
+            # data_norm = data_raw / size_factor
+            data_gpu /= sf_chunk[row_indices]
+
+            # Accumulate for Variance: E[X^2] and E[X]
+            cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
+            cp.add.at(sum_x_gpu, indices_gpu, data_gpu)
+
+            # Clean up VRAM
+            del data_gpu, indices_gpu, indptr_gpu, row_indices, sf_chunk, cell_boundary_markers
+            cp.get_default_memory_pool().free_all_blocks()
 
-    print(f"Phase [1/
+    print(f"Phase [1/3]: COMPLETE{' '*50}")
 
-
-
-
-
+    # 4. Finalize Basic Statistics
+    # Var(X) = E[X^2] - (E[X])^2
+    mean_x_sq_gpu = sum_x_sq_gpu / nc
+    mean_mu_gpu = sum_x_gpu / nc
+    my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2
 
-
-
-    print("Phase [2/4]: COMPLETE")
+    # Dispersion = Mean^2 / (Var - Mean)
+    size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)
 
-
-
-
-
-
+    # Safety Clamping
+    max_size_val = cp.nanmax(size_gpu) * 10
+    if cp.isnan(max_size_val): max_size_val = 1000
+    size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
+    size_gpu[size_gpu < 1e-10] = 1e-10
 
-
-
+    # Construct "Basic Fit" Object
+    fit_basic = {
+        'sizes': pd.Series(size_gpu.get(), index=stats['tjs'].index),
         'vals': stats,
-        'var_obs':
+        'var_obs': pd.Series(my_rowvar_gpu.get(), index=stats['tjs'].index)
     }
-
-
+
+    # --- Phase 2: Check Fit (Calculate Errors) ---
+    print("Phase [2/3]: Evaluating fit errors on ORIGINAL data...")
+
+    # Check Adjust (M3Drop) - uses its own governor
+    check_adjust = NBumiCheckFitFSGPU(
+        cleaned_filename, fit_adjust, suppress_plot=True
+    )
+
+    # Check Basic (Depth-Norm) - uses its own governor
+    check_basic = NBumiCheckFitFSGPU(
+        cleaned_filename, fit_basic, suppress_plot=True
+    )
+    print("Phase [2/3]: COMPLETE")
 
-
+    # --- Phase 3: Plotting & Comparison ---
+    print("Phase [3/3]: Generating comparison...")
+
     nc_data = stats['nc']
     mean_expr = stats['tjs'] / nc_data
     observed_dropout = stats['djs'] / nc_data
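Note: two techniques carry the new Phase 1 above: expanding the CSR indptr into a per-nonzero row index via a cumulative sum over "row start" markers, and accumulating per-gene sums of x and x^2 so that Var(X) = E[X^2] - (E[X])^2 and the dispersion fall out without ever materializing a normalized matrix. A self-contained NumPy sketch of the same arithmetic (a CPU stand-in for the CuPy code; the toy matrix and variable names are illustrative):

import numpy as np
from scipy.sparse import csr_matrix

# Toy cells-x-genes counts. Every row has at least one nonzero, which the
# cumsum trick below assumes; np.repeat(np.arange(nc), np.diff(X.indptr))
# is the fully general CPU equivalent.
X = csr_matrix(np.array([
    [4, 0, 1, 0, 0],
    [0, 2, 0, 0, 3],
    [1, 1, 0, 5, 0],
    [0, 0, 2, 0, 2],
], dtype=np.float64))
nc, ng = X.shape

# Size factors: per-cell depth over the median nonzero depth.
tis = np.asarray(X.sum(axis=1)).ravel()
size_factors = np.ones_like(tis)
size_factors[tis > 0] = tis[tis > 0] / np.median(tis[tis > 0])

# CSR row-expansion trick: mark the first nonzero of each row, cumsum -> row ids.
markers = np.zeros(X.nnz, dtype=np.int64)
markers[X.indptr[:-1]] = 1
row_indices = np.cumsum(markers) - 1            # maps each nonzero to its cell

# On-the-fly normalization + streaming moments (no normalized matrix stored).
data_norm = X.data / size_factors[row_indices]
sum_x = np.zeros(ng)
sum_x_sq = np.zeros(ng)
np.add.at(sum_x, X.indices, data_norm)          # zeros contribute nothing
np.add.at(sum_x_sq, X.indices, data_norm ** 2)

mean_x = sum_x / nc
var_x = sum_x_sq / nc - mean_x ** 2             # Var(X) = E[X^2] - (E[X])^2

# NB dispersion ("size"), same shape as the diff's formula; clamping omitted.
with np.errstate(divide='ignore', invalid='ignore'):
    size = mean_x ** 2 / (var_x - mean_x)

# Cross-check the streaming variance against a dense computation.
dense_norm = X.toarray() / size_factors[:, None]
assert np.allclose(var_x, dense_norm.var(axis=0))
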
@@ -432,15 +408,8 @@ def NBumiCompareModelsGPU(
     plt.show()
 
     plt.close()
-    print("Phase [4/4]: COMPLETE")
-
-    pipeline_end_time = time.time()
 
-
-    adata_meta.file.close()  # Explicitly close the file handle
-
-    os.remove(basic_norm_filename)
-    print(f"STATUS: Temporary file '{basic_norm_filename}' removed.")
+    pipeline_end_time = time.time()
     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")
 
     return {
@@ -455,11 +424,6 @@ def NBumiPlotDispVsMeanGPU(
 ):
     """
     Generates a diagnostic plot of the dispersion vs. mean expression.
-
-    Args:
-        fit (dict): The 'fit' object from NBumiFitModelGPU.
-        suppress_plot (bool): If True, the plot will not be displayed on screen.
-        plot_filename (str, optional): Path to save the plot. If None, not saved.
     """
     print("FUNCTION: NBumiPlotDispVsMean()")
 
@@ -470,7 +434,6 @@ def NBumiPlotDispVsMeanGPU(
     intercept, slope = coeffs[0], coeffs[1]
 
     # --- 2. Calculate the fitted line for plotting ---
-    # Create a smooth, continuous line using the regression coefficients
     log_mean_expr_range = np.linspace(
         np.log(mean_expression[mean_expression > 0].min()),
         np.log(mean_expression.max()),
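Note: for context on the surviving lines above, the plot fits log(dispersion) against log(mean expression) (coeffs[0] is the intercept, coeffs[1] the slope, matching numpy's polynomial coefficient ordering) and then evaluates the fitted power law over a log-spaced range. A minimal, self-contained sketch with synthetic data (everything here is illustrative, not the module's code):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(1)
mean_expression = rng.lognormal(0.0, 1.0, 500)
dispersion = 0.5 * mean_expression ** 0.3 * rng.lognormal(0.0, 0.2, 500)

# Fit log(dispersion) ~ intercept + slope * log(mean); this polyfit returns
# coefficients lowest-degree first, hence coeffs[0]=intercept, coeffs[1]=slope.
coeffs = np.polynomial.polynomial.polyfit(np.log(mean_expression), np.log(dispersion), 1)
intercept, slope = coeffs[0], coeffs[1]

log_mean_expr_range = np.linspace(
    np.log(mean_expression[mean_expression > 0].min()),
    np.log(mean_expression.max()),
    100,
)
fitted = np.exp(intercept + slope * log_mean_expr_range)

plt.scatter(mean_expression, dispersion, s=4, alpha=0.4)
plt.plot(np.exp(log_mean_expr_range), fitted, color='red')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('mean expression')
plt.ylabel('dispersion')
plt.show()
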
setup.py

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setuptools.setup(
     name="M3Drop",
-    version="0.4.35",
+    version="0.4.37",  # Version bump
     author="Tallulah Andrews",
     author_email="tandrew6@uwo.ca",
     description="A Python implementation of the M3Drop single-cell RNA-seq analysis tool.",