M3Drop 0.4.35.tar.gz → 0.4.37.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: M3Drop
-Version: 0.4.35
+Version: 0.4.37
 Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
 Home-page: https://github.com/PragalvhaSharma/m3DropNew
 Author: Tallulah Andrews
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: M3Drop
-Version: 0.4.35
+Version: 0.4.37
 Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
 Home-page: https://github.com/PragalvhaSharma/m3DropNew
 Author: Tallulah Andrews
@@ -10,17 +10,14 @@ import psutil
 import gc
 from scipy import sparse
 from scipy import stats
-import anndata # <--- FIXED: Added missing import
+import anndata

-# [GOVERNOR INTEGRATION] Added get_optimal_chunk_size
+# [GOVERNOR INTEGRATION]
 from .coreGPU import hidden_calc_valsGPU, NBumiFitModelGPU, NBumiFitDispVsMeanGPU, get_optimal_chunk_size
 from cupy.sparse import csr_matrix as cp_csr_matrix
 import scipy.sparse as sp
 from scipy.sparse import csr_matrix as sp_csr_matrix
-
 import statsmodels.api as sm
-from scipy.stats import norm
-from statsmodels.stats.multitest import multipletests

 def NBumiFitBasicModelGPU(
     cleaned_filename: str,
@@ -140,7 +137,7 @@ def NBumiCheckFitFSGPU(
     plot_filename=None
 ) -> dict:
     """
-    FIXED VERSION - No cupy.errstate, proper GPU computation.
+    Calculates the fit errors (gene_error, cell_error) for a given model.
     """
     start_time = time.perf_counter()
     print(f"FUNCTION: NBumiCheckFitFS() | FILE: {cleaned_filename}")
@@ -171,14 +168,10 @@ def NBumiCheckFitFSGPU(
     print("Phase [1/2]: COMPLETE")

     # --- Phase 2: Calculate Expected Dropouts ---
-    print("Phase [2/2]: Calculating expected dropouts from data chunks...")
-
-    # [GOVERNOR INTEGRATION] Removed naive calculation, utilizing Governor's chunk_size
-    optimal_chunk = chunk_size
-    print(f" Using governor chunk size: {optimal_chunk}")
+    print(f"Phase [2/2]: Calculating expected dropouts (Chunk: {chunk_size})...")

-    for i in range(0, nc, optimal_chunk):
-        end_col = min(i + optimal_chunk, nc)
+    for i in range(0, nc, chunk_size):
+        end_col = min(i + chunk_size, nc)
         print(f"Phase [2/2]: Processing: {end_col} of {nc} cells.", end='\r')

         tis_chunk_gpu = tis_gpu[i:end_col]
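The loop entered above accumulates expected dropouts under the NBumi negative-binomial model: with mu_ij = tjs_j * tis_i / total, the zero probability is P(X_ij = 0) = (size_j / (size_j + mu_ij))^size_j. A minimal NumPy sketch of one chunk of that computation (toy values; variable names are illustrative, not taken from the package):

import numpy as np

def expected_dropouts_chunk(tjs, tis_chunk, total, sizes):
    # mu_ij = gene total x cell total / grand total, for one chunk of cells
    mu_chunk = np.outer(tis_chunk, tjs) / total      # (cells, genes)
    base = sizes / (sizes + mu_chunk)                # broadcast over genes
    return base ** sizes                             # P(X == 0) per entry

tjs = np.array([50., 80., 20., 10.])                 # per-gene totals
tis_chunk = np.array([40., 60., 60.])                # per-cell totals (one chunk)
p0 = expected_dropouts_chunk(tjs, tis_chunk, tjs.sum(), np.full(4, 2.0))
print(p0.shape)                                      # (3, 4)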
@@ -199,11 +192,7 @@

         # Clean up
         del mu_chunk_gpu, p_is_chunk_gpu, base, tis_chunk_gpu
-
-        # Periodic memory cleanup
-        mempool = cp.get_default_memory_pool()
-        if (i // optimal_chunk) % 10 == 0:
-            mempool.free_all_blocks()
+        cp.get_default_memory_pool().free_all_blocks()

     print(f"Phase [2/2]: COMPLETE{' ' * 50}")

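The replacement frees CuPy's default memory pool after every chunk instead of every tenth one: free_all_blocks() returns cached blocks to the CUDA driver, trading a little allocation speed for a lower VRAM floor between chunks. A small sketch of the pool behaviour (assumes a CUDA-capable GPU with CuPy installed):

import cupy as cp

pool = cp.get_default_memory_pool()

x = cp.random.rand(1024, 1024, dtype=cp.float32)
print(pool.used_bytes(), pool.total_bytes())  # bytes in use vs. bytes cached

del x                                         # the array returns to the pool:
print(pool.used_bytes(), pool.total_bytes())  # used drops, cache persists

pool.free_all_blocks()                        # hand cached blocks back to the driver
print(pool.total_bytes())                     # 0: nothing cached any more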
@@ -256,7 +245,7 @@ def NBumiCheckFitFSGPU(
     }

 def NBumiCompareModelsGPU(
-    raw_filename: str,
+    raw_filename: str, # Kept for API compatibility, but functionally we use cleaned_filename for indices
     cleaned_filename: str,
     stats: dict,
     fit_adjust: dict,
@@ -265,131 +254,118 @@ def NBumiCompareModelsGPU(
     plot_filename=None
 ) -> dict:
     """
-    OPTIMIZED VERSION - Faster normalization and sparse matrix writing.
+    OPTIMIZED VERSION (IN-MEMORY):
+    - Eliminates the 46GB '_basic_norm.h5ad' temporary file.
+    - Performs depth normalization and variance calculation on-the-fly in GPU VRAM.
+    - PRESERVED SCIENTIFIC LOGIC: Var(X) = E[X^2] - (E[X])^2 on normalized data.
     """
     pipeline_start_time = time.time()
     print(f"FUNCTION: NBumiCompareModels() | Comparing models for {cleaned_filename}")

-    # [GOVERNOR INTEGRATION] Calculate chunk size for normalization phase (heavy IO)
+    # [GOVERNOR] High multiplier (12.0) because we hold Raw + Norm + Square in VRAM
     if chunk_size is None:
-        # Multiplier 10.0 for safety during normalization of massive dense expansion
-        chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=10.0, is_dense=True)
-
-    # --- Phase 1: OPTIMIZED Normalization ---
-    print("Phase [1/4]: Creating temporary 'basic' normalized data file...")
-    basic_norm_filename = cleaned_filename.replace('.h5ad', '_basic_norm.h5ad')
-
-    # Read metadata. In 'backed' mode, this keeps a file handle open.
-    adata_meta = anndata.read_h5ad(cleaned_filename, backed='r')
-    nc, ng = adata_meta.shape
-    obs_df = adata_meta.obs.copy()
-    var_df = adata_meta.var.copy()
+        chunk_size = get_optimal_chunk_size(cleaned_filename, multiplier=12.0, is_dense=False)
+
+    # --- Phase 1: In-Memory "Basic Fit" (Normalization + Variance) ---
+    print("Phase [1/3]: Calculating Basic Model (Depth-Normalized) variance on-the-fly...")

-    cell_sums = stats['tis'].values
-    median_sum = np.median(cell_sums[cell_sums > 0])
+    # 1. Prepare Size Factors (CPU)
+    tjs = stats['tjs'].values # Gene sums
+    tis = stats['tis'].values # Cell sums
+    nc, ng = stats['nc'], stats['ng']

-    # Avoid division by zero for cells with zero counts
-    size_factors = np.ones_like(cell_sums, dtype=np.float32)
-    non_zero_mask = cell_sums > 0
-    size_factors[non_zero_mask] = cell_sums[non_zero_mask] / median_sum
-
-    adata_out = anndata.AnnData(obs=obs_df, var=var_df)
-    adata_out.write_h5ad(basic_norm_filename, compression="gzip")
-
-    with h5py.File(basic_norm_filename, 'a') as f_out:
-        if 'X' in f_out:
-            del f_out['X']
-        x_group_out = f_out.create_group('X')
-        x_group_out.attrs['encoding-type'] = 'csr_matrix'
-        x_group_out.attrs['encoding-version'] = '0.1.0'
-        x_group_out.attrs['shape'] = np.array([nc, ng], dtype='int64')
-
-        out_data = x_group_out.create_dataset('data', shape=(0,), maxshape=(None,), dtype='float32')
-        out_indices = x_group_out.create_dataset('indices', shape=(0,), maxshape=(None,), dtype='int32')
-        out_indptr = x_group_out.create_dataset('indptr', shape=(nc + 1,), dtype='int64')
-        out_indptr[0] = 0
-        current_nnz = 0
-
-        with h5py.File(cleaned_filename, 'r') as f_in:
-            h5_indptr = f_in['X']['indptr']
-            h5_data = f_in['X']['data']
-            h5_indices = f_in['X']['indices']
-
-            for i in range(0, nc, chunk_size):
-                end_row = min(i + chunk_size, nc)
-                print(f"Phase [1/4]: Normalizing: {end_row} of {nc} cells.", end='\r')
-
-                start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
-                if start_idx == end_idx:
-                    out_indptr[i + 1 : end_row + 1] = current_nnz
-                    continue
-
-                # Read data for the chunk
-                data_slice = h5_data[start_idx:end_idx]
-                indices_slice = h5_indices[start_idx:end_idx]
-                indptr_slice = h5_indptr[i:end_row + 1] - start_idx
-
-                # Move to GPU for fast normalization
-                data_gpu = cp.asarray(data_slice.copy(), dtype=cp.float32)
-
-                indptr_gpu = cp.asarray(indptr_slice.copy())
-                nnz_in_chunk = indptr_gpu[-1].item()
-                cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
-                if len(indptr_gpu) > 1:
-                    cell_boundary_markers[indptr_gpu[:-1]] = 1
-                row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
-
-                size_factors_for_chunk = cp.asarray(size_factors[i:end_row])
-
-                data_gpu /= size_factors_for_chunk[row_indices]
-
-                data_cpu = np.round(data_gpu.get())
-
-                num_cells_in_chunk = end_row - i
-                chunk_sp = sp_csr_matrix((data_cpu, indices_slice, indptr_slice),
-                                         shape=(num_cells_in_chunk, ng))
-
-                nnz_chunk = chunk_sp.nnz
-                out_data.resize(current_nnz + nnz_chunk, axis=0)
-                out_data[current_nnz:] = chunk_sp.data
-
-                out_indices.resize(current_nnz + nnz_chunk, axis=0)
-                out_indices[current_nnz:] = chunk_sp.indices
-
-                new_indptr_list = chunk_sp.indptr[1:].astype(np.int64) + current_nnz
-                out_indptr[i + 1 : end_row + 1] = new_indptr_list
-
-                current_nnz += nnz_chunk
+    median_sum = np.median(tis[tis > 0])
+    size_factors = np.ones_like(tis, dtype=np.float32)
+    non_zero_mask = tis > 0
+    size_factors[non_zero_mask] = tis[non_zero_mask] / median_sum
+
+    # 2. Prepare GPU Arrays
+    sum_x_sq_gpu = cp.zeros(ng, dtype=cp.float64)
+    sum_x_gpu = cp.zeros(ng, dtype=cp.float64) # Need sum(x) to calc mean(x) for variance
+
+    # 3. GPU Loop (Raw Data -> Normalize -> Accumulate)
+    # CRITICAL: We read CLEANED_FILENAME to ensure indices match 'stats'
+    with h5py.File(cleaned_filename, 'r') as f_in:
+        h5_indptr = f_in['X']['indptr']
+        h5_data = f_in['X']['data']
+        h5_indices = f_in['X']['indices']

-                del data_gpu, row_indices, size_factors_for_chunk, indptr_gpu
-                cp.get_default_memory_pool().free_all_blocks()
+        for i in range(0, nc, chunk_size):
+            end_row = min(i + chunk_size, nc)
+            print(f"Phase [1/3]: Processing: {end_row} of {nc} cells.", end='\r')
+
+            start_idx, end_idx = h5_indptr[i], h5_indptr[end_row]
+            if start_idx == end_idx: continue
+
+            # Load Raw Chunk
+            data_gpu = cp.asarray(h5_data[start_idx:end_idx], dtype=cp.float32)
+            indices_gpu = cp.asarray(h5_indices[start_idx:end_idx])
+            indptr_gpu = cp.asarray(h5_indptr[i:end_row + 1] - start_idx)
+
+            # Expand Size Factors to match Data Structure
+            nnz_in_chunk = indptr_gpu[-1].item()
+            cell_boundary_markers = cp.zeros(nnz_in_chunk, dtype=cp.int32)
+            if len(indptr_gpu) > 1:
+                cell_boundary_markers[indptr_gpu[:-1]] = 1
+            # row_indices maps every data point to its cell index (0 to chunk_size)
+            row_indices = cp.cumsum(cell_boundary_markers, axis=0) - 1
+
+            # Get size factors for this chunk
+            sf_chunk = cp.asarray(size_factors[i:end_row])
+
+            # --- THE MAGIC: On-the-Fly Normalization ---
+            # data_norm = data_raw / size_factor
+            data_gpu /= sf_chunk[row_indices]
+
+            # Accumulate for Variance: E[X^2] and E[X]
+            cp.add.at(sum_x_sq_gpu, indices_gpu, data_gpu**2)
+            cp.add.at(sum_x_gpu, indices_gpu, data_gpu)
+
+            # Clean up VRAM
+            del data_gpu, indices_gpu, indptr_gpu, row_indices, sf_chunk, cell_boundary_markers
+            cp.get_default_memory_pool().free_all_blocks()

-    print(f"Phase [1/4]: COMPLETE{' '*50}")
+    print(f"Phase [1/3]: COMPLETE{' '*50}")

-    print("Phase [2/4]: Fitting Basic Model on normalized data...")
-
-    # [GOVERNOR INTEGRATION] Calculate chunk size for basic fit on the heavy normalized file
-    chunk_size_basic = get_optimal_chunk_size(basic_norm_filename, multiplier=10.0, is_dense=True)
+    # 4. Finalize Basic Statistics
+    # Var(X) = E[X^2] - (E[X])^2
+    mean_x_sq_gpu = sum_x_sq_gpu / nc
+    mean_mu_gpu = sum_x_gpu / nc
+    my_rowvar_gpu = mean_x_sq_gpu - mean_mu_gpu**2

-    stats_basic = hidden_calc_valsGPU(basic_norm_filename) # hidden_calc uses its own governor internally
-    fit_basic = NBumiFitBasicModelGPU(basic_norm_filename, stats_basic, chunk_size=chunk_size_basic)
-    print("Phase [2/4]: COMPLETE")
+    # Dispersion = Mean^2 / (Var - Mean)
+    size_gpu = mean_mu_gpu**2 / (my_rowvar_gpu - mean_mu_gpu)

-    print("Phase [3/4]: Evaluating fits of both models on ORIGINAL data...")
-    # [GOVERNOR INTEGRATION] Chunk size for check fit
-    chunk_size_check = get_optimal_chunk_size(cleaned_filename, multiplier=5.0, is_dense=True)
-
-    check_adjust = NBumiCheckFitFSGPU(cleaned_filename, fit_adjust, suppress_plot=True, chunk_size=chunk_size_check)
+    # Safety Clamping
+    max_size_val = cp.nanmax(size_gpu) * 10
+    if cp.isnan(max_size_val): max_size_val = 1000
+    size_gpu[cp.isnan(size_gpu) | (size_gpu <= 0)] = max_size_val
+    size_gpu[size_gpu < 1e-10] = 1e-10

-    fit_basic_for_eval = {
-        'sizes': fit_basic['sizes'],
+    # Construct "Basic Fit" Object
+    fit_basic = {
+        'sizes': pd.Series(size_gpu.get(), index=stats['tjs'].index),
         'vals': stats,
-        'var_obs': fit_basic['var_obs']
+        'var_obs': pd.Series(my_rowvar_gpu.get(), index=stats['tjs'].index)
     }
-    check_basic = NBumiCheckFitFSGPU(cleaned_filename, fit_basic_for_eval, suppress_plot=True, chunk_size=chunk_size_check)
-    print("Phase [3/4]: COMPLETE")
+
+    # --- Phase 2: Check Fit (Calculate Errors) ---
+    print("Phase [2/3]: Evaluating fit errors on ORIGINAL data...")
+
+    # Check Adjust (M3Drop) - uses its own governor
+    check_adjust = NBumiCheckFitFSGPU(
+        cleaned_filename, fit_adjust, suppress_plot=True
+    )
+
+    # Check Basic (Depth-Norm) - uses its own governor
+    check_basic = NBumiCheckFitFSGPU(
+        cleaned_filename, fit_basic, suppress_plot=True
+    )
+    print("Phase [2/3]: COMPLETE")

-    print("Phase [4/4]: Generating final comparison...")
+    # --- Phase 3: Plotting & Comparison ---
+    print("Phase [3/3]: Generating comparison...")
+
     nc_data = stats['nc']
     mean_expr = stats['tjs'] / nc_data
     observed_dropout = stats['djs'] / nc_data
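Two techniques carry this rewrite. First, the CSR "boundary marker" trick expands indptr into a per-nonzero row index with a single cumulative sum, so one vectorized division depth-normalizes a whole chunk. Second, because zeros contribute nothing to either sum, scatter-adding the stored entries into per-gene accumulators and dividing by the full cell count nc yields E[X] and E[X^2] over all cells, so Var(X) = E[X^2] - (E[X])^2 needs no temporary file. A self-contained NumPy check of both (toy data; the same calls exist in CuPy as cp.cumsum and cp.add.at, so the pattern transfers to the GPU loop; note the marker trick assumes every cell in the chunk has at least one stored count):

import numpy as np
from scipy.sparse import csr_matrix

# Toy CSR chunk: 4 cells x 6 genes, every cell has at least one count.
dense = np.array([[0, 2, 0, 1, 0, 0],
                  [3, 0, 0, 0, 4, 0],
                  [0, 0, 5, 0, 0, 1],
                  [2, 0, 0, 0, 0, 3]], dtype=np.float32)
chunk = csr_matrix(dense)
data, indices, indptr = chunk.data, chunk.indices, chunk.indptr
nc, ng = chunk.shape

# Trick 1: mark the first stored position of each row; cumsum - 1 then
# maps every stored value to its row (cell) index.
markers = np.zeros(len(data), dtype=np.int32)
markers[indptr[:-1]] = 1
row_indices = np.cumsum(markers) - 1
assert np.array_equal(row_indices, np.repeat(np.arange(nc), np.diff(indptr)))

# Vectorized per-cell depth normalization, as in the diff.
size_factors = np.array([1.0, 0.5, 2.0, 1.5], dtype=np.float32)
data_norm = data.astype(np.float64) / size_factors[row_indices]

# Trick 2: streaming per-gene moments over stored entries only.
sum_x = np.zeros(ng)
sum_x_sq = np.zeros(ng)
np.add.at(sum_x, indices, data_norm)
np.add.at(sum_x_sq, indices, data_norm ** 2)
mean = sum_x / nc
var = sum_x_sq / nc - mean ** 2              # Var(X) = E[X^2] - (E[X])^2

# Method-of-moments NB dispersion, as in the finalize step.
with np.errstate(divide='ignore', invalid='ignore'):
    size = mean ** 2 / (var - mean)

# Agreement with the dense equivalent confirms the streaming identity.
dense_norm = dense.astype(np.float64) / size_factors[:, None]
assert np.allclose(mean, dense_norm.mean(axis=0))
assert np.allclose(var, dense_norm.var(axis=0))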
@@ -432,15 +408,8 @@ def NBumiCompareModelsGPU(
         plt.show()

     plt.close()
-    print("Phase [4/4]: COMPLETE")
-
-    pipeline_end_time = time.time()

-    # --- ADD THIS LINE TO FIX THE ERROR ---
-    adata_meta.file.close() # Explicitly close the file handle
-
-    os.remove(basic_norm_filename)
-    print(f"STATUS: Temporary file '{basic_norm_filename}' removed.")
+    pipeline_end_time = time.time()
     print(f"Total time: {pipeline_end_time - pipeline_start_time:.2f} seconds.\n")

     return {
@@ -455,11 +424,6 @@ def NBumiPlotDispVsMeanGPU(
 ):
     """
     Generates a diagnostic plot of the dispersion vs. mean expression.
-
-    Args:
-        fit (dict): The 'fit' object from NBumiFitModelGPU.
-        suppress_plot (bool): If True, the plot will not be displayed on screen.
-        plot_filename (str, optional): Path to save the plot. If None, not saved.
     """
     print("FUNCTION: NBumiPlotDispVsMean()")

@@ -470,7 +434,6 @@ def NBumiPlotDispVsMeanGPU(
     intercept, slope = coeffs[0], coeffs[1]

     # --- 2. Calculate the fitted line for plotting ---
-    # Create a smooth, continuous line using the regression coefficients
     log_mean_expr_range = np.linspace(
         np.log(mean_expression[mean_expression > 0].min()),
         np.log(mean_expression.max()),
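The removed comment described what the surrounding code still does: turn the regression coefficients into a smooth fitted curve in the original (non-log) coordinates. A toy end-to-end sketch of that plotting pattern (synthetic data; the package obtains coeffs from NBumiFitDispVsMeanGPU rather than the polyfit used here):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
mean_expression = rng.lognormal(0.0, 1.0, 500)
sizes = np.exp(0.5 + 0.8 * np.log(mean_expression)) * rng.lognormal(0.0, 0.2, 500)

# Fit log(size) ~ intercept + slope * log(mean); coefficient order
# (constant first) matches the diff's `intercept, slope = coeffs[0], coeffs[1]`.
coeffs = np.polynomial.polynomial.polyfit(np.log(mean_expression), np.log(sizes), 1)
intercept, slope = coeffs[0], coeffs[1]

# Smooth fitted line across the observed range, mapped back via exp().
log_range = np.linspace(np.log(mean_expression.min()), np.log(mean_expression.max()), 100)
fitted = np.exp(intercept + slope * log_range)

plt.scatter(mean_expression, sizes, s=5, alpha=0.4)
plt.plot(np.exp(log_range), fitted, color='red')
plt.xscale('log'); plt.yscale('log')
plt.xlabel('Mean expression'); plt.ylabel('Dispersion (size)')
plt.show()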
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

 setuptools.setup(
     name="M3Drop",
-    version="0.4.35", # Version bump
+    version="0.4.37", # Version bump
     author="Tallulah Andrews",
     author_email="tandrew6@uwo.ca",
     description="A Python implementation of the M3Drop single-cell RNA-seq analysis tool.",
7 files without changes