M3Drop 0.4.48__py3-none-any.whl → 0.4.50__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -122,9 +122,17 @@ def NBumiPearsonResidualsCombinedCPU(
122
122
  adata_out_approx = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
123
123
  adata_out_approx.write_h5ad(output_filename_approx, compression=None)
124
124
 
125
+ # --- CHUNK SIZE FIX ---
125
126
  # Calculate appropriate H5 storage chunks
126
127
  storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
127
- if storage_chunk_rows < 1: storage_chunk_rows = 1
128
+
129
+ # [CRITICAL FIX] Clamp chunk size to total rows (nc)
130
+ if storage_chunk_rows > nc:
131
+ storage_chunk_rows = nc
132
+
133
+ if storage_chunk_rows < 1:
134
+ storage_chunk_rows = 1
135
+ # ----------------------
128
136
 
129
137
  # Open both files for writing simultaneously
130
138
  with h5py.File(output_filename_full, 'a') as f_full, h5py.File(output_filename_approx, 'a') as f_approx:
@@ -6,10 +6,13 @@ import h5py
6
6
  import anndata
7
7
  import pandas as pd
8
8
  import os
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
9
11
 
10
12
  try:
11
13
  import cupy
12
14
  from cupy.sparse import csr_matrix as cp_csr_matrix
15
+ import cupyx
13
16
  HAS_GPU = True
14
17
  except ImportError:
15
18
  cupy = None
@@ -19,7 +22,6 @@ except ImportError:
19
22
  try:
20
23
  from .ControlDeviceGPU import ControlDevice
21
24
  except ImportError:
22
- # Fallback for direct script execution (debugging)
23
25
  try:
24
26
  from ControlDeviceGPU import ControlDevice
25
27
  except ImportError:
@@ -58,11 +60,14 @@ def NBumiPearsonResidualsCombinedGPU(
58
60
  stats_filename: str,
59
61
  output_filename_full: str,
60
62
  output_filename_approx: str,
63
+ plot_summary_filename: str = None,
64
+ plot_detail_filename: str = None,
61
65
  mode: str = "auto",
62
66
  manual_target: int = 3000
63
67
  ):
64
68
  """
65
- UPGRADED: Calculates Full and Approximate residuals in a SINGLE PASS.
69
+ Calculates Full and Approximate residuals in a SINGLE PASS.
70
+ Includes "Sidecar" Visualization logic (Streaming Stats + Subsampling).
66
71
  """
67
72
  start_time = time.perf_counter()
68
73
  print(f"FUNCTION: NBumiPearsonResidualsCombined() | FILE: {raw_filename}")
@@ -73,22 +78,22 @@ def NBumiPearsonResidualsCombinedGPU(
73
78
  ng_filtered = int(cupy.sum(mask_gpu))
74
79
 
75
80
  # 2. Manual Init
76
- with h5py.File(raw_filename, 'r') as f: indptr_cpu = f['X']['indptr'][:]; total_rows = len(indptr_cpu) - 1
81
+ with h5py.File(raw_filename, 'r') as f:
82
+ indptr_cpu = f['X']['indptr'][:]
83
+ total_rows = len(indptr_cpu) - 1
84
+
77
85
  device = ControlDevice(indptr=indptr_cpu, total_rows=total_rows, n_genes=ng_filtered, mode=mode, manual_target=manual_target)
78
86
  nc = device.total_rows
79
87
 
80
88
  print("Phase [1/2]: Initializing parameters...")
81
- # Load parameters for both calculations
89
+ # Load parameters
82
90
  with open(fit_filename, 'rb') as f: fit = pickle.load(f)
83
- with open(stats_filename, 'rb') as f: stats = pickle.load(f)
84
91
 
85
92
  # Common params
86
93
  total = fit['vals']['total']
87
94
  tjs_gpu = cupy.asarray(fit['vals']['tjs'].values, dtype=cupy.float64)
88
95
  tis_gpu = cupy.asarray(fit['vals']['tis'].values, dtype=cupy.float64)
89
-
90
- # Specific params
91
- sizes_gpu = cupy.asarray(fit['sizes'].values, dtype=cupy.float64) # For Full
96
+ sizes_gpu = cupy.asarray(fit['sizes'].values, dtype=cupy.float64)
92
97
 
93
98
  # Setup Output Files
94
99
  adata_in = anndata.read_h5ad(raw_filename, backed='r')
@@ -101,20 +106,46 @@ def NBumiPearsonResidualsCombinedGPU(
101
106
  adata_out_approx = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
102
107
  adata_out_approx.write_h5ad(output_filename_approx, compression=None)
103
108
 
109
+ # --- VISUALIZATION SETUP (THE SIDECAR) ---
110
+ # 1. Sampling Rate (Target 5 Million Max)
111
+ TARGET_SAMPLES = 5_000_000
112
+ total_points = nc * ng_filtered
113
+
114
+ if total_points <= TARGET_SAMPLES:
115
+ sampling_rate = 1.0 # Take everything
116
+ else:
117
+ sampling_rate = TARGET_SAMPLES / total_points
118
+
119
+ print(f" > Visualization Sampling Rate: {sampling_rate*100:.4f}% (Target: {TARGET_SAMPLES:,} points)")
120
+
121
+ # 2. Accumulators for Plot 1 (Variance) - EXACT MATH
122
+ # We need Sum(x) and Sum(x^2) for: Raw, Approx, Full
123
+ acc_raw_sum = cupy.zeros(ng_filtered, dtype=cupy.float64)
124
+ # acc_raw_sq = cupy.zeros(ng_filtered, dtype=cupy.float64) # Not strictly needed for Mean X-axis, but good for completeness. Skipping to save VRAM.
125
+
126
+ acc_approx_sum = cupy.zeros(ng_filtered, dtype=cupy.float64)
127
+ acc_approx_sq = cupy.zeros(ng_filtered, dtype=cupy.float64)
128
+
129
+ acc_full_sum = cupy.zeros(ng_filtered, dtype=cupy.float64)
130
+ acc_full_sq = cupy.zeros(ng_filtered, dtype=cupy.float64)
131
+
132
+ # 3. Lists for Plots 2 & 3 (Scatter/KDE) - SAMPLED
133
+ viz_approx_samples = []
134
+ viz_full_samples = []
135
+ # -----------------------------------------
136
+
137
+ # Storage Chunk Calc
104
138
  storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
139
+ if storage_chunk_rows > nc: storage_chunk_rows = nc
105
140
  if storage_chunk_rows < 1: storage_chunk_rows = 1
106
141
 
107
- # Open both files for writing simultaneously
142
+ # Open files
108
143
  with h5py.File(output_filename_full, 'a') as f_full, h5py.File(output_filename_approx, 'a') as f_approx:
109
144
  if 'X' in f_full: del f_full['X']
110
145
  if 'X' in f_approx: del f_approx['X']
111
146
 
112
- out_x_full = f_full.create_dataset(
113
- 'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
114
- )
115
- out_x_approx = f_approx.create_dataset(
116
- 'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
117
- )
147
+ out_x_full = f_full.create_dataset('X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64')
148
+ out_x_approx = f_approx.create_dataset('X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64')
118
149
 
119
150
  with h5py.File(raw_filename, 'r') as f_in:
120
151
  h5_indptr = f_in['X']['indptr']
@@ -123,7 +154,8 @@ def NBumiPearsonResidualsCombinedGPU(
123
154
 
124
155
  current_row = 0
125
156
  while current_row < nc:
126
- end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=3.0) # Higher overhead for double write
157
+ # [SAFE MODE] Multiplier 3.0 is safe for Index Sampling
158
+ end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=3.0)
127
159
  if end_row is None or end_row <= current_row: break
128
160
 
129
161
  chunk_size = end_row - current_row
@@ -131,7 +163,7 @@ def NBumiPearsonResidualsCombinedGPU(
131
163
 
132
164
  start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
133
165
 
134
- # Load & Filter
166
+ # Load Raw
135
167
  data_gpu_raw = cupy.asarray(h5_data[start_idx:end_idx], dtype=cupy.float64)
136
168
  indices_gpu_raw = cupy.asarray(h5_indices[start_idx:end_idx])
137
169
  indptr_gpu_raw = cupy.asarray(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
@@ -145,7 +177,23 @@ def NBumiPearsonResidualsCombinedGPU(
145
177
  del chunk_gpu, data_gpu_raw, indices_gpu_raw, indptr_gpu_raw
146
178
  cupy.get_default_memory_pool().free_all_blocks()
147
179
 
148
- # --- CALC 1: APPROX (Cheaper, do first) ---
180
+ # --- VIZ ACCUMULATION 1: RAW MEAN ---
181
+ # Add raw sums to accumulator (column-wise sum)
182
+ acc_raw_sum += cupy.sum(counts_dense, axis=0)
183
+
184
+ # --- VIZ SAMPLING: GENERATE INDICES ---
185
+ # We pick indices NOW so we can grab the same points from both Approx and Full
186
+ chunk_total_items = chunk_size * ng_filtered
187
+ n_samples_chunk = int(chunk_total_items * sampling_rate)
188
+
189
+ if n_samples_chunk > 0:
190
+ # Index Sampling: Zero VRAM overhead compared to Masking
191
+ # Use flatten indices
192
+ sample_indices = cupy.random.choice(chunk_total_items, size=n_samples_chunk, replace=False)
193
+ else:
194
+ sample_indices = None
195
+
196
+ # --- CALC 1: APPROX ---
149
197
  approx_out = cupy.empty_like(counts_dense)
150
198
  pearson_approx_kernel(
151
199
  counts_dense,
@@ -154,10 +202,22 @@ def NBumiPearsonResidualsCombinedGPU(
154
202
  total,
155
203
  approx_out
156
204
  )
205
+
206
+ # [VIZ UPDATE: APPROX]
207
+ acc_approx_sum += cupy.sum(approx_out, axis=0)
208
+ acc_approx_sq += cupy.sum(approx_out**2, axis=0)
209
+
210
+ if sample_indices is not None:
211
+ # Flatten temporarily to sample, then return to CPU
212
+ # Note: take() returns a new array, small size
213
+ sampled_vals = approx_out.ravel().take(sample_indices)
214
+ viz_approx_samples.append(cupy.asnumpy(sampled_vals))
215
+
216
+ # [DISK WRITE: APPROX]
157
217
  out_x_approx[current_row:end_row, :] = approx_out.get()
158
218
  del approx_out
159
219
 
160
- # --- CALC 2: FULL (In-place on counts_dense to save VRAM) ---
220
+ # --- CALC 2: FULL (In-place) ---
161
221
  pearson_residual_kernel(
162
222
  counts_dense,
163
223
  tjs_gpu,
@@ -166,13 +226,130 @@ def NBumiPearsonResidualsCombinedGPU(
166
226
  total,
167
227
  counts_dense # Overwrite input
168
228
  )
229
+
230
+ # [VIZ UPDATE: FULL]
231
+ acc_full_sum += cupy.sum(counts_dense, axis=0)
232
+ acc_full_sq += cupy.sum(counts_dense**2, axis=0)
233
+
234
+ if sample_indices is not None:
235
+ sampled_vals = counts_dense.ravel().take(sample_indices)
236
+ viz_full_samples.append(cupy.asnumpy(sampled_vals))
237
+
238
+ # [DISK WRITE: FULL]
169
239
  out_x_full[current_row:end_row, :] = counts_dense.get()
170
240
 
171
- del counts_dense
241
+ del counts_dense, sample_indices
172
242
  cupy.get_default_memory_pool().free_all_blocks()
173
243
  current_row = end_row
174
244
 
175
245
  print(f"\nPhase [2/2]: COMPLETE{' '*50}")
246
+
247
+ # ==========================================
248
+ # VIZ GENERATION (POST-PROCESS)
249
+ # ==========================================
250
+ if plot_summary_filename and plot_detail_filename:
251
+ print("Phase [Viz]: Generating Diagnostics...")
252
+
253
+ # 1. Finalize Variance Stats (GPU -> CPU)
254
+ # Var = E[X^2] - (E[X])^2
255
+ # Mean = Sum / N
256
+
257
+ # Pull everything to CPU once
258
+ raw_sum = cupy.asnumpy(acc_raw_sum)
259
+
260
+ approx_sum = cupy.asnumpy(acc_approx_sum)
261
+ approx_sq = cupy.asnumpy(acc_approx_sq)
262
+
263
+ full_sum = cupy.asnumpy(acc_full_sum)
264
+ full_sq = cupy.asnumpy(acc_full_sq)
265
+
266
+ # Calculate
267
+ mean_raw = raw_sum / nc
268
+
269
+ mean_approx = approx_sum / nc
270
+ mean_sq_approx = approx_sq / nc
271
+ var_approx = mean_sq_approx - (mean_approx**2)
272
+
273
+ mean_full = full_sum / nc
274
+ mean_sq_full = full_sq / nc
275
+ var_full = mean_sq_full - (mean_full**2)
276
+
277
+ # 2. Finalize Samples
278
+ if viz_approx_samples:
279
+ flat_approx = np.concatenate(viz_approx_samples)
280
+ flat_full = np.concatenate(viz_full_samples)
281
+ else:
282
+ flat_approx = np.array([])
283
+ flat_full = np.array([])
284
+
285
+ print(f" > Samples Collected: {len(flat_approx):,} points")
286
+
287
+ # --- FILE 1: SUMMARY (1080p) ---
288
+ print(f" > Saving Summary Plot: {plot_summary_filename}")
289
+ fig1, ax1 = plt.subplots(1, 2, figsize=(16, 7)) # 16x7 inches ~ 1080p aspect
290
+
291
+ # Plot 1: Variance Stabilization
292
+ ax = ax1[0]
293
+ ax.scatter(mean_raw, var_approx, s=2, alpha=0.5, color='red', label='Approx (Poisson)')
294
+ ax.scatter(mean_raw, var_full, s=2, alpha=0.5, color='blue', label='Full (NB Pearson)')
295
+ ax.axhline(1.0, color='black', linestyle='--', linewidth=1)
296
+ ax.set_xscale('log')
297
+ ax.set_yscale('log')
298
+ ax.set_title("Variance Stabilization Check")
299
+ ax.set_xlabel("Mean Raw Expression (log)")
300
+ ax.set_ylabel("Variance of Residuals (log)")
301
+ ax.legend()
302
+ ax.grid(True, alpha=0.3)
303
+ ax.text(0.5, -0.15, "Goal: Blue dots should form a flat line at y=1",
304
+ transform=ax.transAxes, ha='center', fontsize=9,
305
+ bbox=dict(facecolor='#f0f0f0', edgecolor='black', alpha=0.7))
306
+
307
+ # Plot 3: Distribution
308
+ ax = ax1[1]
309
+ if len(flat_approx) > 100:
310
+ # Clip for cleaner KDE
311
+ mask_kde = (flat_approx > -10) & (flat_approx < 10)
312
+ sns.kdeplot(flat_approx[mask_kde], fill=True, color='red', alpha=0.3, label='Approx', ax=ax, warn_singular=False)
313
+ sns.kdeplot(flat_full[mask_kde], fill=True, color='blue', alpha=0.3, label='Full', ax=ax, warn_singular=False)
314
+ ax.set_xlim(-5, 5)
315
+ ax.set_title("Distribution of Residuals")
316
+ ax.set_xlabel("Residual Value")
317
+ ax.legend()
318
+ ax.grid(True, alpha=0.3)
319
+ ax.text(0.5, -0.15, "Goal: Blue curve should be tighter (narrower) than Red",
320
+ transform=ax.transAxes, ha='center', fontsize=9,
321
+ bbox=dict(facecolor='#f0f0f0', edgecolor='black', alpha=0.7))
322
+
323
+ plt.tight_layout()
324
+ plt.savefig(plot_summary_filename, dpi=120) # 120 DPI * 16 inch = 1920 width
325
+ plt.close()
326
+
327
+ # --- FILE 2: DETAIL (4K) ---
328
+ print(f" > Saving Detail Plot: {plot_detail_filename}")
329
+ fig2, ax2 = plt.subplots(figsize=(20, 11)) # 20x11 inches ~ 4K aspect
330
+
331
+ if len(flat_approx) > 0:
332
+ ax2.scatter(flat_approx, flat_full, s=1, alpha=0.5, color='purple')
333
+
334
+ # Diagonal line
335
+ lims = [
336
+ np.min([ax2.get_xlim(), ax2.get_ylim()]),
337
+ np.max([ax2.get_xlim(), ax2.get_ylim()]),
338
+ ]
339
+ ax2.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
340
+
341
+ ax2.set_title("Residual Shrinkage (Sampled)")
342
+ ax2.set_xlabel("Approx Residuals")
343
+ ax2.set_ylabel("Full Residuals")
344
+ ax2.grid(True, alpha=0.3)
345
+ ax2.text(0.5, -0.1, "Goal: Points below diagonal = Dispersion Penalty Working",
346
+ transform=ax2.transAxes, ha='center', fontsize=12,
347
+ bbox=dict(facecolor='#f0f0f0', edgecolor='black', alpha=0.7))
348
+
349
+ plt.tight_layout()
350
+ plt.savefig(plot_detail_filename, dpi=200) # 200 DPI * 20 inch = 4000 width (4Kish)
351
+ plt.close()
352
+
176
353
 
177
354
  if hasattr(adata_in, "file") and adata_in.file is not None: adata_in.file.close()
178
355
  print(f"Total time: {time.perf_counter() - start_time:.2f} seconds.\n")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: M3Drop
3
- Version: 0.4.48
3
+ Version: 0.4.50
4
4
  Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
5
5
  Home-page: https://github.com/PragalvhaSharma/m3DropNew
6
6
  Author: Tallulah Andrews
@@ -4,11 +4,11 @@ m3Drop/CoreCPU.py,sha256=csRg5TLQx1Sup7k3lDJm9OO5Oe5-1aC3u_6ldE_GIX8,18679
4
4
  m3Drop/CoreGPU.py,sha256=6LToLuWyHxX_7sC2z0Xnvy_qqgmpew5DmnCV0PxmTZQ,19785
5
5
  m3Drop/DiagnosticsCPU.py,sha256=l0Imkh3F3zo4ovihUjx7cYWYgzPdztWCN1hcBFO43nY,12943
6
6
  m3Drop/DiagnosticsGPU.py,sha256=bsatHyHszgbufneeJvFvHBTLzDuY006nP2yHPHs8s7M,14389
7
- m3Drop/NormalizationCPU.py,sha256=Mm8VzWDu-NONbp-ngAt4PLjCKAGc7gJZKf-Yd-U95r0,7255
8
- m3Drop/NormalizationGPU.py,sha256=1XRDZhNVkIbQMv_ggNoNEnIxRMY1NHDjOtOq4QGVRwY,7011
7
+ m3Drop/NormalizationCPU.py,sha256=DmqvjcpHwkNZicEb2GBqTDBVyvtBeUSLmFRwRFDk0ms,7458
8
+ m3Drop/NormalizationGPU.py,sha256=dePlap2nk85yEo4uUzRUCqTggRBuL16L0bJnAuJHWHI,14760
9
9
  m3Drop/__init__.py,sha256=W_TQ9P8_7Tdsa6kDZ6IJKT0FMkX_JFvBqiP821CZIrk,2180
10
- m3drop-0.4.48.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
11
- m3drop-0.4.48.dist-info/METADATA,sha256=Q3r9QYqBYVTaBqeK_DUs0-Ygt1zkSz4gGY5G5d2XY8M,5248
12
- m3drop-0.4.48.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
- m3drop-0.4.48.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
14
- m3drop-0.4.48.dist-info/RECORD,,
10
+ m3drop-0.4.50.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
11
+ m3drop-0.4.50.dist-info/METADATA,sha256=SHH4ifncxDvHPZoDw86WgAu48dy0BAs99Gr8zuS6ItI,5248
12
+ m3drop-0.4.50.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
+ m3drop-0.4.50.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
14
+ m3drop-0.4.50.dist-info/RECORD,,