M3Drop 0.4.49__py3-none-any.whl → 0.4.51__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -6,10 +6,13 @@ import h5py
6
6
  import anndata
7
7
  import pandas as pd
8
8
  import os
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
9
11
 
10
12
  try:
11
13
  import cupy
12
14
  from cupy.sparse import csr_matrix as cp_csr_matrix
15
+ import cupyx
13
16
  HAS_GPU = True
14
17
  except ImportError:
15
18
  cupy = None
@@ -19,7 +22,6 @@ except ImportError:
19
22
  try:
20
23
  from .ControlDeviceGPU import ControlDevice
21
24
  except ImportError:
22
- # Fallback for direct script execution (debugging)
23
25
  try:
24
26
  from ControlDeviceGPU import ControlDevice
25
27
  except ImportError:
@@ -58,11 +60,14 @@ def NBumiPearsonResidualsCombinedGPU(
58
60
  stats_filename: str,
59
61
  output_filename_full: str,
60
62
  output_filename_approx: str,
63
+ plot_summary_filename: str = None,
64
+ plot_detail_filename: str = None,
61
65
  mode: str = "auto",
62
66
  manual_target: int = 3000
63
67
  ):
64
68
  """
65
- UPGRADED: Calculates Full and Approximate residuals in a SINGLE PASS.
69
+ Calculates Full and Approximate residuals in a SINGLE PASS.
70
+ Includes "Sidecar" Visualization logic (Streaming Stats + Subsampling).
66
71
  """
67
72
  start_time = time.perf_counter()
68
73
  print(f"FUNCTION: NBumiPearsonResidualsCombined() | FILE: {raw_filename}")
@@ -73,22 +78,22 @@ def NBumiPearsonResidualsCombinedGPU(
73
78
  ng_filtered = int(cupy.sum(mask_gpu))
74
79
 
75
80
  # 2. Manual Init
76
- with h5py.File(raw_filename, 'r') as f: indptr_cpu = f['X']['indptr'][:]; total_rows = len(indptr_cpu) - 1
81
+ with h5py.File(raw_filename, 'r') as f:
82
+ indptr_cpu = f['X']['indptr'][:]
83
+ total_rows = len(indptr_cpu) - 1
84
+
77
85
  device = ControlDevice(indptr=indptr_cpu, total_rows=total_rows, n_genes=ng_filtered, mode=mode, manual_target=manual_target)
78
86
  nc = device.total_rows
79
87
 
80
88
  print("Phase [1/2]: Initializing parameters...")
81
- # Load parameters for both calculations
89
+ # Load parameters
82
90
  with open(fit_filename, 'rb') as f: fit = pickle.load(f)
83
- with open(stats_filename, 'rb') as f: stats = pickle.load(f)
84
91
 
85
92
  # Common params
86
93
  total = fit['vals']['total']
87
94
  tjs_gpu = cupy.asarray(fit['vals']['tjs'].values, dtype=cupy.float64)
88
95
  tis_gpu = cupy.asarray(fit['vals']['tis'].values, dtype=cupy.float64)
89
-
90
- # Specific params
91
- sizes_gpu = cupy.asarray(fit['sizes'].values, dtype=cupy.float64) # For Full
96
+ sizes_gpu = cupy.asarray(fit['sizes'].values, dtype=cupy.float64)
92
97
 
93
98
  # Setup Output Files
94
99
  adata_in = anndata.read_h5ad(raw_filename, backed='r')
@@ -101,29 +106,46 @@ def NBumiPearsonResidualsCombinedGPU(
101
106
  adata_out_approx = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
102
107
  adata_out_approx.write_h5ad(output_filename_approx, compression=None)
103
108
 
104
- # --- CHUNK SIZE FIX ---
105
- # Calculate rows needed to fill ~1GB
106
- storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
109
+ # --- VISUALIZATION SETUP (THE SIDECAR) ---
110
+ # 1. Sampling Rate (Target 5 Million Max)
111
+ TARGET_SAMPLES = 5_000_000
112
+ total_points = nc * ng_filtered
107
113
 
108
- # [CRITICAL FIX] Clamp chunk size to total rows (nc)
109
- if storage_chunk_rows > nc:
110
- storage_chunk_rows = nc
114
+ if total_points <= TARGET_SAMPLES:
115
+ sampling_rate = 1.0 # Take everything
116
+ else:
117
+ sampling_rate = TARGET_SAMPLES / total_points
111
118
 
112
- if storage_chunk_rows < 1:
113
- storage_chunk_rows = 1
114
- # ----------------------
119
+ print(f" > Visualization Sampling Rate: {sampling_rate*100:.4f}% (Target: {TARGET_SAMPLES:,} points)")
120
+
121
+ # 2. Accumulators for Plot 1 (Variance) - EXACT MATH
122
+ # We need Sum(x) and Sum(x^2) for: Raw, Approx, Full
123
+ acc_raw_sum = cupy.zeros(ng_filtered, dtype=cupy.float64)
124
+ # acc_raw_sq = cupy.zeros(ng_filtered, dtype=cupy.float64) # Not strictly needed for Mean X-axis, but good for completeness. Skipping to save VRAM.
125
+
126
+ acc_approx_sum = cupy.zeros(ng_filtered, dtype=cupy.float64)
127
+ acc_approx_sq = cupy.zeros(ng_filtered, dtype=cupy.float64)
115
128
 
116
- # Open both files for writing simultaneously
129
+ acc_full_sum = cupy.zeros(ng_filtered, dtype=cupy.float64)
130
+ acc_full_sq = cupy.zeros(ng_filtered, dtype=cupy.float64)
131
+
132
+ # 3. Lists for Plots 2 & 3 (Scatter/KDE) - SAMPLED
133
+ viz_approx_samples = []
134
+ viz_full_samples = []
135
+ # -----------------------------------------
136
+
137
+ # Storage Chunk Calc
138
+ storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
139
+ if storage_chunk_rows > nc: storage_chunk_rows = nc
140
+ if storage_chunk_rows < 1: storage_chunk_rows = 1
141
+
142
+ # Open files
117
143
  with h5py.File(output_filename_full, 'a') as f_full, h5py.File(output_filename_approx, 'a') as f_approx:
118
144
  if 'X' in f_full: del f_full['X']
119
145
  if 'X' in f_approx: del f_approx['X']
120
146
 
121
- out_x_full = f_full.create_dataset(
122
- 'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
123
- )
124
- out_x_approx = f_approx.create_dataset(
125
- 'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
126
- )
147
+ out_x_full = f_full.create_dataset('X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64')
148
+ out_x_approx = f_approx.create_dataset('X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64')
127
149
 
128
150
  with h5py.File(raw_filename, 'r') as f_in:
129
151
  h5_indptr = f_in['X']['indptr']
@@ -132,7 +154,8 @@ def NBumiPearsonResidualsCombinedGPU(
132
154
 
133
155
  current_row = 0
134
156
  while current_row < nc:
135
- end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=3.0) # Higher overhead for double write
157
+ # [SAFE MODE] Multiplier 3.0 is safe for Index Sampling
158
+ end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=3.0)
136
159
  if end_row is None or end_row <= current_row: break
137
160
 
138
161
  chunk_size = end_row - current_row
@@ -140,7 +163,7 @@ def NBumiPearsonResidualsCombinedGPU(
140
163
 
141
164
  start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
142
165
 
143
- # Load & Filter
166
+ # Load Raw
144
167
  data_gpu_raw = cupy.asarray(h5_data[start_idx:end_idx], dtype=cupy.float64)
145
168
  indices_gpu_raw = cupy.asarray(h5_indices[start_idx:end_idx])
146
169
  indptr_gpu_raw = cupy.asarray(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
@@ -154,7 +177,24 @@ def NBumiPearsonResidualsCombinedGPU(
154
177
  del chunk_gpu, data_gpu_raw, indices_gpu_raw, indptr_gpu_raw
155
178
  cupy.get_default_memory_pool().free_all_blocks()
156
179
 
157
- # --- CALC 1: APPROX (Cheaper, do first) ---
180
+ # --- VIZ ACCUMULATION 1: RAW MEAN ---
181
+ # Add raw sums to accumulator (column-wise sum)
182
+ acc_raw_sum += cupy.sum(counts_dense, axis=0)
183
+
184
+ # --- VIZ SAMPLING: GENERATE INDICES ---
185
+ # We pick indices NOW so we can grab the same points from both Approx and Full
186
+ chunk_total_items = chunk_size * ng_filtered
187
+ n_samples_chunk = int(chunk_total_items * sampling_rate)
188
+
189
+ if n_samples_chunk > 0:
190
+ # Index Sampling: Zero VRAM overhead compared to Masking
191
+ # Use flatten indices
192
+ # [FIXED LINE BELOW] Added int() cast for safety
193
+ sample_indices = cupy.random.choice(int(chunk_total_items), size=n_samples_chunk, replace=False)
194
+ else:
195
+ sample_indices = None
196
+
197
+ # --- CALC 1: APPROX ---
158
198
  approx_out = cupy.empty_like(counts_dense)
159
199
  pearson_approx_kernel(
160
200
  counts_dense,
@@ -163,10 +203,22 @@ def NBumiPearsonResidualsCombinedGPU(
163
203
  total,
164
204
  approx_out
165
205
  )
206
+
207
+ # [VIZ UPDATE: APPROX]
208
+ acc_approx_sum += cupy.sum(approx_out, axis=0)
209
+ acc_approx_sq += cupy.sum(approx_out**2, axis=0)
210
+
211
+ if sample_indices is not None:
212
+ # Flatten temporarily to sample, then return to CPU
213
+ # Note: take() returns a new array, small size
214
+ sampled_vals = approx_out.ravel().take(sample_indices)
215
+ viz_approx_samples.append(cupy.asnumpy(sampled_vals))
216
+
217
+ # [DISK WRITE: APPROX]
166
218
  out_x_approx[current_row:end_row, :] = approx_out.get()
167
219
  del approx_out
168
220
 
169
- # --- CALC 2: FULL (In-place on counts_dense to save VRAM) ---
221
+ # --- CALC 2: FULL (In-place) ---
170
222
  pearson_residual_kernel(
171
223
  counts_dense,
172
224
  tjs_gpu,
@@ -175,13 +227,130 @@ def NBumiPearsonResidualsCombinedGPU(
175
227
  total,
176
228
  counts_dense # Overwrite input
177
229
  )
230
+
231
+ # [VIZ UPDATE: FULL]
232
+ acc_full_sum += cupy.sum(counts_dense, axis=0)
233
+ acc_full_sq += cupy.sum(counts_dense**2, axis=0)
234
+
235
+ if sample_indices is not None:
236
+ sampled_vals = counts_dense.ravel().take(sample_indices)
237
+ viz_full_samples.append(cupy.asnumpy(sampled_vals))
238
+
239
+ # [DISK WRITE: FULL]
178
240
  out_x_full[current_row:end_row, :] = counts_dense.get()
179
241
 
180
- del counts_dense
242
+ del counts_dense, sample_indices
181
243
  cupy.get_default_memory_pool().free_all_blocks()
182
244
  current_row = end_row
183
245
 
184
246
  print(f"\nPhase [2/2]: COMPLETE{' '*50}")
247
+
248
+ # ==========================================
249
+ # VIZ GENERATION (POST-PROCESS)
250
+ # ==========================================
251
+ if plot_summary_filename and plot_detail_filename:
252
+ print("Phase [Viz]: Generating Diagnostics...")
253
+
254
+ # 1. Finalize Variance Stats (GPU -> CPU)
255
+ # Var = E[X^2] - (E[X])^2
256
+ # Mean = Sum / N
257
+
258
+ # Pull everything to CPU once
259
+ raw_sum = cupy.asnumpy(acc_raw_sum)
260
+
261
+ approx_sum = cupy.asnumpy(acc_approx_sum)
262
+ approx_sq = cupy.asnumpy(acc_approx_sq)
263
+
264
+ full_sum = cupy.asnumpy(acc_full_sum)
265
+ full_sq = cupy.asnumpy(acc_full_sq)
266
+
267
+ # Calculate
268
+ mean_raw = raw_sum / nc
269
+
270
+ mean_approx = approx_sum / nc
271
+ mean_sq_approx = approx_sq / nc
272
+ var_approx = mean_sq_approx - (mean_approx**2)
273
+
274
+ mean_full = full_sum / nc
275
+ mean_sq_full = full_sq / nc
276
+ var_full = mean_sq_full - (mean_full**2)
277
+
278
+ # 2. Finalize Samples
279
+ if viz_approx_samples:
280
+ flat_approx = np.concatenate(viz_approx_samples)
281
+ flat_full = np.concatenate(viz_full_samples)
282
+ else:
283
+ flat_approx = np.array([])
284
+ flat_full = np.array([])
285
+
286
+ print(f" > Samples Collected: {len(flat_approx):,} points")
287
+
288
+ # --- FILE 1: SUMMARY (1080p) ---
289
+ print(f" > Saving Summary Plot: {plot_summary_filename}")
290
+ fig1, ax1 = plt.subplots(1, 2, figsize=(16, 7)) # 16x7 inches ~ 1080p aspect
291
+
292
+ # Plot 1: Variance Stabilization
293
+ ax = ax1[0]
294
+ ax.scatter(mean_raw, var_approx, s=2, alpha=0.5, color='red', label='Approx (Poisson)')
295
+ ax.scatter(mean_raw, var_full, s=2, alpha=0.5, color='blue', label='Full (NB Pearson)')
296
+ ax.axhline(1.0, color='black', linestyle='--', linewidth=1)
297
+ ax.set_xscale('log')
298
+ ax.set_yscale('log')
299
+ ax.set_title("Variance Stabilization Check")
300
+ ax.set_xlabel("Mean Raw Expression (log)")
301
+ ax.set_ylabel("Variance of Residuals (log)")
302
+ ax.legend()
303
+ ax.grid(True, alpha=0.3)
304
+ ax.text(0.5, -0.15, "Goal: Blue dots should form a flat line at y=1",
305
+ transform=ax.transAxes, ha='center', fontsize=9,
306
+ bbox=dict(facecolor='#f0f0f0', edgecolor='black', alpha=0.7))
307
+
308
+ # Plot 3: Distribution
309
+ ax = ax1[1]
310
+ if len(flat_approx) > 100:
311
+ # Clip for cleaner KDE
312
+ mask_kde = (flat_approx > -10) & (flat_approx < 10)
313
+ sns.kdeplot(flat_approx[mask_kde], fill=True, color='red', alpha=0.3, label='Approx', ax=ax, warn_singular=False)
314
+ sns.kdeplot(flat_full[mask_kde], fill=True, color='blue', alpha=0.3, label='Full', ax=ax, warn_singular=False)
315
+ ax.set_xlim(-5, 5)
316
+ ax.set_title("Distribution of Residuals")
317
+ ax.set_xlabel("Residual Value")
318
+ ax.legend()
319
+ ax.grid(True, alpha=0.3)
320
+ ax.text(0.5, -0.15, "Goal: Blue curve should be tighter (narrower) than Red",
321
+ transform=ax.transAxes, ha='center', fontsize=9,
322
+ bbox=dict(facecolor='#f0f0f0', edgecolor='black', alpha=0.7))
323
+
324
+ plt.tight_layout()
325
+ plt.savefig(plot_summary_filename, dpi=120) # 120 DPI * 16 inch = 1920 width
326
+ plt.close()
327
+
328
+ # --- FILE 2: DETAIL (4K) ---
329
+ print(f" > Saving Detail Plot: {plot_detail_filename}")
330
+ fig2, ax2 = plt.subplots(figsize=(20, 11)) # 20x11 inches ~ 4K aspect
331
+
332
+ if len(flat_approx) > 0:
333
+ ax2.scatter(flat_approx, flat_full, s=1, alpha=0.5, color='purple')
334
+
335
+ # Diagonal line
336
+ lims = [
337
+ np.min([ax2.get_xlim(), ax2.get_ylim()]),
338
+ np.max([ax2.get_xlim(), ax2.get_ylim()]),
339
+ ]
340
+ ax2.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
341
+
342
+ ax2.set_title("Residual Shrinkage (Sampled)")
343
+ ax2.set_xlabel("Approx Residuals")
344
+ ax2.set_ylabel("Full Residuals")
345
+ ax2.grid(True, alpha=0.3)
346
+ ax2.text(0.5, -0.1, "Goal: Points below diagonal = Dispersion Penalty Working",
347
+ transform=ax2.transAxes, ha='center', fontsize=12,
348
+ bbox=dict(facecolor='#f0f0f0', edgecolor='black', alpha=0.7))
349
+
350
+ plt.tight_layout()
351
+ plt.savefig(plot_detail_filename, dpi=200) # 200 DPI * 20 inch = 4000 width (4Kish)
352
+ plt.close()
353
+
185
354
 
186
355
  if hasattr(adata_in, "file") and adata_in.file is not None: adata_in.file.close()
187
356
  print(f"Total time: {time.perf_counter() - start_time:.2f} seconds.\n")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: M3Drop
3
- Version: 0.4.49
3
+ Version: 0.4.51
4
4
  Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
5
5
  Home-page: https://github.com/PragalvhaSharma/m3DropNew
6
6
  Author: Tallulah Andrews
@@ -5,10 +5,10 @@ m3Drop/CoreGPU.py,sha256=6LToLuWyHxX_7sC2z0Xnvy_qqgmpew5DmnCV0PxmTZQ,19785
5
5
  m3Drop/DiagnosticsCPU.py,sha256=l0Imkh3F3zo4ovihUjx7cYWYgzPdztWCN1hcBFO43nY,12943
6
6
  m3Drop/DiagnosticsGPU.py,sha256=bsatHyHszgbufneeJvFvHBTLzDuY006nP2yHPHs8s7M,14389
7
7
  m3Drop/NormalizationCPU.py,sha256=DmqvjcpHwkNZicEb2GBqTDBVyvtBeUSLmFRwRFDk0ms,7458
8
- m3Drop/NormalizationGPU.py,sha256=Kl5QvR4HCSgooUOf97-nu53J6wf3apdNvc3BFlTFiEM,7264
8
+ m3Drop/NormalizationGPU.py,sha256=JmXyRWaN64IrSJscuowmnQEjCjuxbk6gwFIs6wgO-Ps,14835
9
9
  m3Drop/__init__.py,sha256=W_TQ9P8_7Tdsa6kDZ6IJKT0FMkX_JFvBqiP821CZIrk,2180
10
- m3drop-0.4.49.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
11
- m3drop-0.4.49.dist-info/METADATA,sha256=aUj_G6pHzrSKr70GwbOptvcAP7HCGviG8hYjq6OiqMk,5248
12
- m3drop-0.4.49.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
- m3drop-0.4.49.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
14
- m3drop-0.4.49.dist-info/RECORD,,
10
+ m3drop-0.4.51.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
11
+ m3drop-0.4.51.dist-info/METADATA,sha256=YnC3WlbNfbpNbajZqd4ENA6sbrTr5bCIgZ7o1Y7cXOA,5248
12
+ m3drop-0.4.51.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
+ m3drop-0.4.51.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
14
+ m3drop-0.4.51.dist-info/RECORD,,