M3Drop 0.4.54__py3-none-any.whl → 0.4.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,8 @@ import h5py
6
6
  import anndata
7
7
  import pandas as pd
8
8
  import os
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
9
11
  from scipy import sparse
10
12
 
11
13
  try:
@@ -14,7 +16,7 @@ except ImportError:
14
16
  print("CRITICAL ERROR: 'numba' not found. Please install it (pip install numba).")
15
17
  sys.exit(1)
16
18
 
17
- # [FIX] Strict Relative Import
19
+ # Strict Relative Import
18
20
  from .ControlDeviceCPU import ControlDevice
19
21
 
20
22
  # ==========================================
@@ -23,25 +25,16 @@ from .ControlDeviceCPU import ControlDevice
23
25
 
24
26
  @jit(nopython=True, parallel=True, fastmath=True)
25
27
  def pearson_residual_kernel_cpu(counts, tj, ti, theta, total, out_matrix):
26
- """
27
- Calculates Pearson residuals using Negative Binomial logic.
28
- Parallelized across CPU cores.
29
- """
30
28
  rows = counts.shape[0]
31
29
  cols = counts.shape[1]
32
-
33
30
  for r in prange(rows):
34
31
  ti_val = ti[r]
35
32
  for c in range(cols):
36
33
  count_val = counts[r, c]
37
34
  mu = (tj[c] * ti_val) / total
38
-
39
- # theta is vector of size cols (genes)
40
35
  theta_val = theta[c]
41
-
42
36
  denom_sq = mu + ((mu * mu) / theta_val)
43
37
  denom = np.sqrt(denom_sq)
44
-
45
38
  if denom < 1e-12:
46
39
  out_matrix[r, c] = 0.0
47
40
  else:
@@ -49,20 +42,14 @@ def pearson_residual_kernel_cpu(counts, tj, ti, theta, total, out_matrix):
49
42
 
50
43
  @jit(nopython=True, parallel=True, fastmath=True)
51
44
  def pearson_approx_kernel_cpu(counts, tj, ti, total, out_matrix):
52
- """
53
- Calculates Approximate Pearson residuals (Poisson limit).
54
- """
55
45
  rows = counts.shape[0]
56
46
  cols = counts.shape[1]
57
-
58
47
  for r in prange(rows):
59
48
  ti_val = ti[r]
60
49
  for c in range(cols):
61
50
  count_val = counts[r, c]
62
51
  mu = (tj[c] * ti_val) / total
63
-
64
52
  denom = np.sqrt(mu)
65
-
66
53
  if denom < 1e-12:
67
54
  out_matrix[r, c] = 0.0
68
55
  else:
@@ -79,12 +66,14 @@ def NBumiPearsonResidualsCombinedCPU(
79
66
  stats_filename: str,
80
67
  output_filename_full: str,
81
68
  output_filename_approx: str,
69
+ plot_summary_filename: str = None,
70
+ plot_detail_filename: str = None,
82
71
  mode: str = "auto",
83
72
  manual_target: int = 3000
84
73
  ):
85
74
  """
86
75
  CPU-Optimized: Calculates Full and Approximate residuals in a SINGLE PASS.
87
- Uses Numba for acceleration on L3-sized dense chunks.
76
+ Includes "Sidecar" Visualization logic (Streaming Stats + Subsampling).
88
77
  """
89
78
  start_time = time.perf_counter()
90
79
  print(f"FUNCTION: NBumiPearsonResidualsCombinedCPU() | FILE: {raw_filename}")
@@ -99,53 +88,57 @@ def NBumiPearsonResidualsCombinedCPU(
99
88
  nc = device.total_rows
100
89
 
101
90
  print("Phase [1/2]: Initializing parameters...")
102
- # Load parameters
103
91
  with open(fit_filename, 'rb') as f: fit = pickle.load(f)
104
- with open(stats_filename, 'rb') as f: stats = pickle.load(f)
105
92
 
106
- # Common params (Numpy Arrays)
107
93
  total = fit['vals']['total']
108
94
  tjs = fit['vals']['tjs'].values.astype(np.float64)
109
95
  tis = fit['vals']['tis'].values.astype(np.float64)
110
-
111
- # Specific params
112
- sizes = fit['sizes'].values.astype(np.float64) # For Full
96
+ sizes = fit['sizes'].values.astype(np.float64)
113
97
 
114
98
  # Setup Output Files
115
99
  adata_in = anndata.read_h5ad(raw_filename, backed='r')
116
100
  filtered_var = adata_in.var[mask]
117
101
 
118
- # Create skeletons
119
102
  adata_out_full = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
120
103
  adata_out_full.write_h5ad(output_filename_full, compression=None)
121
104
 
122
105
  adata_out_approx = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
123
106
  adata_out_approx.write_h5ad(output_filename_approx, compression=None)
124
107
 
125
- # --- CHUNK SIZE FIX ---
126
- # Calculate appropriate H5 storage chunks
127
- storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
108
+ # --- VISUALIZATION SETUP (THE SIDECAR) ---
109
+ # 1. Sampling Rate (Strict Cap to prevent CPU RAM explosion)
110
+ TARGET_SAMPLES = 5_000_000
111
+ total_points = nc * ng_filtered
128
112
 
129
- # [CRITICAL FIX] Clamp chunk size to total rows (nc)
130
- if storage_chunk_rows > nc:
131
- storage_chunk_rows = nc
113
+ if total_points <= TARGET_SAMPLES:
114
+ sampling_rate = 1.0
115
+ else:
116
+ sampling_rate = TARGET_SAMPLES / total_points
132
117
 
133
- if storage_chunk_rows < 1:
134
- storage_chunk_rows = 1
135
- # ----------------------
118
+ print(f" > Visualization Sampling Rate: {sampling_rate*100:.4f}% (Target: {TARGET_SAMPLES:,} points)")
119
+
120
+ # 2. Accumulators (Numpy Arrays - Small memory footprint)
121
+ acc_raw_sum = np.zeros(ng_filtered, dtype=np.float64)
122
+ acc_approx_sum = np.zeros(ng_filtered, dtype=np.float64)
123
+ acc_approx_sq = np.zeros(ng_filtered, dtype=np.float64)
124
+ acc_full_sum = np.zeros(ng_filtered, dtype=np.float64)
125
+ acc_full_sq = np.zeros(ng_filtered, dtype=np.float64)
126
+
127
+ # 3. Lists for Plots (Sampled Only)
128
+ viz_approx_samples = []
129
+ viz_full_samples = []
130
+ # -----------------------------------------
131
+
132
+ storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
133
+ if storage_chunk_rows > nc: storage_chunk_rows = nc
134
+ if storage_chunk_rows < 1: storage_chunk_rows = 1
136
135
 
137
- # Open both files for writing simultaneously
138
136
  with h5py.File(output_filename_full, 'a') as f_full, h5py.File(output_filename_approx, 'a') as f_approx:
139
137
  if 'X' in f_full: del f_full['X']
140
138
  if 'X' in f_approx: del f_approx['X']
141
139
 
142
- # Float64 output
143
- out_x_full = f_full.create_dataset(
144
- 'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
145
- )
146
- out_x_approx = f_approx.create_dataset(
147
- 'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
148
- )
140
+ out_x_full = f_full.create_dataset('X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64')
141
+ out_x_approx = f_approx.create_dataset('X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64')
149
142
 
150
143
  with h5py.File(raw_filename, 'r') as f_in:
151
144
  h5_indptr = f_in['X']['indptr']
@@ -154,7 +147,6 @@ def NBumiPearsonResidualsCombinedCPU(
154
147
 
155
148
  current_row = 0
156
149
  while current_row < nc:
157
- # Dense mode is faster for Numba
158
150
  end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=3.0)
159
151
  if end_row is None or end_row <= current_row: break
160
152
 
@@ -163,7 +155,6 @@ def NBumiPearsonResidualsCombinedCPU(
163
155
 
164
156
  start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
165
157
 
166
- # Load & Filter
167
158
  data = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
168
159
  indices = np.array(h5_indices[start_idx:end_idx])
169
160
  indptr = np.array(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
@@ -172,8 +163,19 @@ def NBumiPearsonResidualsCombinedCPU(
172
163
  chunk_csr = chunk_csr[:, mask]
173
164
  chunk_csr.data = np.ceil(chunk_csr.data)
174
165
 
175
- # Convert to Dense for Numba (faster than sparse iteration for dense ops)
166
+ # Numba needs dense
176
167
  counts_dense = chunk_csr.toarray()
168
+
169
+ # --- VIZ ACCUMULATION 1: RAW MEAN ---
170
+ acc_raw_sum += np.sum(counts_dense, axis=0)
171
+
172
+ # --- VIZ SAMPLING: GENERATE INDICES ---
173
+ chunk_total_items = chunk_size * ng_filtered
174
+ n_samples_chunk = int(chunk_total_items * sampling_rate)
175
+ sample_indices = None
176
+
177
+ if n_samples_chunk > 0:
178
+ sample_indices = np.random.randint(0, int(chunk_total_items), size=n_samples_chunk)
177
179
 
178
180
  # --- CALC 1: APPROX ---
179
181
  approx_out = np.empty_like(counts_dense)
@@ -184,11 +186,24 @@ def NBumiPearsonResidualsCombinedCPU(
184
186
  total,
185
187
  approx_out
186
188
  )
189
+
190
+ # Accumulate
191
+ acc_approx_sum += np.sum(approx_out, axis=0)
192
+
193
+ # Sample
194
+ if sample_indices is not None:
195
+ # Ravel creates a view, take copies the data. Safe.
196
+ viz_approx_samples.append(np.take(approx_out.ravel(), sample_indices))
197
+
198
+ # Write
187
199
  out_x_approx[current_row:end_row, :] = approx_out
200
+
201
+ # Square (Explicit multiplication for safety)
202
+ approx_out = approx_out * approx_out
203
+ acc_approx_sq += np.sum(approx_out, axis=0)
188
204
  del approx_out
189
205
 
190
206
  # --- CALC 2: FULL (In-place on counts_dense) ---
191
- # We can reuse the counts_dense buffer for output to save RAM
192
207
  pearson_residual_kernel_cpu(
193
208
  counts_dense,
194
209
  tjs,
@@ -197,11 +212,112 @@ def NBumiPearsonResidualsCombinedCPU(
197
212
  total,
198
213
  counts_dense # Overwrite input
199
214
  )
215
+
216
+ # Accumulate
217
+ acc_full_sum += np.sum(counts_dense, axis=0)
218
+
219
+ # Sample
220
+ if sample_indices is not None:
221
+ viz_full_samples.append(np.take(counts_dense.ravel(), sample_indices))
222
+
223
+ # Write
200
224
  out_x_full[current_row:end_row, :] = counts_dense
201
225
 
226
+ # Square
227
+ counts_dense = counts_dense * counts_dense
228
+ acc_full_sq += np.sum(counts_dense, axis=0)
229
+
202
230
  current_row = end_row
203
231
 
204
232
  print(f"\nPhase [2/2]: COMPLETE{' '*50}")
205
-
233
+
234
+ # ==========================================
235
+ # VIZ GENERATION (POST-PROCESS)
236
+ # ==========================================
237
+ if plot_summary_filename and plot_detail_filename:
238
+ print("Phase [Viz]: Generating Diagnostics (CPU)...")
239
+
240
+ # 1. Finalize Variance Stats
241
+ mean_raw = acc_raw_sum / nc
242
+
243
+ mean_approx = acc_approx_sum / nc
244
+ mean_sq_approx = acc_approx_sq / nc
245
+ var_approx = mean_sq_approx - (mean_approx**2)
246
+
247
+ mean_full = acc_full_sum / nc
248
+ mean_sq_full = acc_full_sq / nc
249
+ var_full = mean_sq_full - (mean_full**2)
250
+
251
+ # 2. Finalize Samples
252
+ if viz_approx_samples:
253
+ flat_approx = np.concatenate(viz_approx_samples)
254
+ flat_full = np.concatenate(viz_full_samples)
255
+ else:
256
+ flat_approx = np.array([])
257
+ flat_full = np.array([])
258
+
259
+ print(f" > Samples Collected: {len(flat_approx):,} points")
260
+
261
+ # --- FILE 1: SUMMARY (1080p) ---
262
+ print(f" > Saving Summary Plot: {plot_summary_filename}")
263
+ fig1, ax1 = plt.subplots(1, 2, figsize=(16, 7))
264
+
265
+ # Plot 1: Variance Stabilization
266
+ ax = ax1[0]
267
+ ax.scatter(mean_raw, var_approx, s=2, alpha=0.5, color='red', label='Approx (Poisson)')
268
+ ax.scatter(mean_raw, var_full, s=2, alpha=0.5, color='blue', label='Full (NB Pearson)')
269
+ ax.axhline(1.0, color='black', linestyle='--', linewidth=1)
270
+ ax.set_xscale('log')
271
+ ax.set_yscale('log')
272
+ ax.set_title("Variance Stabilization Check")
273
+ ax.set_xlabel("Mean Raw Expression (log)")
274
+ ax.set_ylabel("Variance of Residuals (log)")
275
+ ax.legend()
276
+ ax.grid(True, which='both', linestyle='--', alpha=0.5)
277
+
278
+ # Plot 2: Distribution (Histogram + KDE Overlay)
279
+ ax = ax1[1]
280
+ if len(flat_approx) > 100:
281
+ mask_kde = (flat_approx > -10) & (flat_approx < 10)
282
+ bins = np.linspace(-5, 5, 100)
283
+ ax.hist(flat_approx[mask_kde], bins=bins, color='red', alpha=0.2, density=True, label='_nolegend_')
284
+ ax.hist(flat_full[mask_kde], bins=bins, color='blue', alpha=0.2, density=True, label='_nolegend_')
285
+
286
+ sns.kdeplot(flat_approx[mask_kde], fill=False, color='red', linewidth=2, label='Approx', ax=ax, warn_singular=False)
287
+ sns.kdeplot(flat_full[mask_kde], fill=False, color='blue', linewidth=2, label='Full', ax=ax, warn_singular=False)
288
+
289
+ ax.set_yscale('log')
290
+ ax.set_ylim(bottom=0.001)
291
+ ax.set_xlim(-5, 5)
292
+ ax.set_title("Distribution of Residuals (Log Scale)")
293
+ ax.set_xlabel("Residual Value")
294
+ ax.legend()
295
+ ax.grid(True, alpha=0.3)
296
+
297
+ plt.tight_layout()
298
+ plt.savefig(plot_summary_filename, dpi=120)
299
+ plt.close()
300
+
301
+ # --- FILE 2: DETAIL (4K) ---
302
+ print(f" > Saving Detail Plot: {plot_detail_filename}")
303
+ fig2, ax2 = plt.subplots(figsize=(20, 11))
304
+
305
+ if len(flat_approx) > 0:
306
+ ax2.scatter(flat_approx, flat_full, s=1, alpha=0.5, color='purple')
307
+ lims = [
308
+ np.min([ax2.get_xlim(), ax2.get_ylim()]),
309
+ np.max([ax2.get_xlim(), ax2.get_ylim()]),
310
+ ]
311
+ ax2.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
312
+
313
+ ax2.set_title("Residual Shrinkage (Sampled)")
314
+ ax2.set_xlabel("Approx Residuals")
315
+ ax2.set_ylabel("Full Residuals")
316
+ ax2.grid(True, alpha=0.3)
317
+
318
+ plt.tight_layout()
319
+ plt.savefig(plot_detail_filename, dpi=200)
320
+ plt.close()
321
+
206
322
  if hasattr(adata_in, "file") and adata_in.file is not None: adata_in.file.close()
207
323
  print(f"Total time: {time.perf_counter() - start_time:.2f} seconds.\n")
@@ -306,12 +306,12 @@ def NBumiPearsonResidualsCombinedGPU(
306
306
  ax.set_xlabel("Mean Raw Expression (log)")
307
307
  ax.set_ylabel("Variance of Residuals (log)")
308
308
  ax.legend()
309
- ax.grid(True, which='both', linestyle='--', alpha=0.5) # Enhanced Grid
309
+ ax.grid(True, which='both', linestyle='--', alpha=0.5)
310
310
  ax.text(0.5, -0.15, "Goal: Blue dots should form a flat line at y=1",
311
311
  transform=ax.transAxes, ha='center', fontsize=9,
312
312
  bbox=dict(facecolor='#f0f0f0', edgecolor='black', alpha=0.7))
313
313
 
314
- # Plot 3: Distribution (Histogram + KDE Overlay)
314
+ # Plot 3: Distribution (Histogram + KDE Overlay) - LOG SCALE FIXED
315
315
  ax = ax1[1]
316
316
  if len(flat_approx) > 100:
317
317
  mask_kde = (flat_approx > -10) & (flat_approx < 10)
@@ -325,8 +325,10 @@ def NBumiPearsonResidualsCombinedGPU(
325
325
  sns.kdeplot(flat_approx[mask_kde], fill=False, color='red', linewidth=2, label='Approx', ax=ax, warn_singular=False)
326
326
  sns.kdeplot(flat_full[mask_kde], fill=False, color='blue', linewidth=2, label='Full', ax=ax, warn_singular=False)
327
327
 
328
+ ax.set_yscale('log') # Critical fix: log scale makes the distribution tails visible
329
+ ax.set_ylim(bottom=0.001) # Safety floor for log(0)
328
330
  ax.set_xlim(-5, 5)
329
- ax.set_title("Distribution of Residuals")
331
+ ax.set_title("Distribution of Residuals (Log Scale)")
330
332
  ax.set_xlabel("Residual Value")
331
333
  ax.legend()
332
334
  ax.grid(True, alpha=0.3)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: M3Drop
3
- Version: 0.4.54
3
+ Version: 0.4.56
4
4
  Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
5
5
  Home-page: https://github.com/PragalvhaSharma/m3DropNew
6
6
  Author: Tallulah Andrews
@@ -4,11 +4,11 @@ m3Drop/CoreCPU.py,sha256=csRg5TLQx1Sup7k3lDJm9OO5Oe5-1aC3u_6ldE_GIX8,18679
4
4
  m3Drop/CoreGPU.py,sha256=6LToLuWyHxX_7sC2z0Xnvy_qqgmpew5DmnCV0PxmTZQ,19785
5
5
  m3Drop/DiagnosticsCPU.py,sha256=l0Imkh3F3zo4ovihUjx7cYWYgzPdztWCN1hcBFO43nY,12943
6
6
  m3Drop/DiagnosticsGPU.py,sha256=bsatHyHszgbufneeJvFvHBTLzDuY006nP2yHPHs8s7M,14389
7
- m3Drop/NormalizationCPU.py,sha256=DmqvjcpHwkNZicEb2GBqTDBVyvtBeUSLmFRwRFDk0ms,7458
8
- m3Drop/NormalizationGPU.py,sha256=Y6RxfK6L907EQq8PoQADLmpMlXS_Vl1QiitqehgGcAw,15250
7
+ m3Drop/NormalizationCPU.py,sha256=mJUirm2nFRHRCcOgpweh7EayY1E_H-AmwkWbjaa0IFY,12660
8
+ m3Drop/NormalizationGPU.py,sha256=diYWgEutnsyzPBW5GdLFRvi593ogo0EjK3zm1tfTV-o,15397
9
9
  m3Drop/__init__.py,sha256=W_TQ9P8_7Tdsa6kDZ6IJKT0FMkX_JFvBqiP821CZIrk,2180
10
- m3drop-0.4.54.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
11
- m3drop-0.4.54.dist-info/METADATA,sha256=_EO2kAiscwMTDamIgELDn5qsv3G3sPCcUHP6QnFxYZc,5248
12
- m3drop-0.4.54.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
- m3drop-0.4.54.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
14
- m3drop-0.4.54.dist-info/RECORD,,
10
+ m3drop-0.4.56.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
11
+ m3drop-0.4.56.dist-info/METADATA,sha256=6AFDL9jDoz9TvSrnBZ76itcal6gMBC6F6LYvnSS9PXw,5248
12
+ m3drop-0.4.56.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
+ m3drop-0.4.56.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
14
+ m3drop-0.4.56.dist-info/RECORD,,