PyPI - M3Drop - Versions diffs - 0.4.55__py3-none-any.whl → 0.4.56__py3-none-any.whl - Mend

M3Drop 0.4.55__py3-none-any.whl → 0.4.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

m3Drop/NormalizationCPU.py CHANGED Viewed

@@ -6,6 +6,8 @@ import h5py
 import anndata
 import pandas as pd
 import os
+import matplotlib.pyplot as plt
+import seaborn as sns
 from scipy import sparse
 try:
@@ -14,7 +16,7 @@ except ImportError:
     print("CRITICAL ERROR: 'numba' not found. Please install it (pip install numba).")
     sys.exit(1)
-# [FIX] Strict Relative Import
+# Strict Relative Import
 from .ControlDeviceCPU import ControlDevice
 # ==========================================
@@ -23,25 +25,16 @@ from .ControlDeviceCPU import ControlDevice
 @jit(nopython=True, parallel=True, fastmath=True)
 def pearson_residual_kernel_cpu(counts, tj, ti, theta, total, out_matrix):
-    """
-    Calculates Pearson residuals using Negative Binomial logic.
-    Parallelized across CPU cores.
-    """
     rows = counts.shape[0]
     cols = counts.shape[1]
     for r in prange(rows):
         ti_val = ti[r]
         for c in range(cols):
             count_val = counts[r, c]
             mu = (tj[c] * ti_val) / total
-            # theta is vector of size cols (genes)
             theta_val = theta[c]
             denom_sq = mu + ((mu * mu) / theta_val)
             denom = np.sqrt(denom_sq)
             if denom < 1e-12:
                 out_matrix[r, c] = 0.0
             else:
@@ -49,20 +42,14 @@ def pearson_residual_kernel_cpu(counts, tj, ti, theta, total, out_matrix):
 @jit(nopython=True, parallel=True, fastmath=True)
 def pearson_approx_kernel_cpu(counts, tj, ti, total, out_matrix):
-    """
-    Calculates Approximate Pearson residuals (Poisson limit).
-    """
     rows = counts.shape[0]
     cols = counts.shape[1]
     for r in prange(rows):
         ti_val = ti[r]
         for c in range(cols):
             count_val = counts[r, c]
             mu = (tj[c] * ti_val) / total
             denom = np.sqrt(mu)
             if denom < 1e-12:
                 out_matrix[r, c] = 0.0
             else:
@@ -79,12 +66,14 @@ def NBumiPearsonResidualsCombinedCPU(
     stats_filename: str,
     output_filename_full: str,
     output_filename_approx: str,
+    plot_summary_filename: str = None,
+    plot_detail_filename: str = None,
     mode: str = "auto",
     manual_target: int = 3000
 ):
     """
     CPU-Optimized: Calculates Full and Approximate residuals in a SINGLE PASS.
-    Uses Numba for acceleration on L3-sized dense chunks.
+    Includes "Sidecar" Visualization logic (Streaming Stats + Subsampling).
     """
     start_time = time.perf_counter()
     print(f"FUNCTION: NBumiPearsonResidualsCombinedCPU() | FILE: {raw_filename}")
@@ -99,53 +88,57 @@ def NBumiPearsonResidualsCombinedCPU(
     nc = device.total_rows
     print("Phase [1/2]: Initializing parameters...")
-    # Load parameters
     with open(fit_filename, 'rb') as f: fit = pickle.load(f)
-    with open(stats_filename, 'rb') as f: stats = pickle.load(f)
-    # Common params (Numpy Arrays)
     total = fit['vals']['total']
     tjs = fit['vals']['tjs'].values.astype(np.float64)
     tis = fit['vals']['tis'].values.astype(np.float64)
-    # Specific params
-    sizes = fit['sizes'].values.astype(np.float64) # For Full
+    sizes = fit['sizes'].values.astype(np.float64)
     # Setup Output Files
     adata_in = anndata.read_h5ad(raw_filename, backed='r')
     filtered_var = adata_in.var[mask]
-    # Create skeletons
     adata_out_full = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
     adata_out_full.write_h5ad(output_filename_full, compression=None)
     adata_out_approx = anndata.AnnData(obs=adata_in.obs, var=filtered_var)
     adata_out_approx.write_h5ad(output_filename_approx, compression=None)
-    # --- CHUNK SIZE FIX ---
-    # Calculate appropriate H5 storage chunks
-    storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
+    # --- VISUALIZATION SETUP (THE SIDECAR) ---
+    # 1. Sampling Rate (Strict Cap to prevent CPU RAM explosion)
+    TARGET_SAMPLES = 5_000_000
+    total_points = nc * ng_filtered
-    # [CRITICAL FIX] Clamp chunk size to total rows (nc)
-    if storage_chunk_rows > nc:
-        storage_chunk_rows = nc
+    if total_points <= TARGET_SAMPLES:
+        sampling_rate = 1.0
+    else:
+        sampling_rate = TARGET_SAMPLES / total_points
-    if storage_chunk_rows < 1:
-        storage_chunk_rows = 1
-    # ----------------------
+    print(f"   > Visualization Sampling Rate: {sampling_rate*100:.4f}% (Target: {TARGET_SAMPLES:,} points)")
+    # 2. Accumulators (Numpy Arrays - Small memory footprint)
+    acc_raw_sum    = np.zeros(ng_filtered, dtype=np.float64)
+    acc_approx_sum = np.zeros(ng_filtered, dtype=np.float64)
+    acc_approx_sq  = np.zeros(ng_filtered, dtype=np.float64)
+    acc_full_sum   = np.zeros(ng_filtered, dtype=np.float64)
+    acc_full_sq    = np.zeros(ng_filtered, dtype=np.float64)
+    # 3. Lists for Plots (Sampled Only)
+    viz_approx_samples = []
+    viz_full_samples = []
+    # -----------------------------------------
+    storage_chunk_rows = int(1_000_000_000 / (ng_filtered * 8))
+    if storage_chunk_rows > nc: storage_chunk_rows = nc
+    if storage_chunk_rows < 1: storage_chunk_rows = 1
-    # Open both files for writing simultaneously
     with h5py.File(output_filename_full, 'a') as f_full, h5py.File(output_filename_approx, 'a') as f_approx:
         if 'X' in f_full: del f_full['X']
         if 'X' in f_approx: del f_approx['X']
-        # Float64 output
-        out_x_full = f_full.create_dataset(
-            'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
-        )
-        out_x_approx = f_approx.create_dataset(
-            'X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64'
-        )
+        out_x_full = f_full.create_dataset('X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64')
+        out_x_approx = f_approx.create_dataset('X', shape=(nc, ng_filtered), chunks=(storage_chunk_rows, ng_filtered), dtype='float64')
         with h5py.File(raw_filename, 'r') as f_in:
             h5_indptr = f_in['X']['indptr']
@@ -154,7 +147,6 @@ def NBumiPearsonResidualsCombinedCPU(
             current_row = 0
             while current_row < nc:
-                # Dense mode is faster for Numba
                 end_row = device.get_next_chunk(current_row, mode='dense', overhead_multiplier=3.0)
                 if end_row is None or end_row <= current_row: break
@@ -163,7 +155,6 @@ def NBumiPearsonResidualsCombinedCPU(
                 start_idx, end_idx = h5_indptr[current_row], h5_indptr[end_row]
-                # Load & Filter
                 data = np.array(h5_data[start_idx:end_idx], dtype=np.float64)
                 indices = np.array(h5_indices[start_idx:end_idx])
                 indptr = np.array(h5_indptr[current_row:end_row+1] - h5_indptr[current_row])
@@ -172,8 +163,19 @@ def NBumiPearsonResidualsCombinedCPU(
                 chunk_csr = chunk_csr[:, mask]
                 chunk_csr.data = np.ceil(chunk_csr.data)
-                # Convert to Dense for Numba (faster than sparse iteration for dense ops)
+                # Numba needs dense
                 counts_dense = chunk_csr.toarray()
+                # --- VIZ ACCUMULATION 1: RAW MEAN ---
+                acc_raw_sum += np.sum(counts_dense, axis=0)
+                # --- VIZ SAMPLING: GENERATE INDICES ---
+                chunk_total_items = chunk_size * ng_filtered
+                n_samples_chunk = int(chunk_total_items * sampling_rate)
+                sample_indices = None
+                if n_samples_chunk > 0:
+                    sample_indices = np.random.randint(0, int(chunk_total_items), size=n_samples_chunk)
                 # --- CALC 1: APPROX ---
                 approx_out = np.empty_like(counts_dense)
@@ -184,11 +186,24 @@ def NBumiPearsonResidualsCombinedCPU(
                     total,
                     approx_out
                 )
+                # Accumulate
+                acc_approx_sum += np.sum(approx_out, axis=0)
+                # Sample
+                if sample_indices is not None:
+                    # Ravel creates a view, take copies the data. Safe.
+                    viz_approx_samples.append(np.take(approx_out.ravel(), sample_indices))
+                # Write
                 out_x_approx[current_row:end_row, :] = approx_out
+                # Square (Explicit multiplication for safety)
+                approx_out = approx_out * approx_out
+                acc_approx_sq += np.sum(approx_out, axis=0)
                 del approx_out
                 # --- CALC 2: FULL (In-place on counts_dense) ---
-                # We can reuse the counts_dense buffer for output to save RAM
                 pearson_residual_kernel_cpu(
                     counts_dense,
                     tjs,
@@ -197,11 +212,112 @@ def NBumiPearsonResidualsCombinedCPU(
                     total,
                     counts_dense # Overwrite input
                 )
+                # Accumulate
+                acc_full_sum += np.sum(counts_dense, axis=0)
+                # Sample
+                if sample_indices is not None:
+                    viz_full_samples.append(np.take(counts_dense.ravel(), sample_indices))
+                # Write
                 out_x_full[current_row:end_row, :] = counts_dense
+                # Square
+                counts_dense = counts_dense * counts_dense
+                acc_full_sq += np.sum(counts_dense, axis=0)
                 current_row = end_row
         print(f"\nPhase [2/2]: COMPLETE{' '*50}")
+    # ==========================================
+    #        VIZ GENERATION (POST-PROCESS)
+    # ==========================================
+    if plot_summary_filename and plot_detail_filename:
+        print("Phase [Viz]: Generating Diagnostics (CPU)...")
+        # 1. Finalize Variance Stats
+        mean_raw = acc_raw_sum / nc
+        mean_approx = acc_approx_sum / nc
+        mean_sq_approx = acc_approx_sq / nc
+        var_approx = mean_sq_approx - (mean_approx**2)
+        mean_full = acc_full_sum / nc
+        mean_sq_full = acc_full_sq / nc
+        var_full = mean_sq_full - (mean_full**2)
+        # 2. Finalize Samples
+        if viz_approx_samples:
+            flat_approx = np.concatenate(viz_approx_samples)
+            flat_full   = np.concatenate(viz_full_samples)
+        else:
+            flat_approx = np.array([])
+            flat_full = np.array([])
+        print(f"   > Samples Collected: {len(flat_approx):,} points")
+        # --- FILE 1: SUMMARY (1080p) ---
+        print(f"   > Saving Summary Plot: {plot_summary_filename}")
+        fig1, ax1 = plt.subplots(1, 2, figsize=(16, 7))
+        # Plot 1: Variance Stabilization
+        ax = ax1[0]
+        ax.scatter(mean_raw, var_approx, s=2, alpha=0.5, color='red', label='Approx (Poisson)')
+        ax.scatter(mean_raw, var_full, s=2, alpha=0.5, color='blue', label='Full (NB Pearson)')
+        ax.axhline(1.0, color='black', linestyle='--', linewidth=1)
+        ax.set_xscale('log')
+        ax.set_yscale('log')
+        ax.set_title("Variance Stabilization Check")
+        ax.set_xlabel("Mean Raw Expression (log)")
+        ax.set_ylabel("Variance of Residuals (log)")
+        ax.legend()
+        ax.grid(True, which='both', linestyle='--', alpha=0.5)
+        # Plot 2: Distribution (Histogram + KDE Overlay)
+        ax = ax1[1]
+        if len(flat_approx) > 100:
+            mask_kde = (flat_approx > -10) & (flat_approx < 10)
+            bins = np.linspace(-5, 5, 100)
+            ax.hist(flat_approx[mask_kde], bins=bins, color='red', alpha=0.2, density=True, label='_nolegend_')
+            ax.hist(flat_full[mask_kde], bins=bins, color='blue', alpha=0.2, density=True, label='_nolegend_')
+            sns.kdeplot(flat_approx[mask_kde], fill=False, color='red', linewidth=2, label='Approx', ax=ax, warn_singular=False)
+            sns.kdeplot(flat_full[mask_kde], fill=False, color='blue', linewidth=2, label='Full', ax=ax, warn_singular=False)
+        ax.set_yscale('log')
+        ax.set_ylim(bottom=0.001)
+        ax.set_xlim(-5, 5)
+        ax.set_title("Distribution of Residuals (Log Scale)")
+        ax.set_xlabel("Residual Value")
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+        plt.tight_layout()
+        plt.savefig(plot_summary_filename, dpi=120)
+        plt.close()
+        # --- FILE 2: DETAIL (4K) ---
+        print(f"   > Saving Detail Plot: {plot_detail_filename}")
+        fig2, ax2 = plt.subplots(figsize=(20, 11))
+        if len(flat_approx) > 0:
+            ax2.scatter(flat_approx, flat_full, s=1, alpha=0.5, color='purple')
+            lims = [
+                np.min([ax2.get_xlim(), ax2.get_ylim()]),
+                np.max([ax2.get_xlim(), ax2.get_ylim()]),
+            ]
+            ax2.plot(lims, lims, 'k-', alpha=0.75, zorder=0)
+        ax2.set_title("Residual Shrinkage (Sampled)")
+        ax2.set_xlabel("Approx Residuals")
+        ax2.set_ylabel("Full Residuals")
+        ax2.grid(True, alpha=0.3)
+        plt.tight_layout()
+        plt.savefig(plot_detail_filename, dpi=200)
+        plt.close()
     if hasattr(adata_in, "file") and adata_in.file is not None: adata_in.file.close()
     print(f"Total time: {time.perf_counter() - start_time:.2f} seconds.\n")

{m3drop-0.4.55.dist-info → m3drop-0.4.56.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: M3Drop
-Version: 0.4.55
+Version: 0.4.56
 Summary: A Python implementation of the M3Drop single-cell RNA-seq analysis tool.
 Home-page: https://github.com/PragalvhaSharma/m3DropNew
 Author: Tallulah Andrews

{m3drop-0.4.55.dist-info → m3drop-0.4.56.dist-info}/RECORD RENAMED Viewed

@@ -4,11 +4,11 @@ m3Drop/CoreCPU.py,sha256=csRg5TLQx1Sup7k3lDJm9OO5Oe5-1aC3u_6ldE_GIX8,18679
 m3Drop/CoreGPU.py,sha256=6LToLuWyHxX_7sC2z0Xnvy_qqgmpew5DmnCV0PxmTZQ,19785
 m3Drop/DiagnosticsCPU.py,sha256=l0Imkh3F3zo4ovihUjx7cYWYgzPdztWCN1hcBFO43nY,12943
 m3Drop/DiagnosticsGPU.py,sha256=bsatHyHszgbufneeJvFvHBTLzDuY006nP2yHPHs8s7M,14389
-m3Drop/NormalizationCPU.py,sha256=DmqvjcpHwkNZicEb2GBqTDBVyvtBeUSLmFRwRFDk0ms,7458
+m3Drop/NormalizationCPU.py,sha256=mJUirm2nFRHRCcOgpweh7EayY1E_H-AmwkWbjaa0IFY,12660
 m3Drop/NormalizationGPU.py,sha256=diYWgEutnsyzPBW5GdLFRvi593ogo0EjK3zm1tfTV-o,15397
 m3Drop/__init__.py,sha256=W_TQ9P8_7Tdsa6kDZ6IJKT0FMkX_JFvBqiP821CZIrk,2180
-m3drop-0.4.55.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
-m3drop-0.4.55.dist-info/METADATA,sha256=vIYCrS0eBWJMKi7OJy-8oOohHVa4j6KDMXlwA5jIijc,5248
-m3drop-0.4.55.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-m3drop-0.4.55.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
-m3drop-0.4.55.dist-info/RECORD,,
+m3drop-0.4.56.dist-info/licenses/LICENSE,sha256=44Iqpp8Fc10Xzd5T7cT9UhO31Qftk3gBiCjtpwilP_k,1074
+m3drop-0.4.56.dist-info/METADATA,sha256=6AFDL9jDoz9TvSrnBZ76itcal6gMBC6F6LYvnSS9PXw,5248
+m3drop-0.4.56.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+m3drop-0.4.56.dist-info/top_level.txt,sha256=AEULFEFIgFtAwS-KBlIFoYXrqczX_rwqrEcdK46GIrA,7
+m3drop-0.4.56.dist-info/RECORD,,

{m3drop-0.4.55.dist-info → m3drop-0.4.56.dist-info}/WHEEL RENAMED Viewed

File without changes

{m3drop-0.4.55.dist-info → m3drop-0.4.56.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{m3drop-0.4.55.dist-info → m3drop-0.4.56.dist-info}/top_level.txt RENAMED Viewed

File without changes

M3Drop 0.4.55__py3-none-any.whl → 0.4.56__py3-none-any.whl

M3Drop 0.4.55__py3-none-any.whl → 0.4.56__py3-none-any.whl