PyPI - pythonflex - Versions diffs - 0.1.5__tar.gz → 0.1.6__tar.gz - Mend

pythonflex 0.1.5 (tar.gz) → 0.1.6 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

pythonflex-0.1.6/.vscode/settings.json ADDED Viewed

@@ -0,0 +1,5 @@
+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}

{pythonflex-0.1.5 → pythonflex-0.1.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pythonflex
-Version: 0.1.5
+Version: 0.1.6
 Summary: pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data.
 Author-email: Yasir Demirtaş <tyasird@hotmail.com>
 Requires-Python: >=3.9

{pythonflex-0.1.5 → pythonflex-0.1.6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "pythonflex"
-version = "0.1.5"
+version = "0.1.6"
 description = "pythonFLEX is a benchmarking toolkit for evaluating CRISPR screen results against biological gold standards. The toolkit computes gene-level and complex-level performance metrics, helping researchers systematically assess the biological relevance and resolution of their CRISPR screening data."
 readme = "README.md"
 authors = [

{pythonflex-0.1.5 → pythonflex-0.1.6}/src/pythonflex/analysis.py RENAMED Viewed

@@ -172,15 +172,14 @@ def pra(dataset_name, matrix, is_corr=False):
         pr_auc = metrics.auc(recall, precision)
         df["precision"] = precision
         df["recall"] = recall
     log.info(f"PR-AUC: {pr_auc:.4f}, Number of true positives: {df['prediction'].sum()}")
     dsave(df, "pra", dataset_name)
     dsave(pr_auc, "pr_auc", dataset_name)
-    log.done(f"Global PRA completed for {dataset_name}")
-    return df, pr_auc
+    dsave( _corrected_auc(df) , "corrected_pr_auc", dataset_name)
+    log.done(f"Global PRA completed for {dataset_name}")
+    return df
@@ -189,6 +188,9 @@ def pra(dataset_name, matrix, is_corr=False):
 # helper functions for PRA per-complex analysis
 # --------------------------------------------------------------------------
+def _corrected_auc(df: pd.DataFrame) -> float:
+    return np.trapz(df["precision"], df["recall"]) - df["precision"].iloc[-1]
 def _build_gene_to_pair_indices(pairwise_df):
     indices = pairwise_df.index.values
     genes = pd.concat([pairwise_df['gene1'], pairwise_df['gene2']], ignore_index=True)
@@ -240,10 +242,15 @@ def _dump_pairwise_memmap(df: pd.DataFrame, tag: str) -> Path:
-def _init_worker(memmap_path, gene_to_pair_indices):
+# Global variables for worker processes (compatible with older joblib)
+PAIRWISE_DF = None
+GENE2IDX = None
+def _init_worker_globals(memmap_path, gene_to_pair_indices):
+    """Initialize global variables for worker processes"""
     global PAIRWISE_DF, GENE2IDX
     PAIRWISE_DF = load(memmap_path)
-    GENE2IDX    = gene_to_pair_indices
+    GENE2IDX = gene_to_pair_indices
@@ -263,42 +270,52 @@ def delete_memmap(memmap_path, log, wait_seconds=0.1):
 # --------------------------------------------------------------------------
 # Process each chunk of terms
 # --------------------------------------------------------------------------
-def _process_chunk(chunk_terms, min_genes):
-    pairwise_df = PAIRWISE_DF
-    gene_to_pair_indices = GENE2IDX
-    local_auc_scores = {}
-    for idx, row in chunk_terms.iterrows():
-        gene_set = set(row.used_genes)
-        if len(gene_set) < min_genes:
-            continue
+def _process_chunk(chunk_terms, min_genes, memmap_path, gene_to_pair_indices):
+    """Process a chunk of terms - compatible with older joblib versions"""
+    try:
+        # Load data in each worker (compatible with older joblib)
+        pairwise_df = load(memmap_path)
+        local_auc_scores = {}
+        local_corrected_auc_scores = {}
+        for idx, row in chunk_terms.iterrows():
+            gene_set = set(row.used_genes)
+            if len(gene_set) < min_genes:
+                continue
-        candidate_indices = bitarray(len(pairwise_df))
-        for g in gene_set:
-            if g in gene_to_pair_indices:
-                candidate_indices[gene_to_pair_indices[g]] = True
-        if not candidate_indices.any():
-            continue
+            candidate_indices = bitarray(len(pairwise_df))
+            for g in gene_set:
+                if g in gene_to_pair_indices:
+                    candidate_indices[gene_to_pair_indices[g]] = True
+            if not candidate_indices.any():
+                continue
-        selected = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
-        sub_df   = pairwise_df.iloc[selected]
+            selected = np.unpackbits(candidate_indices).view(bool)[:len(pairwise_df)]
+            sub_df   = pairwise_df.iloc[selected]
-        complex_id = str(idx)
-        pattern    = r'(?:^|;)' + re.escape(complex_id) + r'(?:;|$)'
-        true_label = sub_df["complex_ids"].str.contains(pattern, regex=True).astype(int)
-        mask       = (sub_df["complex_ids"] == "") | (true_label == 1)
-        preds      = true_label[mask]
+            complex_id = str(idx)
+            pattern    = r'(?:^|;)' + re.escape(complex_id) + r'(?:;|$)'
+            true_label = sub_df["complex_ids"].str.contains(pattern, regex=True).astype(int)
+            mask       = (sub_df["complex_ids"] == "") | (true_label == 1)
+            preds      = true_label[mask]
-        if preds.sum() == 0:
-            continue
+            if preds.sum() == 0:
+                continue
-        tp_cum   = preds.cumsum()
-        precision = tp_cum / (np.arange(len(preds)) + 1)
-        recall    = tp_cum / tp_cum.iloc[-1]
-        if len(recall) >= 2 and recall.iloc[-1] != 0:
-            local_auc_scores[idx] = metrics.auc(recall, precision)
+            tp_cum   = preds.cumsum()
+            precision = tp_cum / (np.arange(len(preds)) + 1)
+            recall    = tp_cum / tp_cum.iloc[-1]
+            if len(recall) >= 2 and recall.iloc[-1] != 0:
+                # Compute regular AUC
+                local_auc_scores[idx] = metrics.auc(recall, precision)
+                # Compute corrected AUC using the same logic as _corrected_auc function
+                local_corrected_auc_scores[idx] = np.trapz(precision, recall) - precision.iloc[-1]
-    return local_auc_scores
+        return {'auc': local_auc_scores, 'corrected_auc': local_corrected_auc_scores}
+    except Exception as e:
+        # Return error info for debugging
+        return {'error': str(e), 'chunk_size': len(chunk_terms)}
@@ -345,26 +362,23 @@ def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
     results = None
     try:
-        # Simplified parallel execution without progress callback interference
+        # Compatible parallel execution for older joblib versions
         log.started("Processing chunks in parallel")
-        with tqdm(total=len(chunks), desc="Per-complex PRA") as pbar:
-            results = Parallel(
-                n_jobs=8,
-                temp_folder=os.path.dirname(memmap_path),
-                max_nbytes=None,
-                mmap_mode="r",
-                initializer=_init_worker,
-                initargs=(memmap_path, gene_to_pair_indices),
-                verbose=0  # Reduce joblib verbosity
-            )(delayed(_process_chunk)(chunk, min_genes) for chunk in chunks)
-            # Update progress bar once all tasks are complete
-            pbar.update(len(chunks))
+        # Use a more conservative approach with older joblib
+        results = Parallel(
+            n_jobs=min(4, len(chunks)),  # Limit to 4 workers or number of chunks
+            temp_folder=os.path.dirname(memmap_path),
+            max_nbytes='100M',  # Set memory limit
+            verbose=1  # Show progress
+        )(delayed(_process_chunk)(chunk, min_genes, memmap_path, gene_to_pair_indices)
+          for chunk in tqdm(chunks, desc="Per-complex PRA"))
         log.done("Processing chunks in parallel")
     except Exception as e:
         log.error(f"Error during parallel processing: {e}")
+        log.error(f"Error type: {type(e).__name__}")
         # Still try to clean up the memmap file
         try:
             if os.path.exists(memmap_path):
@@ -383,19 +397,29 @@ def pra_percomplex(dataset_name, matrix, is_corr=False, chunk_size=200):
         except OSError as e:
             log.warning(f"Failed to remove memmap file {memmap_path}: {e}")
-    # Merge results with error handling
+    # Merge results with enhanced error handling
     auc_scores = {}
+    corrected_auc_scores = {}
     if results:
-        for res in results:
+        for i, res in enumerate(results):
             if isinstance(res, dict):
-                auc_scores.update(res)
-            elif isinstance(res, tuple) and res[0] is None:
-                log.error(res[1])  # Log the error message from the chunk
+                if 'error' in res:
+                    log.error(f"Error in chunk {i}: {res['error']}")
+                elif 'auc' in res and 'corrected_auc' in res:
+                    # New format with both AUC types
+                    auc_scores.update(res['auc'])
+                    corrected_auc_scores.update(res['corrected_auc'])
+                else:
+                    # Fallback for old format (backward compatibility)
+                    auc_scores.update(res)
+            elif isinstance(res, tuple) and len(res) >= 2 and res[0] is None:
+                log.error(f"Chunk {i} error: {res[1]}")
             else:
-                log.error(f"Ignoring unexpected chunk result: {res}")
+                log.warning(f"Unexpected result type from chunk {i}: {type(res)} - {res}")
     # Add the computed AUC scores to the terms DataFrame.
     terms["auc_score"] = pd.Series(auc_scores)
+    terms["corrected_auc_score"] = pd.Series(corrected_auc_scores)
     terms.drop(columns=["hash"], inplace=True)
     dsave(terms, "pra_percomplex", dataset_name)
     log.done(f"Per-complex PRA completed.")
@@ -1296,4 +1320,3 @@ def save_results_to_csv(categories = ["complex_contributions", "pr_auc", "pra_pe
 #     dsave(pr_auc, "pr_auc", dataset_name)
 #     log.done(f"Global PRA completed for {dataset_name}")
 #     return df, pr_auc

{pythonflex-0.1.5 → pythonflex-0.1.6}/src/pythonflex/examples/basic_usage.py RENAMED Viewed

@@ -6,16 +6,21 @@ Demonstrates initialization, data loading, analysis, and plotting.
 import pythonflex as flex
 inputs = {
-    "SNF": {
-        "path":  "C:/Users/yd/Desktop/projects/datasets/fused_similarity_network.csv",
+    "Melanoma (63 Screens)": {
+        "path": flex.get_example_data_path("melanoma_cell_lines_500_genes.csv"),
         "sort": "high"
     },
-    "miss_SNF": {
-        "path":  "C:/Users/yd/Desktop/projects/datasets/miss_snf_fused_similarity_network.csv",
+    "Liver (24 Screens)": {
+        "path": flex.get_example_data_path("liver_cell_lines_500_genes.csv"),
         "sort": "high"
-    }
+    },
+    "Neuroblastoma (37 Screens)": {
+        "path": flex.get_example_data_path("neuroblastoma_cell_lines_500_genes.csv"),
+        "sort": "high"
+    },
 }
 #%%
 default_config = {
@@ -51,8 +56,8 @@ terms, genes_in_terms = flex.load_gold_standard()
 #%%
 # Run analysis
 for name, dataset in data.items():
-    df, pr_auc = flex.pra(name, dataset, is_corr=True)
-    fpc = flex.pra_percomplex(name, dataset, is_corr=True)
+    pra = flex.pra(name, dataset, is_corr=False)
+    fpc = flex.pra_percomplex(name, dataset, is_corr=False)
     cc = flex.complex_contributions(name)
@@ -60,7 +65,7 @@ for name, dataset in data.items():
 # Generate plots
 flex.plot_auc_scores()
 flex.plot_precision_recall_curve()
-flex.plot_percomplex_scatter()
+flex.plot_percomplex_scatter(n_top=20)
 flex.plot_percomplex_scatter_bysize()
 flex.plot_significant_complexes()
 flex.plot_complex_contributions()
@@ -82,27 +87,3 @@ flex.save_results_to_csv()
-# %%
-import os
-import glob
-inputs = {
-    "depmap all": {
-        "path":  "../../../../datasets/depmap/24Q4/depmap_geneeffect_all_cellines.csv",
-        "sort": "high"
-    }
-}
-# Now auto-discover the rest of the CSVs in the folder
-DATA_DIR = "../../../../datasets/depmap/24Q4/subset/"
-for path in glob.glob(os.path.join(DATA_DIR, "*.csv")):
-    # Derive the key name from filename (without extension)
-    key = os.path.splitext(os.path.basename(path))[0]
-    inputs[key] = {
-        "path": path,
-        "sort": "high"
-    }
-# inputs now has "depmap all" first, then one entry per CSV in DATA_DIR
-print(inputs)

{pythonflex-0.1.5 → pythonflex-0.1.6}/src/pythonflex/examples/dataset_filtering.py RENAMED Viewed

@@ -8,6 +8,8 @@ model = pd.read_csv("../../../../datasets/depmap/24Q4/Model.csv",index_col=0)
 df.columns = df.columns.str.split(" \\(").str[0]
 df = df.T
+#%%
 # %%
 # get ModelID of selected disease for example OncotreePrimaryDisease==Melanoma
 melanoma = model[model.OncotreePrimaryDisease=="Melanoma"].index.unique().values

{pythonflex-0.1.5 → pythonflex-0.1.6}/src/pythonflex/plotting.py RENAMED Viewed

@@ -470,9 +470,10 @@ def plot_auc_scores():
     plot_config = config["plotting"]
     pra_dict = dload("pr_auc")
-    # Prepare data
-    datasets = list(pra_dict.keys())
-    auc_scores = list(pra_dict.values())
+    sorted_items = sorted(pra_dict.items(), key=lambda x: x[1], reverse=True)
+    datasets = [k for k, _ in sorted_items]
+    auc_scores = [v for _, v in sorted_items]
     # Create figure and axis
     fig, ax = plt.subplots()
@@ -483,7 +484,7 @@ def plot_auc_scores():
     colors = [cmap(i / (num_datasets + 1)) for i in range(1, num_datasets + 1)]
     # Plot bars
-    bars = ax.bar(datasets, auc_scores, color=colors, edgecolor="black")
+    ax.bar(datasets, auc_scores, color=colors, edgecolor="black")
     # Set y-axis limits dynamically
     ax.set_ylim(0, max(auc_scores) + 0.01)
@@ -491,6 +492,7 @@ def plot_auc_scores():
     # Set title and labels
     ax.set_title("AUC scores for the datasets")
     ax.set_ylabel("AUC score")
+    plt.xticks(rotation=45, ha="right")
     # Add grid (already handled by rcParams)
     ax.grid(axis='y')

pythonflex-0.1.6/test/test_corrected_auc.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""
+Simple test to verify the corrected AUC implementation in pra_percomplex function.
+"""
+import pandas as pd
+import numpy as np
+# Create a simple test DataFrame to simulate the corrected_auc calculation
+def test_corrected_auc():
+    # Create test data with precision and recall values
+    precision = np.array([1.0, 0.67, 0.75, 0.8, 0.6])
+    recall = np.array([0.2, 0.4, 0.6, 0.8, 1.0])
+    # Expected corrected AUC calculation: trapz(precision, recall) - precision[-1]
+    expected_corrected_auc = np.trapz(precision, recall) - precision[-1]
+    print(f"Expected corrected AUC: {expected_corrected_auc:.6f}")
+    # Components of the calculation
+    regular_auc = np.trapz(precision, recall)  # This is the area under the curve
+    last_precision = precision[-1]
+    corrected_auc = regular_auc - last_precision
+    print(f"Regular AUC (trapz): {regular_auc:.6f}")
+    print(f"Last precision: {last_precision:.6f}")
+    print(f"Corrected AUC: {corrected_auc:.6f}")
+    # Verify they match
+    assert np.isclose(expected_corrected_auc, corrected_auc), "Corrected AUC calculation mismatch!"
+    print("✓ Corrected AUC calculation is correct!")
+if __name__ == "__main__":
+    test_corrected_auc()
+    print("\nThe corrected AUC implementation in pra_percomplex function should work correctly.")
+    print("Both regular AUC and corrected AUC will be computed for each complex term.")

pythonflex-0.1.6/test/test_inputs.py ADDED Viewed

@@ -0,0 +1,44 @@
+#%%
+import os
+# # Define specific cell line types you're interested in
+DATA_DIR = "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/subset/"
+# Specific cell lines of interest with "_cell_lines" suffix removed
+cell_line_files = [
+    "soft_tissue_cell_lines.csv",
+    "skin_cell_lines.csv",
+    "lung_cell_lines.csv",
+    "head_and_neck_cell_lines.csv",
+    "esophagus_stomach_cell_lines.csv",
+    "pleura_cell_lines.csv"
+]
+inputs = {}
+# Create inputs dict with shortened names (removing "_cell_lines" suffix)
+for filename in cell_line_files:
+    # Remove .csv extension and _cell_lines suffix
+    key = filename.replace("_cell_lines.csv", "")
+    full_path = os.path.join(DATA_DIR, filename)
+    inputs[key] = {
+        "path": full_path,
+        "sort": "high"
+    }
+inputs = {}
+inputs['depmap'] = {
+    "path": "C:/Users/yd/Desktop/projects/_datasets/depmap/25Q2/gene_effect.csv",
+    "sort": "high"
+}
+# Print the resulting inputs dictionary
+print("Configured inputs:")
+for key, value in inputs.items():
+    print(f"  {key}: {value['path']}")